# The following code saves the text, equations and figures of a JFM article in a folder named after the article title

In [1]:
import requests
from bs4 import BeautifulSoup as BS
from bs4 import BeautifulSoup
import pandas as pd
pd.set_option('display.max_columns', None)
import os
import json

In [2]:
# download the images
def download_image(url, file_path, timeout=30):
    """Download ``url`` and write the response body to ``file_path``.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    file_path : str
        Destination file; it is opened in binary write mode.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled server cannot hang
        the notebook indefinitely.

    Returns
    -------
    bool
        True when the server answered 200 and the body was written,
        False otherwise. A status line is printed in both cases.
    """
    try:
        # The context manager releases the streamed connection even when the
        # body is never read (non-200 answers) or writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(file_path, " :FAIL", response.status_code)
                return False
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
    except requests.RequestException as error:
        # Report network problems like an HTTP failure instead of aborting
        # the remaining downloads.
        print(file_path, " :FAIL", error)
        return False

    print(file_path, " :success")
    return True

def extract_images(soup,folder):
    """Download the image of every figure block on the page.

    Each ``<div class="fig fig">`` is searched for its first
    ``<img class="aop-lazy-load-image">``; the URL in that tag's ``data-src``
    attribute is saved as ``<folder>/figures/Figure_<n>.png``, where ``n`` is
    the 1-based position of the figure block among the matched blocks.

    Parameters
    ----------
    soup : parsed page
        Object exposing ``find_all`` (a BeautifulSoup document in this notebook).
    folder : str
        Article folder; the ``figures`` sub-folder is created when missing.
    """
    figures_dir = folder+'/figures'
    os.makedirs(figures_dir, exist_ok=True)

    for number, figure in enumerate(soup.find_all('div', class_='fig fig'), start=1):
        image = figure.find('img', class_='aop-lazy-load-image')
        url = image.get('data-src') if image is not None else None
        if not url:
            # A figure block without a lazy-load image used to raise IndexError
            # and stop every download; skip it and keep the numbering aligned
            # with the figure's position.
            print("Figure_"+str(number), " :skipped, no image url found")
            continue
        file_path = figures_dir+'/'+"Figure_"+str(number)+".png"
        download_image(url, file_path)
        
        
def save_image_captions(soup,folder_path):
    """Collect the figure captions and store them as JSON.

    Every ``<div class="fig fig">`` contributes one entry: the key is the text
    of its ``<span class="label">`` and the value is the stripped text of the
    whole block. The mapping is written to
    ``<folder_path>/figure_captions.json`` and also returned.
    """
    figure_blocks = soup.find_all('div', class_='fig fig')
    fig_caption_dict = {
        block.find('span', class_='label').text: block.text.strip()
        for block in figure_blocks
    }

    target = folder_path+'/'+'figure_captions.json'
    with open(target, 'w') as handle:
        json.dump(fig_caption_dict, handle)

    print(len(fig_caption_dict)," Figure captions are saved")
    return fig_caption_dict
    


In [3]:
# URL of the Journal of Fluid Mechanics article page on cambridge.org that the cells below download and parse.
JFM_URL="https://www.cambridge.org/core/journals/journal-of-fluid-mechanics/article/abs/shockinduced-aerobreakup-of-a-polymeric-droplet/49CC03638E279240C7AF44AD425BF005"

In [4]:
# Download the article page and parse it with the lxml parser.
# The timeout keeps a stalled server from hanging the notebook, and
# raise_for_status() stops here on an HTTP error instead of silently
# parsing an error page in every later cell.
response = requests.get(JFM_URL, timeout=30)
response.raise_for_status()
webpage = response.text
soup = BS(webpage,'lxml')

In [5]:
# Name the output folder after the first <h1> on the page (the article title).
article_title = soup.find_all('h1')[0].text.strip()
# A path separator inside the title would otherwise create nested directories.
safe_title = article_title.replace('/', '-').replace('\\', '-')
folder_path = './JFM_webscrap/'+safe_title
print(folder_path)
# exist_ok makes the cell safe to re-run.
os.makedirs(folder_path, exist_ok=True)

./JFM_webscrap/Shock-induced aerobreakup of a polymeric droplet


In [6]:
# Download the figure images into <folder_path>/figures.
extract_images(soup,folder_path)

./JFM_webscrap/Shock-induced aerobreakup of a polymeric droplet/figures/Figure_1.png  :success
./JFM_webscrap/Shock-induced aerobreakup of a polymeric droplet/figures/Figure_2.png  :success
./JFM_webscrap/Shock-induced aerobreakup of a polymeric droplet/figures/Figure_3.png  :success
./JFM_webscrap/Shock-induced aerobreakup of a polymeric droplet/figures/Figure_4.png  :success
./JFM_webscrap/Shock-induced aerobreakup of a polymeric droplet/figures/Figure_5.png  :success
./JFM_webscrap/Shock-induced aerobreakup of a polymeric droplet/figures/Figure_6.png  :success
./JFM_webscrap/Shock-induced aerobreakup of a polymeric droplet/figures/Figure_7.png  :success
./JFM_webscrap/Shock-induced aerobreakup of a polymeric droplet/figures/Figure_8.png  :success
./JFM_webscrap/Shock-induced aerobreakup of a polymeric droplet/figures/Figure_9.png  :success
./JFM_webscrap/Shock-induced aerobreakup of a polymeric droplet/figures/Figure_10.png  :success
./JFM_webscrap/Shock-induced aerobreakup of a pol

In [7]:
# Write figure_captions.json and keep the label -> caption mapping for the text clean-up cell below.
fig_caption_dict=save_image_captions(soup,folder_path)

13  Figure captions are saved


In [8]:
# Show the label and the caption text of every table wrapper on the page.
for table_block in soup.find_all('div', class_='table-wrap'):
    label_tag = table_block.find('span',class_="label")
    print(label_tag.text)
    caption_tag = table_block.find('div',class_="caption")
    print(caption_tag.text)

Table 1.
Table 1. Properties of the test liquids.
Table 2.
Table 2. Calculated peak value of gas-phase fluid properties at the droplet location.


In [9]:
def save_equations(soup,folder_path):
    """Collect the display equations and store them as JSON.

    Every ``<div class="disp-formula">`` contributes one entry whose value is
    the stripped text of the block. The key is the text of the block's
    ``<span class="label">``; a block without a label gets the key
    ``unlabelled_<n>`` (n counts such blocks from 1) instead of raising
    AttributeError. The mapping is written to ``<folder_path>/equations.json``.

    Parameters
    ----------
    soup : parsed page
        Object exposing ``find_all`` (a BeautifulSoup document in this notebook).
    folder_path : str
        Existing folder that receives ``equations.json``.
    """
    equations_dict={}
    unlabelled=0
    for formula in soup.find_all('div', class_='disp-formula'):
        label = formula.find('span', class_='label')
        if label is not None:
            key = label.text
        else:
            # find() returns None when the formula carries no label span.
            unlabelled += 1
            key = 'unlabelled_'+str(unlabelled)
        equations_dict[key]=formula.text.strip()

    file_path=folder_path+'/'+'equations.json'
    with open(file_path, 'w') as file:
        json.dump(equations_dict, file)
    print(len(equations_dict)," equations are save")


In [10]:
# Write the display equations, keyed by their label text, to equations.json.
save_equations(soup,folder_path)

20  equations are save


In [11]:
def _first_text(tag, name, class_):
    """Return the text of the first ``name`` element with ``class_`` inside ``tag``, or None."""
    found = tag.find(name, class_=class_)
    return found.text if found is not None else None


def _link_href(links, position):
    """Return the ``href`` of ``links[position]``, or None when there is no such link."""
    return links[position].get('href') if len(links) > position else None


def save_reference(soup,folder_path):
    """Export the reference list of the article to ``references.csv``.

    One CSV row is written per ``<div class="circle-list__item">``. Fields that
    cannot be found in a reference are stored as missing values.

    Columns
    -------
    ids       : ``id`` attribute of the reference block
    citation  : full text of the block without the "CrossRefGoogle Scholar" link text
    authors   : dict ``{index: {'name': given names, 'surname': surname}}`` built
                from the ``string-name`` spans (empty dict when there are none)
    year, title, source, volume, issue, fpage, lpage :
                text of the first span with the matching class
    DOI       : href of the first ``ref-link`` anchor
    GS_link   : href of the second ``ref-link`` anchor

    Parameters
    ----------
    soup : parsed page
        Object exposing ``find_all`` (a BeautifulSoup document in this notebook).
    folder_path : str
        Existing folder that receives ``references.csv``.
    """
    columns = ['ids', 'citation', 'authors', 'year',
               'title','source','volume','issue',
               'fpage','lpage','DOI','GS_link']
    rows = []

    for ref in soup.find_all('div', class_='circle-list__item'):
        # Exactly one authors entry per reference. Appending inside the
        # per-author loop produced one entry per author name, which shifted
        # the authors column against every other column.
        authors = {}
        for i, name in enumerate(ref.find_all('span', class_='string-name')):
            authors[i] = {'name': _first_text(name, 'span', 'given-names'),
                          'surname': _first_text(name, 'span', 'surname')}

        # NOTE(review): the links are taken by position, so a reference that
        # only has a Google Scholar link would put it in the DOI column -
        # confirm against the page markup.
        links = ref.find_all('a', class_='ref-link')

        rows.append((
            ref.get('id'),
            ref.text.replace("CrossRefGoogle Scholar", "").strip(),
            authors,
            _first_text(ref, 'span', 'year'),
            _first_text(ref, 'span', 'article-title'),
            _first_text(ref, 'span', 'source'),
            _first_text(ref, 'span', 'volume'),
            _first_text(ref, 'span', 'issue'),
            _first_text(ref, 'span', 'fpage'),
            _first_text(ref, 'span', 'lpage'),
            _link_href(links, 0),
            _link_href(links, 1),
        ))

    df = pd.DataFrame(rows, columns=columns)

    file_path=folder_path+'/'+'references.csv'
    df.to_csv(file_path, index=False)
    print("references are saved in csv file")
        



In [12]:
# Export the reference list of the article to references.csv.
save_reference(soup,folder_path)

references are saved in csv file


In [13]:
# Print the stripped text of every <p class="p"> paragraph, one blank line apart.
for paragraph in soup.find_all('p', class_='p'):
    body = paragraph.text.strip()
    print(body,'\n')

Secondary atomization is the process of breaking a liquid droplet into smaller units. Aerobreakup is one example of secondary atomization in which a liquid droplet is exposed to a high-speed stream of gas (generally air), causing its fragmentation. Aerobreakup applies in various natural and industrial processes. Mixing of air and fuel droplets inside an internal combustion engine, gelled propellants in a rocket engine (Padwal, Natan & Mishra Reference Padwal, Natan and Mishra2021), breakup of sneezed salivary droplets (Scharfman et al. Reference Scharfman, Techet, Bush and Bourouiba2016; Sharma et al. Reference Sharma, Pinto, Saha, Chaudhuri and Basu2021a), falling raindrops (Villermaux & Bossa Reference Villermaux and Bossa2009) and powder production by spray atomization of fruit pulps (Cervantes-Martínez et al. Reference Cervantes-Martínez, Medina-Torres, González-Laredo, Calderas, Sánchez-Olivares, Herrera-Valencia, Infante, Rocha-Guzman and Rodriguez-Ramirez2014) are few instances 

In [14]:
# Print the paragraphs and announce an equation before every paragraph whose
# second CSS class is 'continuation'.
eq_count=1
for txt in soup.find_all('p', class_='p'):
    classes = txt.get('class') or []
    # The comparison result used to be discarded, so any paragraph with two or
    # more classes was counted; test the class value explicitly instead of
    # relying on an IndexError swallowed by a bare except.
    if len(classes) > 1 and classes[1]=='continuation':
        print('<<equation present here>>: equation no:',eq_count,'\n')
        eq_count+=1
    print(txt.text.strip(),'\n')

Secondary atomization is the process of breaking a liquid droplet into smaller units. Aerobreakup is one example of secondary atomization in which a liquid droplet is exposed to a high-speed stream of gas (generally air), causing its fragmentation. Aerobreakup applies in various natural and industrial processes. Mixing of air and fuel droplets inside an internal combustion engine, gelled propellants in a rocket engine (Padwal, Natan & Mishra Reference Padwal, Natan and Mishra2021), breakup of sneezed salivary droplets (Scharfman et al. Reference Scharfman, Techet, Bush and Bourouiba2016; Sharma et al. Reference Sharma, Pinto, Saha, Chaudhuri and Basu2021a), falling raindrops (Villermaux & Bossa Reference Villermaux and Bossa2009) and powder production by spray atomization of fruit pulps (Cervantes-Martínez et al. Reference Cervantes-Martínez, Medina-Torres, González-Laredo, Calderas, Sánchez-Olivares, Herrera-Valencia, Infante, Rocha-Guzman and Rodriguez-Ramirez2014) are few instances 

In [15]:
# Collect the paragraph texts in page order. An equation marker string is
# inserted before every paragraph whose second CSS class is 'continuation'.
p_text=[]
eq_count=1
for txt in soup.find_all('p', class_='p'):
    classes = txt.get('class') or []
    # The comparison result used to be discarded, so any paragraph with two or
    # more classes got a marker; test the class value explicitly instead of
    # relying on an IndexError swallowed by a bare except.
    if len(classes) > 1 and classes[1]=='continuation':
        p_text.append('<<equation present here>>: equation no:'+str(eq_count))
        eq_count+=1
    p_text.append(txt.text.strip())

In [16]:
# Inspect the collected paragraph strings.
p_text

['Secondary atomization is the process of breaking a liquid droplet into smaller units. Aerobreakup is one example of secondary atomization in which a liquid droplet is exposed to a high-speed stream of gas (generally air), causing its fragmentation. Aerobreakup applies in various natural and industrial processes. Mixing of air and fuel droplets inside an internal combustion engine, gelled propellants in a rocket engine (Padwal, Natan & Mishra Reference Padwal, Natan and Mishra2021), breakup of sneezed salivary droplets (Scharfman et\xa0al. Reference Scharfman, Techet, Bush and Bourouiba2016; Sharma et\xa0al. Reference Sharma, Pinto, Saha, Chaudhuri and Basu2021a), falling raindrops (Villermaux & Bossa Reference Villermaux and Bossa2009) and powder production by spray atomization of fruit pulps (Cervantes-Martínez et\xa0al. Reference Cervantes-Martínez, Medina-Torres, González-Laredo, Calderas, Sánchez-Olivares, Herrera-Valencia, Infante, Rocha-Guzman and Rodriguez-Ramirez2014) are few

In [22]:
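# NOTE: this cell is disabled. It replaced every in-text citation string with its
# reference key (the '#ref..' href) and produced `new_text`. It needs `para_2_ref`,
# which is only created in a later cell, so it cannot run at this position on a fresh kernel.
# new_text=[]
# for text in p_text:
#     for key,word in para_2_ref.items():
#         text=text.replace(word.strip(),key)
#     new_text.append(text)
# # for txt in new_text:
#     print(txt,'\n')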

In [18]:
# Remove the figure-caption text from the paragraphs and save them to text.txt.
# `new_text` is only assigned inside a commented-out cell, so reading it raises
# NameError on a fresh kernel; start from `p_text` instead.
new_text2=[]
for text in p_text:
    for caption in fig_caption_dict.values():
        text=text.replace(caption.strip(),'')
    new_text2.append(text)

file_path=folder_path+'/'+'text.txt'
# The article text contains non-ASCII characters (accented names, non-breaking
# spaces), so do not depend on the platform default encoding.
with open(file_path, 'w', encoding='utf-8') as file:
    for txt in new_text2:
        file.write(txt)
        file.write('\n')
        
#         print(txt,'\n')

In [19]:
def save_reference_keys(soup,folder_path):
    """Map every in-text citation link to its displayed text and store it as JSON.

    All ``<a class="xref bibr">`` anchors inside ``<p class="p">`` paragraphs
    are visited in page order; the anchor's ``href`` is the key and its text
    the value, so a later anchor with the same ``href`` overwrites an earlier
    one. The mapping is written to ``<folder_path>/reference_keys.json`` and
    also returned.
    """
    para_2_ref = {
        anchor['href']: anchor.text
        for paragraph in soup.find_all('p', class_='p')
        for anchor in paragraph.find_all('a',class_="xref bibr")
    }

    target = folder_path+'/'+'reference_keys.json'
    with open(target, 'w') as handle:
        json.dump(para_2_ref, handle)

    print(len(para_2_ref)," reference keys are saved")
    return para_2_ref

In [20]:
# Write reference_keys.json and keep the href -> citation-text mapping.
para_2_ref=save_reference_keys(soup,folder_path)

63  reference keys are saved


In [21]:
# Inspect the href -> citation-text mapping.
para_2_ref

{'#ref34': 'Reference Padwal, Natan and Mishra2021',
 '#ref39': 'Reference Scharfman, Techet, Bush and Bourouiba2016',
 '#ref43': 'Reference Sharma, Pinto, Saha, Chaudhuri and Basu2021a',
 '#ref57': 'Reference Villermaux and Bossa2009',
 '#ref6': 'Reference Cervantes-Martínez, Medina-Torres, González-Laredo, Calderas, Sánchez-Olivares, Herrera-Valencia, Infante, Rocha-Guzman and Rodriguez-Ramirez2014',
 '#ref35': 'Reference Pilch and Erdman1987',
 '#ref11': 'Reference Gelfand1996',
 '#ref13': 'Reference Guildenbecher, López-Rivera and Sojka2009',
 '#ref17': 'Reference Jackiw and Ashgriz2021',
 '#ref46': 'Reference Sharma, Singh, Rao, Kumar and Basu2021c',
 '#ref42': 'Reference Sharma, Chandra, Basu and Kumar2022',
 '#ref18': 'Reference Jain, Prakash, Tomar and Ravikrishna2015',
 '#ref7': 'Reference Chen, Wagner, Farias, DeMauro and Guildenbecher2018',
 '#ref44': 'Reference Sharma, Rao, Chandra, Kumar, Basu and Tropea2023',
 '#ref59': 'Reference Wilcox, June, Brown and Kelley1961',
 '#r