### Wiki Pages (2017)
#### Source: https://www.kaggle.com/jkkphys/english-wikipedia-articles-20170820-sqlite?select=enwiki-20170820.db

In [None]:
import sqlite3
import pandas as pd
import gc
import re
from random import sample
from IPython.display import clear_output

In [None]:
# Open a connection to the Wikipedia SQLite dump
sql = sqlite3.connect(database="./Data/Wiki/DataBase.db")

In [None]:
# Create the cursor used to run queries on the wiki database connection
cur = sql.cursor()

In [None]:
# List every table registered in sqlite_master, sorted by name
table_rows = cur.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;").fetchall()
for table_row in table_rows:
    print(table_row)
# The table is named ARTICLES

('ARTICLES',)


In [None]:
# Column names of the ARTICLES table, read from the cursor's description after a one-row query
data =  cur.execute("SELECT * FROM articles LIMIT 1")
data.description

(('ARTICLE_ID', None, None, None, None, None, None),
 ('TITLE', None, None, None, None, None, None),
 ('SECTION_TITLE', None, None, None, None, None, None),
 ('SECTION_TEXT', None, None, None, None, None, None))

In [None]:
# Total number of rows in ARTICLES: ~2.3 crore (about 23 million)
(N,) = cur.execute("SELECT COUNT(Article_ID) FROM articles").fetchone()
print(N)

23046187


In [None]:
# Read the TITLE value of every row (one entry per row, so titles repeat)
headings = [title for (title,) in cur.execute("SELECT TITLE FROM articles")]

In [None]:
# Read the SECTION_TITLE value of every row (one entry per row, so titles repeat)
sub_headings = [section_title for (section_title,) in cur.execute("SELECT SECTION_TITLE FROM articles")]

In [None]:
# Count the distinct values before reporting them
unique_heading_count = len(set(headings))
unique_sub_heading_count = len(set(sub_headings))
print('Number of Headings: ', unique_heading_count)
print('Number of Sub-Headings: ', unique_sub_heading_count)

Number of Headings:  4902648
Number of Sub-Headings:  1662480


In [None]:
# Keep 16,00,000 (1.6 million) unique headings and unique sub-headings.
# The two samples are drawn independently, so position k in one list is unrelated
# to position k in the other.
# NOTE(review): the sampling is unseeded, so each run selects a different subset.
N = 1600000

unique_pool = list(set(headings))
headings = sample(unique_pool, N)

unique_pool = list(set(sub_headings))
sub_headings = sample(unique_pool, N)

del unique_pool

In [None]:
# Read the SECTION_TEXT of the first 16,00,000 rows returned by the query, then close the database.
# The literal in the query has to stay equal to the sample size used for headings/sub-headings.
gc.collect()
sections = []
for (section_text,) in cur.execute("SELECT SECTION_TEXT FROM articles LIMIT 1600000"):
    sections.append(section_text)
sql.close()

In [None]:
# Combine the three lists into one DataFrame (nothing is written to disk here).
# Rows are unordered: the values in one row are not related to each other.
df = pd.DataFrame(
    {
        'Headings': headings,
        'Sub_Headings': sub_headings,
        'Section': sections,
    }
)

In [None]:
# Drop the list names; the data is reachable through df from here on
del headings
del sub_headings
del sections

In [None]:
# Preview the first rows of the combined frame
df.head()

Unnamed: 0,Headings,Sub_Headings,Section
0,"Floriana, Cairns",The Anchor brand Outside New Zealand,\n\n\n\n\n\n'''Anarchism''' is a political phi...
1,1989 Los Angeles Dodgers season,Defence List,\n\nThe term ''anarchism'' is a compound word ...
2,Guo Ruilong,"March 4, 1953 (Wednesday)",\n\n===Origins===\nWoodcut from a Diggers docu...
3,"Weber County, Utah",Guitar instructor,\nPortrait of philosopher Pierre-Joseph Proudh...
4,Scaly-breasted munia,Brief war with Theophilos,\nconsistent with anarchist values is a contro...


In [None]:
# Clean the three text columns of df:
#   * Sub_Headings: remove a <ref>...</ref> tag that ends the string
#   * Section: turn every run of non-alphanumeric characters into one space
#   * all columns: lower-case each word and join the words with single spaces
import re

# Raw strings avoid the invalid escape sequences (\<, \>, \/) of the previous pattern;
# the expression matches exactly the same text.
TRAILING_REF_TAG = re.compile(r'<ref>(.*?)</ref>$')

# Newlines are non-alphanumeric, so this one substitution also handles them.
# Deleting "\n" first (as was done before) glued together words that were
# separated only by a line break, e.g. "document\nThe" became "documentThe".
NON_ALPHANUMERIC_RUN = re.compile(r'[^0-9a-zA-Z]+')


def _lower_and_collapse(text):
    """Lower-case every whitespace-separated word and re-join the words with single spaces."""
    return ' '.join(word.lower() for word in text.split())


df['Sub_Headings'] = df['Sub_Headings'].apply(
    lambda text: _lower_and_collapse(TRAILING_REF_TAG.sub('', text))
)
df['Section'] = df['Section'].apply(
    lambda text: _lower_and_collapse(NON_ALPHANUMERIC_RUN.sub(' ', text))
)
df['Headings'] = df['Headings'].apply(_lower_and_collapse)


In [None]:
# Pull the cleaned columns back out of the DataFrame as plain Python lists
headings = df['Headings'].tolist()
sub_headings = df['Sub_Headings'].tolist()
sections = df['Section'].tolist()

In [None]:
from json import dump

# Write each list to its own JSON file, in the order Headings, Sub_Headings, Sections
wiki_outputs = (
    ('./Data/Wiki/Headings.json', headings),
    ('./Data/Wiki/Sub_Headings.json', sub_headings),
    ('./Data/Wiki/Sections.json', sections),
)
for out_path, payload in wiki_outputs:
    with open(out_path,'w',encoding="utf-8") as f:
        dump(payload,f)

#### Summary: 16 lakh (1.6 million) unique headings, 16 lakh unique sub-headings and 16 lakh sections are collected; the three lists are not aligned with one another

### Drugs.com

In [None]:
from bs4 import BeautifulSoup
import requests
from IPython.display import clear_output

In [None]:
# Three independent accumulators filled by the page loop; sets keep each string once
master_headings, master_sub_headings, master_sections = set(), set(), set()

In [None]:
# Download the /pro/ landing page and collect the absolute URL of every anchor inside
# the 'column-split col-list-az' element (presumably the A-Z index - confirm on the site).
landing_html = requests.get('https://www.drugs.com/pro/').text
soup_links_parent = BeautifulSoup(landing_html,'lxml')
az_container = soup_links_parent.find(class_ = 'column-split col-list-az')

parent_links = []
for links in az_container.find_all('a',href=True):
    parent_links.append('https://www.drugs.com'+links['href'])

In [None]:
# Visit every parent page and collect the absolute URL of each anchor found in its
# 'ddc-list-column-2' element.
master_links = []
for link in parent_links:
    listing_html = requests.get(link).text
    soup_links = BeautifulSoup(listing_html,'lxml')
    listing_column = soup_links.find(class_ = 'ddc-list-column-2')
    for links in listing_column.find_all('a',href=True):
        master_links.append('https://www.drugs.com'+links['href'])

In [None]:
# Number of page URLs collected into master_links
len(master_links)

5016

In [None]:
# Crawl every URL in master_links and merge what is found on each page into the
# master sets:
#   * text of <h2> tags                          -> master_headings
#   * text of <h3> tags and short 'Bold' spans   -> master_sub_headings
#   * text of <p> tags selected with class_=None -> master_sections
# Each page is downloaded once; a failed download is counted in `err` and skipped.

# Seconds to wait on a silent server before the page is counted as skipped
REQUEST_TIMEOUT_SECONDS = 30


def _print_drugs_progress(position, total, skipped):
    """Redraw the progress read-out for the drugs.com crawl.

    position: index of the link that was just handled
    total:    number of links in the crawl
    skipped:  number of links whose download failed so far
    """
    clear_output(wait=True)
    print(position/total * 100,' %')
    print('Links cleared', position)
    print('Links Skipped',skipped)
    print('Number of Sections Found',len(master_sections))
    print('Number of Unique Headings Found ',len(master_headings))
    print('Number of Unique sub_headings Found ',len(master_sub_headings))


err = 0
total_links = len(master_links)
for i,link in enumerate(master_links):
    try:
        html = requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS).text
    except requests.RequestException:
        # Only network/HTTP-layer failures are skipped; KeyboardInterrupt and
        # programming errors are no longer swallowed.
        err+=1
        _print_drugs_progress(i, total_links, err)
        continue

    soup = BeautifulSoup(html,'lxml')

    # Remove tables first so their content is not picked up by the searches below
    for table in soup.find_all("table"):
        table.extract()

    page_headings = {tag.text for tag in soup.find_all('h2')}

    page_sub_headings = {tag.text for tag in soup.find_all('h3')}
    # Bold spans count as sub-headings only when they are short (fewer than 5 words)
    # and contain no element with class 'Table'
    page_sub_headings.update(
        tags.text
        for tags in soup.find_all('span',class_ = 'Bold')
        if len(tags.text.split())<5 and not tags.find(class_ = 'Table')
    )

    # NOTE(review): class_=None presumably selects paragraphs without a class attribute - confirm
    page_sections = {para.text for para in soup.find_all('p',class_ = None)}

    master_sections.update(page_sections)
    master_headings.update(page_headings)
    master_sub_headings.update(page_sub_headings)

    _print_drugs_progress(i, total_links, err)
    


99.98006379585327  %
Links cleared 5015
Links Skipped 3
Number of Sections Found 279398
Number of Unique Headings Found  24693
Number of Unique sub_headings Found  48341


In [None]:
# Convert the accumulators to lists so that json can serialise them
master_sections, master_headings, master_sub_headings = (
    list(master_sections),
    list(master_headings),
    list(master_sub_headings),
)

In [None]:
from json import dump
# Write each list to its own JSON file, in the order Sections, Headings, Sub_Headings
drugs_outputs = (
    ('./Data/drugs.com/Sections.json', master_sections),
    ('./Data/drugs.com/Headings.json', master_headings),
    ('./Data/drugs.com/Sub_Headings.json', master_sub_headings),
)
for out_path, payload in drugs_outputs:
    with open(out_path,'w',encoding="utf-8") as f:
        dump(payload,f)

## jmir.org (2015 to 2021)

In [None]:
from bs4 import BeautifulSoup
import requests
from IPython.display import clear_output

In [None]:
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

# Open each yearly jmir.org page in Chrome, press PAGE_DOWN repeatedly, then collect
# the href of every element with class 'title-link' into `links`.
years = ['2015','2016','2017','2018','2019','2020','2021']
year_links = ['https://www.jmir.org/'+year for year in years]

# Number of PAGE_DOWN presses sent to every year page
TOTAL_PAGEDOWNS = 1000

# Must exist before the loop: it is read by the progress print and extended per year
links = []

for year_link in year_links:

    browser = webdriver.Chrome()
    try:
        browser.get(year_link)
        elem = browser.find_element(By.TAG_NAME, "body")

        for pagedowns_done in range(1, TOTAL_PAGEDOWNS + 1):
            elem.send_keys(Keys.PAGE_DOWN)
            # Every fourth press is followed by a 3 second pause, the others by 1 second
            time.sleep(3 if pagedowns_done % 4 == 0 else 1)

            clear_output(wait=True)
            print(year_link)
            print('Number of links: ',len(links))
            print('Number of Page Downs Completed: ',pagedowns_done)
            print('Percentage completed:', pagedowns_done/TOTAL_PAGEDOWNS * 100,' %')

        post_elems = browser.find_elements(By.CLASS_NAME, "title-link")
        links.extend([post.get_attribute('href') for post in post_elems])
    finally:
        # quit() ends the whole WebDriver session, also when a step above raised
        browser.quit()

https://www.jmir.org/2021
Number of links:  1476
Number of Page Downs Completed:  1000
Percentage completed: 100.0  %


In [None]:
# Crawl every article URL in `links` and merge what is found into the master sets:
#   * <h3> text inside the article content          -> master_headings
#   * <h4> and <h5> text inside the article content -> master_sub_headings
#   * text of <p class="abstract-paragraph">        -> master_sections
# A link is counted in `err` and skipped when the download fails or when the page
# does not contain the expected article containers.

# Seconds to wait on a silent server before the page is counted as skipped
REQUEST_TIMEOUT_SECONDS = 30

# Start from empty accumulators so that only jmir.org content is collected here
master_headings = set()
master_sub_headings = set()
master_sections = set()

err=0
n = len(links)  # total used for the percentage in the progress read-out
for i,link in enumerate(links):
    try:
        soup = BeautifulSoup(requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS).text,'lxml')
    except requests.RequestException:
        # Only network/HTTP-layer failures are skipped; other errors surface
        err+=1
        continue

    # NOTE(review): this searches for a tag *named* "figure-table"; if a CSS class
    # was meant, the call needs class_= instead - confirm against the page markup
    for table in soup.find_all("figure-table"):
        table.extract()

    for fig in soup.find_all('figure'):
        fig.extract()

    art = soup.find(class_ = 'article-content clearfix')
    main_article = soup.find(class_ = 'main-article clearfix')
    if art is None or main_article is None:
        # Not an article page in the expected layout
        err+=1
        continue

    master_heading = {h3.text for h3 in art.find_all('h3')}
    master_sub_heading = {tag.text for tag in art.find_all(['h4','h5'])}
    section = [p.text for p in main_article.find_all('p',class_ = 'abstract-paragraph')]

    # Merge this page into the accumulators that are saved afterwards
    master_headings.update(master_heading)
    master_sub_headings.update(master_sub_heading)
    master_sections.update(section)

    clear_output(wait=True)
    print(i/n * 100,' %')
    print('Links cleared', i)
    print('Links Skipped',err)
    print('Number of Sections Found',len(master_sections))
    print('Number of Unique Headings Found ',len(master_headings))
    print('Number of Unique sub_headings Found ',len(master_sub_headings))

99.96331621423332  %
Links cleared 2725
Links Skipped 0
Number of Sections Found 158573
Number of Unique Headings Found  675
Number of Unique sub_headings Found  32203


In [None]:
# Convert the accumulators to lists so that json can serialise them
master_sections, master_headings, master_sub_headings = (
    list(master_sections),
    list(master_headings),
    list(master_sub_headings),
)

In [None]:
from json import dump
# Write each list to its own JSON file, in the order Sections, Headings, Sub_Headings
jmir_outputs = (
    ('./Data/jmir.org_15_21/Sections.json', master_sections),
    ('./Data/jmir.org_15_21/Headings.json', master_headings),
    ('./Data/jmir.org_15_21/Sub_Headings.json', master_sub_headings),
)
for out_path, payload in jmir_outputs:
    with open(out_path,'w',encoding="utf-8") as f:
        dump(payload,f)