In [1]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
driver = webdriver.Chrome()

In [3]:
global_details = dict()
synonyms_dict = dict()
keywords_done = set()
language_ids = {"Hindi" : 0,
                "English" : 1,
                "Bengali": 3,
                "Gujarati" : 5,
                "Marathi" : 11,
                "Odiya" : 18,
                "Punjabi" : 16}

gloss_snapshot_fname = "snapshots/gloss_examples_SNAPSHOT"
synset_snapshot_fname = "snapshots/synset_SNAPSHOT"
gloss_final_fname = "gloss_final"
synset_final_fname = "synset_final"
SNAPSHOT_INTERVAL = 200
WAIT_THRESHOLD = 10

In [4]:
'''file_path = "./iwndata/punjabi.syns"
file = open(file_path, "r", encoding='utf-8')
keywords = list()
for line in tqdm(file.readlines()):
    if line.startswith("SYN"):
        _, kws = line.split("::")
        kws = [x.strip() for x in kws.strip().split(",")]
        if len(kws) > 0:
            keywords.extend(kws)
'''
file_path = "keywords.txt"
keywords = list()
with open(file_path, "r", encoding='utf-8') as f:
    for x in f.readlines():
        keywords.append(x.strip())

In [5]:
def fetch_details(url):
    try:
        driver.get(url)
    except:
        return
    try:
        details = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'detail'))
        )
    except:
        return
    try:
        time.sleep(1)
        soup = BeautifulSoup(details.get_attribute("innerHTML"), 'html.parser')
        result = soup.find("td", {"class" : "label1"})
        curr_details = dict()
        data = list()
        for x in result.find_all("span"):
            data.append(x.text)
        result = soup.find("table", {"class" : "abc"})

        synset_id = result.find("label", {"id" : "sid"}).text.strip()
        synset_id = int(synset_id.strip())

        if synset_id in synonyms_dict:
            return

        part_of_speech = result.find("label", {"id" : "pos"}).text.strip()
        curr_details['POS'] = part_of_speech

        elem = driver.find_elements_by_class_name("semi_bar")
        buttons = elem[2].find_elements_by_class_name("title2")
        synonyms_data = dict()

        for lang in language_ids:
            buttons[language_ids[lang]].click()
            '''
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.TAG_NAME, 'td'))
                )
            except:
                pass    
             '''
            n_wait = 0
            key = lang+"_"
            while n_wait < WAIT_THRESHOLD:
                rel_content = driver.find_elements_by_id('rel_content')
                html_content = rel_content[0].get_attribute("innerHTML")
                if html_content != "wait...":
                    break
                time.sleep(1)
                n_wait += 1
            if n_wait == WAIT_THRESHOLD:
                curr_details[key+"gloss"] = ""
                curr_details[key+"example"] = ""
                synonyms_data[lang] = list()
                continue
            mid_area_soup = BeautifulSoup(html_content, 'html.parser')


            synonyms = list()
            for syns in mid_area_soup.find_all("a"):
                #synonyms.append((syns.text.strip(), syns["href"])) #uncomment for adding hrefs as well.
                syn = syns.text.strip()
                if lang == "Punjabi":
                    keywords_done.add(syn)
                synonyms.append(syn)
            synonyms_data[lang] = synonyms
            contents = mid_area_soup.find_all("td", {"class" : "content"})

            
            gloss = ""
            try:
                gloss = contents[-2].text.strip() #Gloss
            except:
                pass
            curr_details[key+"gloss"] = gloss

            example = ""
            try:
                example = contents[-1].text.strip()
            except:
                pass
            curr_details[key+"example"] = example.strip('"')
    except:
        return
    global_details[synset_id] = curr_details
    synonyms_dict[synset_id] = synonyms_data

In [6]:
def generate_snapshot(cnt):
    with open(gloss_snapshot_fname+str(cnt)+".json", "w") as f:
        json.dump(global_details, f)
    with open(synset_snapshot_fname+str(cnt)+".json", "w") as f:
        json.dump(synonyms_dict, f)
    print("Snapshot Generated for :", cnt)

In [7]:
def generate_final_files():
    with open(gloss_final_fname+".json", "w") as f:
        json.dump(global_details, f)
    with open(synset_final_fname+".json", "w") as f:
        json.dump(synonyms_dict, f)
    print("Final Dump Generated")

In [8]:
global_details.clear()
synonyms_dict.clear()
keywords_done.clear()

In [10]:
file_path = "global_details_pause.json"
file = open(file_path, "r", encoding='utf-8')
with open(file_path, "r") as f:
        global_details=json.load(f)
len(global_details)

2252

In [11]:
file_path = "synonyms_dict_pause.json"
file = open(file_path, "r", encoding='utf-8')
with open(file_path, "r") as f:
        synonyms_dict=json.load(f)
synonyms_dict

{'15005': {'Hindi': ['अजपति'],
  'English': [],
  'Bengali': ['অজপতি'],
  'Gujarati': ['અજપતિ'],
  'Marathi': [],
  'Odiya': ['ଅଜପତି'],
  'Punjabi': ['ਅਜਪਤੀ']},
 '22672': {'Hindi': ['रंगसाज', 'रंगसाज़', 'रङ्गसाज', 'रङ्गसाज़', 'रँगिया'],
  'English': [],
  'Bengali': ['রঙশিল্পী'],
  'Gujarati': ['રંગસાઝ', 'રંગારો', 'રંગરેજ'],
  'Marathi': [],
  'Odiya': ['ରଙ୍ଗାଜୀବ'],
  'Punjabi': ['ਰੰਗਸਾਜ਼', 'ਰੰਗਸਾਜ']},
 '19471': {'Hindi': ['एंडोरा', 'ऐंडोरा'],
  'English': ['Andorra', 'Principality_of_Andorra'],
  'Bengali': ['এন্ডোরা'],
  'Gujarati': ['એંડોરા'],
  'Marathi': ['अँडोरा'],
  'Odiya': ['ଏଣ୍ଡୋରା', 'ଏଂଡୋରା'],
  'Punjabi': ['ਅੰਡੋਰਾ', 'ਏਂਡੋਰਾ']},
 '24249': {'Hindi': ['ज़ारीना', 'जारीना'],
  'English': ['czarina', 'tsarina', 'tzarina', 'czaritza', 'tsaritsa'],
  'Bengali': ['জারিনা'],
  'Gujarati': ['જારન'],
  'Marathi': ['जारीना'],
  'Odiya': ['ଜାରୀନା'],
  'Punjabi': ['ਜ਼ਾਰੀਨਾ', 'ਜਾਰੀਨਾ']},
 '24120': {'Hindi': ['गैबोरोन', 'गबोरोन'],
  'English': ['Gaborone', 'capital_of_Botswana'],
  'Bengali':

In [12]:
import pickle
with open("keywords_done_pause.json", "rb") as f:
        keywords_done=pickle.load(f)
len(keywords_done)

6881

In [13]:
last_done = 0
for kw in tqdm(keywords):
    #print("Fetching for keyword : ", kw)
    if kw in keywords_done:
        continue
    kw=kw.replace(" ","")
    query_url = r"http://tdil-dc.in/indowordnet/first?langno=16&queryword="+kw
    fetch_details(query_url)
    if len(global_details) % SNAPSHOT_INTERVAL == 0 and len(global_details) != last_done:
        generate_snapshot(len(global_details))
        last_done = len(global_details)
        
generate_final_files()

  5%|▍         | 2529/52797 [22:34<97:50:42,  7.01s/it] 

Snapshot Generated for : 2400


  5%|▌         | 2748/52797 [52:47<125:34:17,  9.03s/it]

Snapshot Generated for : 2600


  6%|▌         | 2968/52797 [1:23:00<116:42:48,  8.43s/it]

Snapshot Generated for : 2800


  6%|▌         | 3189/52797 [1:53:22<117:35:46,  8.53s/it]

Snapshot Generated for : 3000


  6%|▋         | 3405/52797 [2:23:32<124:47:16,  9.10s/it]

Snapshot Generated for : 3200


  7%|▋         | 3644/52797 [2:53:45<99:56:27,  7.32s/it] 

Snapshot Generated for : 3400


  7%|▋         | 3876/52797 [3:24:07<122:11:56,  8.99s/it]

Snapshot Generated for : 3600


  8%|▊         | 4117/52797 [3:54:18<117:28:08,  8.69s/it]

Snapshot Generated for : 3800


  8%|▊         | 4360/52797 [4:24:30<106:54:30,  7.95s/it]

Snapshot Generated for : 4000


  9%|▊         | 4592/52797 [4:54:50<121:41:07,  9.09s/it]

Snapshot Generated for : 4200


  9%|▉         | 4824/52797 [5:25:11<120:52:00,  9.07s/it]

Snapshot Generated for : 4400


 10%|▉         | 5059/52797 [5:55:22<101:11:16,  7.63s/it]

Snapshot Generated for : 4600


 10%|█         | 5305/52797 [6:25:45<89:54:30,  6.82s/it] 

Snapshot Generated for : 4800


 11%|█         | 5549/52797 [6:56:20<115:31:11,  8.80s/it]

Snapshot Generated for : 5000


 11%|█         | 5805/52797 [7:26:29<112:11:43,  8.60s/it]

Snapshot Generated for : 5200


 11%|█▏        | 6062/52797 [8:03:17<114:42:04,  8.84s/it]

Snapshot Generated for : 5400


 12%|█▏        | 6314/52797 [8:33:26<88:04:25,  6.82s/it] 

Snapshot Generated for : 5600


 12%|█▏        | 6561/52797 [9:03:37<108:00:24,  8.41s/it]

Snapshot Generated for : 5800


 13%|█▎        | 6825/52797 [9:35:06<99:57:07,  7.83s/it] 

Snapshot Generated for : 6000


 13%|█▎        | 7072/52797 [10:05:40<110:07:29,  8.67s/it]

Snapshot Generated for : 6200


 14%|█▍        | 7333/52797 [10:36:25<77:42:47,  6.15s/it] 

Snapshot Generated for : 6400


 14%|█▍        | 7598/52797 [11:07:47<108:00:47,  8.60s/it]

Snapshot Generated for : 6600


 15%|█▍        | 7857/52797 [11:39:41<108:08:38,  8.66s/it]

Snapshot Generated for : 6800


 15%|█▌        | 8118/52797 [12:10:34<110:33:11,  8.91s/it]

Snapshot Generated for : 7000


 16%|█▌        | 8383/52797 [12:41:05<88:47:50,  7.20s/it] 

Snapshot Generated for : 7200


100%|██████████| 52797/52797 [12:59:48<00:00,  1.13it/s]   


Final Dump Generated


In [None]:
keywords[70]

In [17]:
len(global_details)

2252

In [23]:
with open("global_details_pause.json", "w") as f:
        json.dump(global_details, f)
with open("synonyms_dict_pause.json", "w") as f:
        json.dump(synonyms_dict, f)
with open("keywords_done_pause.json", "wb") as f:
        pickle.dump(keywords_done, f)