In [2]:
import json
import time
from urllib.parse import quote

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

In [3]:
# Start a Chrome WebDriver session. `driver` is the module-level browser handle
# that fetch_details() uses to load pages and click the language buttons.
driver = webdriver.Chrome()

In [4]:
# --- Tunable constants -------------------------------------------------------
# A snapshot is written each time the number of collected synsets reaches a new
# multiple of this value.
SNAPSHOT_INTERVAL = 3
# Maximum number of one-second polls to wait for a language panel to load.
WAIT_THRESHOLD = 10

# --- Output file name stems (".json" is appended when the files are written) --
gloss_snapshot_fname = "gloss_examples_SNAPSHOT"
synset_snapshot_fname = "synset_SNAPSHOT"
gloss_final_fname = "gloss_final"
synset_final_fname = "synset_final"

# --- Language name -> position of its button among the `title2` elements ------
# Insertion order matters: fetch_details() visits the languages in this order.
language_ids = dict([
    ("Hindi", 0),
    ("English", 1),
    ("Bengali", 3),
    ("Gujarati", 5),
    ("Marathi", 11),
    ("Odiya", 18),
    ("Punjabi", 16),
])

# --- Mutable scraping state, filled in by fetch_details() ---------------------
global_details = {}      # synset id -> POS plus per-language gloss/example
synonyms_dict = {}       # synset id -> {language: [synonyms]}
keywords_done = set()    # Punjabi synonyms already seen, used to skip keywords

In [5]:
# Alternative keyword source (disabled, kept for reference): collect every
# comma-separated entry found on the "SYN" lines of ./iwndata/punjabi.syns.
#
# file_path = "./iwndata/punjabi.syns"
# file = open(file_path, "r", encoding='utf-8')
# keywords = list()
# for line in tqdm(file.readlines()):
#     if line.startswith("SYN"):
#         _, kws = line.split("::")
#         kws = [x.strip() for x in kws.strip().split(",")]
#         if len(kws) > 0:
#             keywords.extend(kws)

# Load the keywords to query: one keyword per line of keywords.txt.
file_path = "keywords.txt"
keywords = list()
with open(file_path, "r", encoding='utf-8') as f:
    for line in f:
        kw = line.strip()
        # Skip blank lines: an empty keyword would only trigger a useless
        # page fetch with an empty `queryword`.
        if kw:
            keywords.append(kw)

In [6]:
def _parse_language_panel(html_content):
    """Extract synonyms, gloss and example from one language panel's HTML.

    Parameters
    ----------
    html_content : str
        Inner HTML of the ``rel_content`` element once a language has loaded.

    Returns
    -------
    tuple
        ``(synonyms, gloss, example)`` where ``synonyms`` is the stripped text
        of every ``<a>`` tag, ``gloss`` is the stripped text of the
        second-to-last ``td.content`` cell and ``example`` is the stripped text
        of the last ``td.content`` cell with surrounding double quotes removed.
        A missing cell yields an empty string.
    """
    panel_soup = BeautifulSoup(html_content, 'html.parser')
    synonyms = [link.text.strip() for link in panel_soup.find_all("a")]
    contents = panel_soup.find_all("td", {"class": "content"})
    gloss = contents[-2].text.strip() if len(contents) >= 2 else ""
    example = contents[-1].text.strip() if contents else ""
    return synonyms, gloss, example.strip('"')


def fetch_details(url):
    """Load ``url`` in the shared browser and record the synset it shows.

    The page is expected to expose a ``detail`` element containing the synset
    id (``label#sid``) and part of speech (``label#pos``). For every language
    in ``language_ids`` the matching button is clicked and the ``rel_content``
    panel is polled (up to ``WAIT_THRESHOLD`` one-second polls) until it no
    longer shows the ``wait...`` placeholder; synonyms, gloss and example are
    then parsed from it. A language that never loads gets empty values.

    Side effects (module-level state):
      * ``global_details[synset_id]`` receives the POS and per-language
        ``<lang>_gloss`` / ``<lang>_example`` entries.
      * ``synonyms_dict[synset_id]`` receives ``{language: [synonyms]}``.
      * ``keywords_done`` receives every Punjabi synonym encountered.

    Parameters
    ----------
    url : str
        Address of the result page to scrape.

    Returns
    -------
    None
        The function is best-effort: on any page-load, lookup or parsing error,
        or when the synset id is already present in ``synonyms_dict``, it
        returns without recording anything. Only ``Exception`` subclasses are
        swallowed, so ``KeyboardInterrupt`` still stops a long scraping run.
    """
    try:
        driver.get(url)
        details = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'detail'))
        )
    except Exception:
        return

    try:
        # Short grace period after the element appears, before reading its HTML.
        time.sleep(1)
        soup = BeautifulSoup(details.get_attribute("innerHTML"), 'html.parser')

        # Pages lacking a td.label1 cell were always rejected (the lookup on it
        # used to fail); keep that gate explicit.
        if soup.find("td", {"class": "label1"}) is None:
            return

        result = soup.find("table", {"class": "abc"})
        synset_id = int(result.find("label", {"id": "sid"}).text.strip())

        # Several keywords can lead to the same synset; scrape it only once.
        if synset_id in synonyms_dict:
            return

        curr_details = dict()
        curr_details['POS'] = result.find("label", {"id": "pos"}).text.strip()

        # Locator-based lookups: the find_elements_by_* helpers no longer
        # exist in Selenium 4.
        semi_bars = driver.find_elements(By.CLASS_NAME, "semi_bar")
        buttons = semi_bars[2].find_elements(By.CLASS_NAME, "title2")
        synonyms_data = dict()

        for lang, button_index in language_ids.items():
            buttons[button_index].click()
            key = lang + "_"

            # NOTE(review): if the panel still holds the previous language's
            # content right after the click (before the placeholder appears),
            # that stale content would be parsed — confirm the page shows
            # "wait..." immediately.
            for _ in range(WAIT_THRESHOLD):
                rel_content = driver.find_elements(By.ID, 'rel_content')
                html_content = rel_content[0].get_attribute("innerHTML")
                if html_content != "wait...":
                    break
                time.sleep(1)
            else:
                # Panel never loaded: record empty values and move on.
                curr_details[key + "gloss"] = ""
                curr_details[key + "example"] = ""
                synonyms_data[lang] = list()
                continue

            synonyms, gloss, example = _parse_language_panel(html_content)
            if lang == "Punjabi":
                # These words resolve to this synset, so the main loop can
                # skip them as keywords later.
                keywords_done.update(synonyms)
            synonyms_data[lang] = synonyms
            curr_details[key + "gloss"] = gloss
            curr_details[key + "example"] = example
    except Exception:
        return

    global_details[synset_id] = curr_details
    synonyms_dict[synset_id] = synonyms_data

In [7]:
def generate_snapshot(cnt):
    """Write the current scraping state to two JSON snapshot files.

    ``global_details`` goes to ``<gloss_snapshot_fname><cnt>.json`` and
    ``synonyms_dict`` to ``<synset_snapshot_fname><cnt>.json``; a confirmation
    line is printed afterwards.

    Parameters
    ----------
    cnt
        Value appended (via ``str``) to both file name stems and echoed in the
        confirmation message.
    """
    suffix = str(cnt) + ".json"
    targets = (
        (gloss_snapshot_fname, global_details),
        (synset_snapshot_fname, synonyms_dict),
    )
    for stem, payload in targets:
        with open(stem + suffix, "w") as out:
            json.dump(payload, out)
    print("Snapshot Generated for :", cnt)

In [8]:
def generate_final_files():
    """Write the final JSON dumps of the scraped data.

    ``global_details`` is saved to ``<gloss_final_fname>.json`` and
    ``synonyms_dict`` to ``<synset_final_fname>.json``; a confirmation line is
    printed once both files are written.
    """
    outputs = (
        (gloss_final_fname, global_details),
        (synset_final_fname, synonyms_dict),
    )
    for stem, payload in outputs:
        with open(stem + ".json", "w") as out:
            json.dump(payload, out)
    print("Final Dump Generated")

In [9]:
# Empty all three scraping-state containers in place before starting a run.
for _container in (global_details, synonyms_dict, keywords_done):
    _container.clear()

In [10]:
# Query the site for every keyword, writing periodic snapshots, then dump the
# final result files.
last_done = 0  # synset count at which the most recent snapshot was written
for kw in tqdm(keywords):
    # Already seen as a Punjabi synonym of a synset scraped earlier.
    if kw in keywords_done:
        continue
    # Percent-encode the keyword so spaces or reserved characters
    # ('&', '#', '+', '%', ...) cannot truncate or alter the query string.
    query_url = "http://tdil-dc.in/indowordnet/first?langno=16&queryword=" + quote(kw, safe="")
    fetch_details(query_url)
    n_collected = len(global_details)
    # Snapshot only when the count reaches a new multiple of SNAPSHOT_INTERVAL;
    # `last_done` prevents rewriting the same snapshot when a keyword adds nothing.
    if n_collected % SNAPSHOT_INTERVAL == 0 and n_collected != last_done:
        generate_snapshot(n_collected)
        last_done = n_collected

generate_final_files()

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.13s/it]

Final Dump Generated





In [68]:
# Ad-hoc inspection of the keyword at index 70 (needs at least 71 loaded keywords).
keywords[70]

'ਵਜਵਾਇਆ'

In [12]:
# Display the collected records: synset id -> POS and per-language gloss/example.
global_details

{19471: {'POS': 'NOUN',
  'Hindi_gloss': 'स्पेन तथा फ्रांस के बीच स्थित एक देश',
  'Hindi_example': 'एंडोरा की राजधानी एंडोरा ल विला है ।',
  'English_gloss': 'a small republic in the eastern Pyrenees between Spain and France',
  'English_example': '',
  'Bengali_gloss': 'স্পেন এবং ফ্রান্সের মধ্যবর্তী দেশ',
  'Bengali_example': 'এন্ডোরার রাজধানী এন্ডোরা লা ভেলা',
  'Gujarati_gloss': 'સ્પેન તથા ફ્રાંસની વચ્ચે આવેલો એક દેશ',
  'Gujarati_example': 'એંડોરાની રાજધાની એંડોરા લ વિલા છે.',
  'Marathi_gloss': 'स्पेन व फ्रांस ह्यांमधील एक युरोपीय देश',
  'Marathi_example': 'अँडोरा ला व्हेल्या ही अँडोराची राजधानी आहे.',
  'Odiya_gloss': 'ସ୍ପେନ ତଥା ଫ୍ରାନ୍ସର ମଧ୍ୟରେ ସ୍ଥିତ ଏକ ଦେଶ',
  'Odiya_example': 'ଏଣ୍ଡୋରାର ରାଜଧାନୀ ହେଉଛି ଏଣ୍ଡୋରା ଲ ବିଲା',
  'Punjabi_gloss': 'ਸਪੇਨ ਅਤੇ ਫਰਾਂਸ ਦੇ ਵਿਚ ਸਥਿਤ ਇਕ ਦੇਸ਼',
  'Punjabi_example': 'ਅੰਡੋਰਾ ਦੀ ਰਾਜਧਾਨੀ ਅੰਡੋਰਾ ਲਾ ਵਿਲਾ ਹੈ'}}

In [11]:
# Display the collected synonyms: synset id -> {language: [synonyms]}.
synonyms_dict

{19471: {'Hindi': ['एंडोरा', 'ऐंडोरा'],
  'English': ['Andorra', 'Principality_of_Andorra'],
  'Bengali': ['এন্ডোরা'],
  'Gujarati': ['એંડોરા'],
  'Marathi': ['अँडोरा'],
  'Odiya': ['ଏଣ୍ଡୋରା', 'ଏଂଡୋରା'],
  'Punjabi': ['ਅੰਡੋਰਾ', 'ਏਂਡੋਰਾ']}}