## Import libraries

In [None]:
import os
import re
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

import pandas as pd
from math import isnan
import requests

## Define functions and global variables

### Variables

In [None]:
# Global variables
# Base URL of the Cambridge dictionary; used as the prefix for word-page URLs
# and as the default prefix for audio download paths.
cambridge_url = 'https://dictionary.cambridge.org'
# Base URL of the Oxford Learner's dictionary (not referenced by the visible code).
oxford_url = 'https://www.oxfordlearnersdictionaries.com'
# Column names handed to pd.read_csv when loading the header-less file of
# previously crawled words.
header = ['Vocabulary', 'Type', 'Cloze', 'Phonetic', 'Audio', 'English meaning', 'Vietnamese meaning', 'Example']
# NOTE(review): the bare list below is an expression statement that changes no
# state (a notebook merely echoes it as cell output); it looks like a leftover
# note of the crawled columns - confirm before removing.
['Type', 'Cloze', 'Phonetic', 'Audio', 'English meaning', 'Example']

### Selenium Driver function

In [None]:
def initialize_driver():
    """Create and return a Chrome WebDriver configured for crawling.

    The browser preferences set the managed image-content setting to 2, a disk
    cache size entry, and a hard-coded download directory for audio files.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()
    download_path = r'/Users/btp712/Code/Anki crawler/audio/'
    prefs = {
        "profile.managed_default_content_settings.images": 2,
        'disk-cache-size': 4096,
        "download.default_directory": download_path,
    }
    chrome_options.add_experimental_option("prefs", prefs)  # Manage image loading and run on disk cache
    # chrome_options.add_argument("--headless") # Runs Chrome in headless mode
    chrome_options.add_argument('--no-sandbox')  # Bypass OS security model
    chrome_options.add_argument('--disable-dev-shm-usage')  # overcome limited resource problems

    # Selenium 4 expects the driver binary to be supplied through a Service
    # object; passing the path as the first positional argument is the
    # deprecated (and since removed) ``executable_path`` form.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    return driver

### Support functions

In [None]:
def initalize_request(url, headers=({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0'}), timeout=30):
    """Send an HTTP GET request and return the response.

    Args:
        url: Address to fetch.
        headers: Request headers; defaults to a Firefox User-Agent header.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the whole crawl indefinitely.

    Returns:
        The ``requests.Response`` returned by ``requests.get``.
    """
    return requests.get(url=url, headers=headers, timeout=timeout)

In [None]:
def get_dictionary_page_source(url):
    """Fetch ``url`` and parse it, reporting whether that exact page was served.

    Returns:
        ``(soup, True)`` when the response status is 200 and the final
        response URL equals ``url``; ``(None, False)`` otherwise.
    """
    response = initalize_request(url)

    # A different final URL means the request did not end on the asked page.
    page_served = response.status_code == 200 and response.url == url
    if not page_served:
        return None, False

    return BeautifulSoup(response.content, 'html.parser'), True

In [None]:
def get_word_panel_by_type(word, word_type=None, dictionary_url=cambridge_url):
    """Return the dictionary panel of ``word``, optionally matching a word type.

    Args:
        word: Word or phrase to look up; spaces become hyphens and the text is
            lower-cased to build the page URL.
        word_type: Text of the part-of-speech label to match (compared with the
            text of the panel's ``span.pos.dpos`` tag). When omitted, or when
            no panel matches, the first panel is returned.
        dictionary_url: Base URL the word page path is appended to.

    Returns:
        The matching panel tag, the first panel as a fallback, or ``None`` when
        the page could not be fetched or contains no panel.
    """
    word_page_url = dictionary_url + r'/dictionary/english/' + word.replace(' ', '-').lower()
    word_page_src, status_code = get_dictionary_page_source(word_page_url)

    if not status_code:
        return None

    # select panel by word types
    word_panels = word_page_src.find_all('div', {'class': 'pr dictionary',
                                                 'role': 'tabpanel'})
    if not word_panels:
        return None

    if word_type:
        for panel in word_panels:
            # A panel without a part-of-speech label simply cannot match.
            pos_tag = panel.find('span', {'class': 'pos dpos'})
            if pos_tag is not None and pos_tag.get_text() == word_type:
                return panel

    return word_panels[0]

In [None]:
def get_sound(soup, file_path, dictionary_url=cambridge_url, included_pattern=None):
    """Download an audio file referenced by ``soup`` and save it to ``file_path``.

    Args:
        soup: Parsed page (or fragment) whose ``<source>`` tags are searched.
        file_path: Destination path; nothing is downloaded if it already exists.
        dictionary_url: Base URL prefixed to the tag's ``src`` value.
        included_pattern: Optional regular expression; the first ``src`` that
            matches it is preferred over the first ``src`` on the page.

    Returns:
        ``file_path`` when a file was written. ``None`` when the file already
        existed, when no ``<source>`` tag with a ``src`` was found, or when the
        download did not answer with status 200.
    """
    if os.path.exists(file_path):
        return None

    # get all download source
    sources = [tag['src'] for tag in soup.find_all('source') if tag.get('src')]
    if not sources:
        return None

    download_url = sources[0]
    if included_pattern:
        for src in sources:
            if re.search(included_pattern, src):
                download_url = src
                break

    download_response = initalize_request(dictionary_url + download_url)
    # Do not store an error page under an audio file name.
    if download_response.status_code != 200:
        return None

    target_dir = os.path.dirname(file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with open(file_path, 'wb') as f:
        f.write(download_response.content)

    return file_path

In [None]:
def get_phonetic(soup):
    """Return the phonetic transcription text found in ``soup``.

    The second ``span.pron.dpron`` tag is preferred, as in the original
    implementation; when only one such tag exists it is used instead of
    raising ``IndexError``.

    Returns:
        The tag's text, or ``None`` when no pronunciation tag is present.
    """
    phonetic_tags = soup.find_all('span', {'class': 'pron dpron'})
    if not phonetic_tags:
        return None

    chosen = phonetic_tags[1] if len(phonetic_tags) > 1 else phonetic_tags[0]
    return chosen.get_text()

In [None]:
def get_english_meaning(soup):
    """Return the text of the first definition in ``soup``.

    Surrounding whitespace is stripped and one trailing character is dropped
    when it is not a letter.

    Returns:
        The cleaned definition text, or ``None`` when ``soup`` contains no
        ``div`` with class ``def ddef_d db``.
    """
    definition_tag = soup.find('div', 'def ddef_d db')
    if definition_tag is None:
        return None

    meaning = definition_tag.get_text().strip()
    # Guard the index: an empty definition has no last character to inspect.
    if meaning and not meaning[-1].isalpha():
        meaning = meaning[:-1]

    return meaning

In [None]:
def get_examples(soup):
    """Collect the example sentences of the first definition body in ``soup``.

    Each example is rendered as ``- <sentence>`` followed by a blank line; when
    the example carries a ``span.lu.dlu`` label, the label is put in
    parentheses before the sentence, separated by `` | ``.

    Returns:
        The concatenated examples, or ``None`` when ``soup`` has no
        ``div`` with class ``def-body ddef_b``.
    """
    body = soup.find('div', 'def-body ddef_b')
    if not body:
        return None

    def render(example_tag):
        # Render one example block as a bullet line.
        label_tag = example_tag.find('span', 'lu dlu')
        sentence = example_tag.find('span', 'eg deg').get_text()
        prefix = f'({label_tag.get_text()}) | ' if label_tag else ''
        return '- ' + prefix + sentence + '\n\n'

    return ''.join(render(tag) for tag in body.find_all('div', 'examp dexamp'))

In [None]:
def get_images(soup):
    """Placeholder for extracting images from a parsed dictionary page.

    The original definition had no body, which is a syntax error; until the
    extraction is written this stub fails loudly instead.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError('get_images is not implemented yet')

In [None]:
def get_phonetic_sound_engmean_examples(vocabs_arr, dictionary_url=cambridge_url):
    """Crawl phonetic, audio, English meaning and examples for each word.

    Args:
        vocabs_arr: Iterable of words/phrases to look up.
        dictionary_url: Base URL the word page path is appended to.

    Returns:
        Four lists aligned with ``vocabs_arr``: phonetic texts, Anki-style
        ``[sound:<path>]`` references, English meanings and example strings.
        A word whose page could not be fetched gets ``None`` for the first
        three and ``''`` for the examples.
    """
    res_ipa = []
    res_sound = []
    res_eng_mean = []
    res_example = []

    for vocab in vocabs_arr:
        vocab_page_url = dictionary_url + r'/dictionary/english/' + vocab.replace(' ', '-').lower()
        page_src, status = get_dictionary_page_source(vocab_page_url)

        # status is True on success, so placeholders belong to the failure case.
        if not status:
            res_ipa.append(None)
            res_sound.append(None)
            res_eng_mean.append(None)
            res_example.append('')
            continue

        # get phonetic
        phonetic_tag = page_src.find('span', 'pron dpron')
        res_ipa.append(phonetic_tag.text if phonetic_tag is not None else None)

        # get sound
        filepath = os.getcwd() + r'/../data/audio/' + vocab + '.mp3'
        res_sound.append('[sound:' + filepath + ']')
        get_sound(page_src, filepath)

        # get english meaning & examples
        definition_box = page_src.find('div', class_='def-block ddef_block')
        if definition_box is None:
            res_eng_mean.append(None)
            res_example.append('')
            continue

        definition_tag = definition_box.find('div', 'def ddef_d db')
        eng_mean = definition_tag.text.strip() if definition_tag is not None else None
        # Drop one trailing character unless it is a letter or a full stop.
        if eng_mean and eng_mean[-1] != '.' and not eng_mean[-1].isalpha():
            eng_mean = eng_mean[:-1]
        res_eng_mean.append(eng_mean)

        examples_src = definition_box.find_all('div', 'examp dexamp')
        res_example.append('\n\n'.join('- ' + example.text.strip() for example in examples_src))

    return res_ipa, res_sound, res_eng_mean, res_example

In [None]:
def create_cloze(word):
    """Build a cloze hint: keep the first character, mask the other word characters.

    Every character after the first that matches ``\\w`` becomes ``_``; spaces
    and punctuation are kept, e.g. ``'step family'`` -> ``'s___ ______'``.

    Args:
        word: Word or phrase to mask.

    Returns:
        The masked string; ``''`` for an empty input.
    """
    if not word:
        return ''

    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    return word[0] + re.sub(r'\w', '_', word[1:])

In [None]:
def create_full_wordType(short_wordType):
    """Expand a word-type abbreviation to its full name.

    ``'n'``, ``'v'`` and ``'adj'`` map to ``'noun'``, ``'verb'`` and
    ``'adjective'``; every other value maps to ``'adverb'``.
    """
    known_types = (('n', 'noun'), ('v', 'verb'), ('adj', 'adjective'))
    for abbreviation, full_name in known_types:
        if short_wordType == abbreviation:
            return full_name

    return 'adverb'

In [None]:
def check_existane(entry, existed_df):
    """Tell whether ``existed_df`` has a row with the entry's Vocabulary and Type."""
    same_word = existed_df.Vocabulary == entry.Vocabulary
    same_type = existed_df.Type == entry.Type
    return (same_word & same_type).any()

### Crawling function

In [None]:
def crawl(vocabs_df, existed_path=None):
    """Crawl dictionary data for new words and merge it with a previous export.

    ``vocabs_df`` is modified in place: ``Vocabulary`` is lower-cased and
    ``Type`` is expanded with ``create_full_wordType``.

    Args:
        vocabs_df: Frame of words to process. Assumes ``Vocabulary``, ``Type``
            and ``Vietnamese meaning`` columns and a default 0..n-1 index
            (TODO confirm against the input files).
        existed_path: Optional path of a header-less CSV with the columns
            listed in ``header``. Words already present there (same
            Vocabulary and Type) are not crawled again.

    Returns:
        ``(need2Import_df, final_df)``: the newly crawled rows, and the full
        table (equal to the new rows when ``existed_path`` is falsy).
    """
    # Series.apply takes no ``axis`` argument; the string accessor lower-cases directly.
    vocabs_df['Vocabulary'] = vocabs_df['Vocabulary'].str.lower()
    vocabs_df['Type'] = vocabs_df['Type'].apply(create_full_wordType)

    existed_df = None
    mask = None
    need2Import_df = vocabs_df.copy()
    if existed_path:
        existed_df = pd.read_csv(existed_path, names=header, index_col=False)
        mask = vocabs_df.apply(lambda x: check_existane(x, existed_df), axis=1).to_numpy(dtype=bool)
        # Copy so the column inserts below act on an independent frame, not a slice.
        need2Import_df = vocabs_df.iloc[~mask].copy()

    # crawl
    ipa_arr, soundPath_arr, eng_mean_arr, example_arr = get_phonetic_sound_engmean_examples(need2Import_df['Vocabulary'].values)

    need2Import_df.insert(need2Import_df.shape[1], 'Cloze', need2Import_df['Vocabulary'].apply(create_cloze))
    need2Import_df.insert(need2Import_df.shape[1], 'Phonetic', ipa_arr)
    need2Import_df.insert(need2Import_df.shape[1], 'Audio', soundPath_arr)
    need2Import_df.insert(need2Import_df.shape[1], 'English meaning', eng_mean_arr)
    need2Import_df.insert(need2Import_df.shape[1], 'Example', example_arr)

    # merge & update
    final_df = need2Import_df.copy()
    if existed_df is not None:
        # NOTE(review): row i of the merge is assumed to correspond to row i of
        # vocabs_df, which only holds if (Vocabulary, Type) is unique in the
        # previous file - confirm.
        final_df = pd.merge(vocabs_df, existed_df, how='left', on=['Vocabulary', 'Type'])
        crawled_cols = ['Cloze', 'Phonetic', 'Audio', 'English meaning', 'Example']
        start = 0
        for i, already_existed in enumerate(mask):
            # update new words into the previous version
            if not already_existed:
                for col in crawled_cols:
                    final_df.at[i, col] = need2Import_df.iloc[start][col]
                start += 1

            # update vietnamese meaning
            new_meaning = final_df.at[i, 'Vietnamese meaning_x']
            old_meaning = final_df.at[i, 'Vietnamese meaning_y']
            if isinstance(old_meaning, str) and new_meaning != old_meaning:
                # Write through .at: ``final_df.iloc[i][col] += ...`` only
                # changes a temporary row copy and never reaches final_df.
                if isinstance(new_meaning, str):
                    final_df.at[i, 'Vietnamese meaning_x'] = new_meaning + "\n" * 2 + old_meaning
                else:
                    final_df.at[i, 'Vietnamese meaning_x'] = old_meaning

        # delete the redundant vietnamese meaning column and rename the other one
        final_df.drop('Vietnamese meaning_y', axis=1, inplace=True)
        final_df.rename(columns={'Vietnamese meaning_x': 'Vietnamese meaning'}, inplace=True)

    return need2Import_df, final_df

In [None]:
def re_crawl(df, index, url):
    """Crawl one word again from an explicitly given page URL.

    Args:
        df: Frame holding the word; its ``Vocabulary`` value at position
            ``index`` names the audio file.
        index: Row position of the word in ``df``.
        url: Full URL of the dictionary page to read.

    Returns:
        ``(phonetic, audio_path, eng_mean, examples)``, or ``None`` when the
        page could not be fetched. ``phonetic`` and ``eng_mean`` are ``None``
        and ``examples`` is ``''`` when the page lacks the corresponding tags.
    """
    page_src, status = get_dictionary_page_source(url)
    # status is True on success; bail out only when the fetch failed.
    if not status:
        return None

    # get phonetic
    phonetic_tag = page_src.find('span', 'pron dpron')
    phonetic = phonetic_tag.text if phonetic_tag is not None else None

    # get sound
    filepath = os.getcwd() + r'/data/audio/' + df.iloc[index]['Vocabulary'] + '.mp3'
    audio_path = '[sound:' + filepath + ']'
    get_sound(page_src, filepath)

    # get english meaning & examples
    eng_mean = None
    examples = ''
    definition_box = page_src.find('div', class_='def-block ddef_block')
    if definition_box is not None:
        definition_tag = definition_box.find('div', 'def ddef_d db')
        if definition_tag is not None:
            eng_mean = definition_tag.text.strip()
            # Drop one trailing character unless it is a letter or a full stop.
            if eng_mean and eng_mean[-1] != '.' and not eng_mean[-1].isalpha():
                eng_mean = eng_mean[:-1]

        examples_src = definition_box.find_all('div', 'examp dexamp')
        examples = '\n\n'.join('- ' + example.text.strip() for example in examples_src)

    return phonetic, audio_path, eng_mean, examples

## Crawl

In [None]:
vocabs_df = pd.read_csv(input('Path to new vocabularies file: '))
existed_path = input('Path to lasted file (if this is the first file, please enter 0): ').strip()
# The prompt uses "0" to mean "there is no previous file", but the string "0"
# is truthy and would be opened as a path by crawl(); map it (and an empty
# answer) to None so the merge step is skipped.
if existed_path in ('', '0'):
    existed_path = None

In [None]:
# Run the crawl: the first result holds the newly crawled rows, the second the
# full table returned by crawl().
need2Import_df, final_df = crawl(vocabs_df, existed_path)

In [None]:
# Show the (rows, columns) shape of both result frames as a quick check.
final_df.shape, need2Import_df.shape

In [None]:
# Put both frames into the export column order (same names and order as `header`).
reodered_cols = ['Vocabulary', 'Type', 'Cloze', 'Phonetic', 'Audio', 'English meaning', 'Vietnamese meaning', 'Example']
need2Import_df = need2Import_df[reodered_cols]
final_df = final_df[reodered_cols]

## Export to result file

In [None]:
# Ask for confirmation before anything is written to disk.
exported = input('Do you want to export to file (yes/no): ')

# Any answer other than 'yes' raises AssertionError here.
# NOTE(review): assert statements are skipped when Python runs with -O.
assert exported == 'yes'

In [None]:
# Write both frames as header-less CSV files, asking for each file name.
for k, v in {'lasted file': (final_df, r'../crawled_data/'), '2Import_df': (need2Import_df, r'../new2import_data/')}.items():
    exported_filepath = input(f'Your {k} name:')
    df, exported_folder = v

    # to_csv does not create missing directories.
    os.makedirs(exported_folder, exist_ok=True)

    filepath = exported_folder + exported_filepath
    if os.path.isfile(filepath + '.csv'):
        # Avoid overwriting an existing export: append a timestamp, separated
        # from the name and free of ':' and spaces so it is a valid file name
        # on every platform.
        filepath += datetime.now().strftime("_%d-%m-%Y_%H-%M-%S")

    df.to_csv(filepath + '.csv', sep=',', header=False, index=False)

## Re-crawl

In this section, we re-crawl words that could not be crawled in the previous section.

In [None]:
# Continue only when the user answers 'yes'; any other answer raises AssertionError.
# NOTE(review): assert statements are skipped when Python runs with -O.
assert input('Do you want to re-crawl (yes/no): ') == 'yes'

In [None]:
# Re-crawl the word in row 1 from an explicit page URL and store the results.
update_values = re_crawl(vocabs_df, 1, r'https://dictionary.cambridge.org/vi/dictionary/english/stepfamily?q=step+family')
update_params = ['Phonetic', 'Audio', 'English meaning', 'Example']

# re_crawl can return None instead of a tuple; indexing that would raise TypeError.
if update_values is not None:
    for param, value in zip(update_params, update_values):
        vocabs_df.at[1, param] = value
else:
    print('Re-crawl failed: the page could not be fetched.')