## Import libraries

In [1]:
import os

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

import pandas as pd
import urllib
import requests

## Define functions and global variable

### Global variable

In [2]:
# Global variable
cambridge_url = 'https://dictionary.cambridge.org'
oxford_url = 'https://www.oxfordlearnersdictionaries.com'


### Selenium driver

In [3]:
# Initialize chrome driver
def initialize_driver(): 
    chrome_options = webdriver.ChromeOptions()
    download_path = r'/Users/btp712/Code/Anki crawler/audio/'
    prefs={"profile.managed_default_content_settings.images": 2, 'disk-cache-size': 4096,
           "download.default_directory": download_path}
    chrome_options.add_experimental_option("prefs", prefs) # Manage image loading and run on disk cache
    # chrome_options.add_argument("--headless") # Runs Chrome in headless mode
    chrome_options.add_argument('--no-sandbox') # Bypass OS security model
    chrome_options.add_argument('--disable-dev-shm-usage') # overcome limited resource problems
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

    return driver

### Crawling functions

In [4]:
def read_vocabs_file(filename):
    res = []
    with open(filename) as file:
        for v in file:
            res.append(v.strip())

    return res

In [5]:
def get_vocabPageSrc(url):
    page = requests.get(url, headers=({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0'}))

    soup = None
    status = True
    if page.status_code == 200 and page.url == url:
        soup = BeautifulSoup(page.content, 'html.parser')
        status = False

    return soup, status

In [7]:
def get_phonetic_sound_engmean_examples(vocabs_arr):
    res_eng_mean = []
    res_example = []

    for vocab in vocabs_arr:
        vocab_page_url = cambridge_url + r'/dictionary/english/' + vocab.replace(' ', '-').lower()
        page_src, status = get_vocabPageSrc(vocab_page_url)

        if status:
            res_eng_mean.append(None)
            res_example.append('')
            continue

        # get english meaming & examples
        definition_box = page_src.find('div', class_='def-block ddef_block')
        eng_mean = definition_box.find('div', 'def ddef_d db').text.strip()
        if eng_mean[-1] != '.' and not eng_mean[-1].isalpha():
            eng_mean = eng_mean[:-1]
        res_eng_mean.append(eng_mean)
        examples_src = definition_box.find_all('div', 'examp dexamp')
        
        examples = ''
        for example in examples_src:
            if examples != '': examples += '\n\n';
            examples += '- ' + example.text.strip()

        res_example.append(examples)

    return res_eng_mean, res_example

In [8]:
import re
def create_cloze(word):
    new_word = word[0] + re.sub('\w', '_', word[1:])

    return new_word

In [9]:
def create_full_wordsType(short_wordType):
    res = ''
    if short_wordType == 'n': res = 'noun'
    elif short_wordType == 'v': res = 'verb'
    elif short_wordType == 'adj': res = 'adjective'
    else: res = 'adverb'

    return res

In [10]:
def crawl(vocabs_arr, vocabs_df):
    ipa_arr, soundPath_arr, eng_mean_arr, example_arr = get_phonetic_sound_engmean_examples(vocabs_arr)
    vocabs_df['Type'] = vocabs_df['Type'].apply(create_full_wordsType, 0)
    vocabs_df['Cloze'] = vocabs_df['Vocabulary'].apply(create_cloze, 0)
    vocabs_df['Phonetic'] = ipa_arr
    vocabs_df['Audio'] = soundPath_arr
    vocabs_df['English meaning'] = eng_mean_arr
    vocabs_df['Example'] = example_arr

In [11]:
def re_crawl(df, index, url):
    # fill phonetic, english meaning and vietnamese meaning
    phonetic = audio_path = eng_mean = None
    examples = []

    page_src, status = get_vocabPageSrc(url)
    if status:
        return
        
    # get phonetic
    phonetic = page_src.find('span', 'pron dpron').text

    # get sound
    filepath = os.getcwd() + r'/data/audio/' + df.iloc[index].vocabulary + '.mp3'
    audio_path = ('[sound:' + filepath + ']')
    get_sound(page_src, filepath)

    # get english meaming & examples
    definition_box = page_src.find('div', class_='def-block ddef_block')
    eng_mean = definition_box.find('div', 'def ddef_d db').text.strip()
    if eng_mean[-1] != '.' and not eng_mean[-1].isalpha():
        eng_mean = eng_mean[:-1]
    examples_src = definition_box.find_all('div', 'examp dexamp')
    
    examples = ''
    for example in examples_src:
        if examples != '': examples += '\n\n';
        examples += '- ' + example.text.strip()

    return phonetic, audio_path, eng_mean, examples

## Crawl

In [12]:
# vocabs_file = input("new words filepath: ")
# vocabs_arr = read_vocabs_file(vocabs_file)


# vocabs_df = pd.read_csv(vocabs_file)
vocabs_df = pd.read_csv('./data/New words/Vocabulary/Collin/Listening for IELTS/Unit 6 - Fame.csv')

In [13]:
vocabs_df

Unnamed: 0,Vocabulary,Type,Vietnamese meaning
0,Fame,n,Danh tiếng
1,Famous,adj,Nổi tiếng
2,Act,n,- Hành động\r\n- Cư xử\r\n- Diễn xuất
3,Actress,n,Diễn viên nữ
4,Actor,n,Diễn viên nam
5,Director,n,- Đạo diễn\r\n- Giám đốc\r\n- Người chỉ huy
6,Agent,n,
7,Musician,n,Nhạc sĩ
8,Writer,n,Tác giả
9,Program,v,Lập trình


In [14]:
crawl(vocabs_df.Vocabulary.values, vocabs_df)

In [15]:
vocabs_df = vocabs_df[['Vocabulary', 'Type', 'Cloze', 'Phonetic', 'Audio', 'English meaning', 'Vietnamese meaning', 'Example']]

vocabs_df

Unnamed: 0,Vocabulary,Type,Cloze,Phonetic,Audio,English meaning,Vietnamese meaning,Example
0,Fame,noun,F___,/feɪm/,[sound:/Users/btp712/English/Anki/data/audio/F...,the state of being known or recognized by many...,Danh tiếng,- rise to fame She first rose to fame as a sin...
1,Famous,adjective,F_____,/ˈfeɪ.məs/,[sound:/Users/btp712/English/Anki/data/audio/F...,known and recognized by many people,Nổi tiếng,- a famous actress/building\n\n- famous for Ma...
2,Act,noun,A__,/ækt/,[sound:/Users/btp712/English/Anki/data/audio/A...,to behave in the stated way,- Hành động\r\n- Cư xử\r\n- Diễn xuất,- act like Don't be so silly - you're acting l...
3,Actress,noun,A______,/ˈæk.trəs/,[sound:/Users/btp712/English/Anki/data/audio/A...,a woman who pretends to be someone else while ...,Diễn viên nữ,- She's the highest-paid actress in Hollywood.
4,Actor,noun,A____,/ˈæk.tər/,[sound:/Users/btp712/English/Anki/data/audio/A...,someone who pretends to be someone else while ...,Diễn viên nam,"- ""Who's your favourite actor?"" ""Robert de Nir..."
5,Director,noun,D_______,/daɪˈrek.tər/,[sound:/Users/btp712/English/Anki/data/audio/D...,"a manager of an organization, company, college...",- Đạo diễn\r\n- Giám đốc\r\n- Người chỉ huy,- the board of directors\n\n- She has become t...
6,Agent,noun,A____,/ˈeɪ.dʒənt/,[sound:/Users/btp712/English/Anki/data/audio/A...,a person who acts for or represents another,,- Please contact our agent in Spain for furthe...
7,Musician,noun,M_______,/mjuːˈzɪʃ.ən/,[sound:/Users/btp712/English/Anki/data/audio/M...,"someone who is skilled in playing music, usual...",Nhạc sĩ,- The concert features dancers and musicians o...
8,Writer,noun,W_____,/ˈraɪ.tər/,[sound:/Users/btp712/English/Anki/data/audio/W...,a person who writes books or articles to be pu...,Tác giả,- a travel/sports/fiction/crime writer\n\n- Sh...
9,Program,verb,P______,/ˈprəʊ.ɡræm/,[sound:/Users/btp712/English/Anki/data/audio/P...,a series of instructions that can be put into ...,Lập trình,- a computer program\n\n- She's written a prog...


## Re - crawl

In this section, we will recrawl words that can not be crawled in previous section

In [16]:
update_values = re_crawl(vocabs_df, 1, r'https://dictionary.cambridge.org/vi/dictionary/english/stepfamily?q=step+family')
update_params = ['Phonetic', 'Audio', 'English meaning', 'Example']

for i in range(4):
    vocabs_df.at[1, update_params[i]] = update_values[i]

AttributeError: 'Series' object has no attribute 'vocabulary'

In [None]:
vocabs_df

Unnamed: 0,vocabulary,Cloze,Phonetic,Audio,English meaning,Vietnamese meaning,Example
0,extended family,e_______ ______,/ɪkˌsten.dɪd ˈfæm.əl.i/,[sound:/Users/btp712/English/Anki/data/audio/e...,"a family unit that includes grandmothers, gran...",,
1,step-family,s___-______,/ˈstepˌfæm.əl.i/,[sound:/Users/btp712/English/Anki/data/audio/s...,a family that is formed by two people and the ...,,- With one in three marriages ending in divorc...
2,nuclear family,n______ ______,/ˌnjuː.klɪə ˈfæm.əl.i/,[sound:/Users/btp712/English/Anki/data/audio/n...,a family consisting of two parents and their c...,,
3,one-parent family,o__-______ ______,/ˌwʌn.peə.rənt ˈfæm.əl.i/,[sound:/Users/btp712/English/Anki/data/audio/o...,a family that includes either a mother or a fa...,,
4,inherit,i______,/ɪnˈher.ɪt/,[sound:/Users/btp712/English/Anki/data/audio/i...,"to receive money, a house, etc. from someone a...",,- Who will inherit the house when he dies?\n\n...
...,...,...,...,...,...,...,...
71,proportion,p_________,/prəˈpɔː.ʃən/,[sound:/Users/btp712/English/Anki/data/audio/p...,the number or amount of a group or part of som...,,- Children make up a large proportion of the w...
72,disproportion,d____________,/ˌdɪs.prəˈpɔː.ʃən/,[sound:/Users/btp712/English/Anki/data/audio/d...,the fact of being too large or too small in re...,,- The disproportion between the number of vote...
73,disproportionate,d_______________,/ˌdɪs.prəˈpɔː.ʃən.ət/,[sound:/Users/btp712/English/Anki/data/audio/d...,too large or too small in comparison to someth...,,- There are a disproportionate number of girls...
74,burden,b_____,/ˈbɜː.dən/,[sound:/Users/btp712/English/Anki/data/audio/b...,a heavy load that you carry,,- The little donkey struggled under its heavy ...


## Export to result file

In [None]:
exported = int(input('Do you want to export to file: '))

assert exported == 1

AssertionError: 

In [17]:
exported_filepath = input()
vocabs_df.to_csv(exported_filepath + '.csv', sep=',', header=False, index=False)