In [1]:
import concurrent.futures
import re
import sqlite3
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Get Vocab List

Our list was kindly uploaded to AnkiWeb at https://ankiweb.net/shared/info/1286483637. The only thing is I don't like the definitions on some of them, so I'm going to exchange them with ones from Oxford Learner's dictionary and then reupload. I'm choosing Oxford because Google does, and I'm using the Learner's dictionary specifically for ESL learners, as I'm pretty sure that is the group this deck originates from. Also, it's free (that's a big reason -- Oxford is *expensive*.

In [2]:
# Load SQLite DB
conn = sqlite3.connect('vocab deck files/collection.anki2')
curs = conn.cursor()
curs.execute('SELECT flds FROM notes')

words = curs.fetchall()
conn.close()

In [3]:
# Show what it looks like now
print(words[:5])

[('abbreviate [sound:abbreviate.mp3]\x1f\x1f\x1f\x1fBecause we were running out of time, the lecturer had to abbreviate her speech.\x1fshorten',), ('abate [sound:abate.mp3]\x1f\x1f\x1f\x1fRather than leaving immediately, they waited for the storm to abate.\x1fsubside or moderate',), ('abortive [sound:abortive.mp3]\x1f\x1f\x1f\x1fWe had to abandon our abortive attempts.\x1funsuccessful; fruitless',), ('absolute [sound:absolute.mp3]\x1f\x1f\x1f\x1fAlthough the King of Siam was an absolute monarch, he did not want to behead his unfaithful wife without absolute evidence of her infidelity.\x1fcomplete; totally unlimited; certain',), ('abstract\x1f/ˈæbstrækt/\x1f<span style="font-family: Georgia;font-size: 12px">&nbsp;[sound:abstract.mp3]</span>\x1fadj\x1fTo him, hunger was an abstract concept; he had never missed a meal.<div><br /></div><div>The research shows that pre-school children are capable of thinking in abstract terms.&nbsp;</div>\x1ftheoretical; not concrete; non-representational',

In [4]:
# Breakup strings for loading into dataframe
words = [tupl[0] for tupl in words] # break tuples
words = [word.split('\x1f') for word in words]

In [5]:
# Show the current state of the data and confirm that the data is uniform
print(words[4])
for word in words:
    assert(len(word) == 6)

['abstract', '/ˈæbstrækt/', '<span style="font-family: Georgia;font-size: 12px">&nbsp;[sound:abstract.mp3]</span>', 'adj', 'To him, hunger was an abstract concept; he had never missed a meal.<div><br /></div><div>The research shows that pre-school children are capable of thinking in abstract terms.&nbsp;</div>', 'theoretical; not concrete; non-representational']


In [6]:
# Transfer to a DataFrame
columns = ['word', 'pronunciation', 'sound', 'part of speech', 'sentence', 'definition']
vocab = pd.DataFrame(words, columns=columns)

In [7]:
# Show the DataFrame
display(vocab.head())

Unnamed: 0,word,pronunciation,sound,part of speech,sentence,definition
0,abbreviate [sound:abbreviate.mp3],,,,"Because we were running out of time, the lectu...",shorten
1,abate [sound:abate.mp3],,,,"Rather than leaving immediately, they waited f...",subside or moderate
2,abortive [sound:abortive.mp3],,,,We had to abandon our abortive attempts.,unsuccessful; fruitless
3,absolute [sound:absolute.mp3],,,,Although the King of Siam was an absolute mona...,complete; totally unlimited; certain
4,abstract,/ˈæbstrækt/,"<span style=""font-family: Georgia;font-size: 1...",adj,"To him, hunger was an abstract concept; he had...",theoretical; not concrete; non-representational


# Cleanup the Vocab list

The words have some schmutz in them that we're going to get rid of before continuing to Oxford. We're also going to drop duplicates and just pull all definitons for all parts of speech from Oxford. Is this misguided? I mean . . . yeah, probably. Later on I'm going to limit our requests to one-per-second so this whole thing is going to take over an hour to run anyway.

In [8]:
# Get rid of sound tags
re_sound = r'\[sound:.*.mp3\]'
vocab.loc[:, 'word'] = vocab['word'].str.replace(re_sound, '', regex=True)

# Get rid of '&nbsp' after word
re_stuff_after = r'\s*&nbsp.*'
vocab.loc[:, 'word'] = vocab['word'].str.replace(re_stuff_after, '', regex=True)

# Show the words that are duplicate within the df
display(vocab[vocab.duplicated(subset='word', keep=False)])
# Make the df unique
vocab = vocab.drop_duplicates(subset='word')

# Get the words as a list (again?)
words = vocab['word'].tolist()

Unnamed: 0,word,pronunciation,sound,part of speech,sentence,definition
780,countenance,/kaʊntənəns/,[sound:countenance.mp3],verb,He refused to countenance such rude behavior o...,"approve; tolerate, consent to"
781,countenance,/ˈkaʊntənəns/,[sound:countenance.mp3],noun,"Whe Jose saw his newborn daughter, a proud smi...",face
1292,exploit,&nbsp;/ɪkˈsplɔɪt/,"[sound:exploit verb - Definition, pictures, pr...",noun,Raoul Wallenberg was noted for his exploits in...,"deed or action, particularly a brave deed"
1293,exploit,/ɪkˈsplɔɪt/,"<span style=""font-family: Georgia;font-size: 1...",verb,Caesar Chavez fought attempts to exploit migra...,"make use of, sometimes unjustly"
1359,fester,&nbsp;/ˈfestər/,"[sound:fester verb - Definition, pictures, pro...",verb,"When her finger began to fester, the doctor la...",generate pus
1360,fester,/ˈfestər/,"<span style=""font-family: Georgia;font-size: 1...",verb - feeling,"Joe's insult festered in Anne's mind for days,...","rankle, produce irritation or resentment"
1394,fleece,/fliːs/,"[sound:fleece noun - Definition, pictures, pro...",noun,"They shear sheep of their fleece, which they t...",wool coat of a sheep
1395,fleece,&nbsp;/fliːs/,"[sound:fleece noun - Definition, pictures, pro...",verb,The tricksters fleeced him of his inheritance....,rob; plunder;&nbsp;to take a lot of money from...
1414,foil,&nbsp;/fɔɪl/,"[sound:foil noun - Definition, pictures, pronu...",noun,"In ""Star Wars,"" dark, evil Darth Vader is a pe...",contrast
1415,foil,&nbsp;/fɔɪl/,"[sound:foil verb - Definition, pictures, pronu...",verb,"In the end, Skywalker is able to foil Vader's ...","defeat; frustrate;<font color=""#0000ff""><b> th..."


# "Go to Oxford," . . .
they said. "It'll be fun," they said.

I did check the robots.txt file before getting started here. They don't disallow what I'm doing so I'm going to spinup some threads and hopefully this won't actually take an hour.

In [9]:
# Common variables
base_url = 'https://www.oxfordlearnersdictionaries.com/us/definition/english/{word}'
headers = {'Content-Type': 'text',
           'User-Agent': 'ozilla/5.0 (X11; Linux x86_64; rv:103.0) Gecko/20100101 Firefox/103.0'}

### Scrape functions for compicated retrievals

In [10]:
def get_pronunciation(word, phon_soup):
    pron = {}
    
    # Get phonics of both
    phon_br = phon_soup.find('div', class_='phons_br')
    pron['pronunciation - british'] = phon_br.find('span', class_='phon').text
    
    phon_na = phon_soup.find('div', class_='phons_n_am')
    pron['pronunciation - north america'] = phon_na.find('span', class_='phon').text
    
    # Get the pronunciation .mp3 files
    regions = {'uk': phon_br, 'us': phon_na}
    for reg in regions:
        url = regions[reg].find('div', class_=f'sound audio_play_button pron-{reg} icon-audio')
        url = url.attrs['data-src-mp3']
        
        mp3_file = f'sounds/{word}_{reg}.mp3'
        if Path(mp3_file).is_file():
            continue
        
        with open(mp3_file, 'wb') as mp3:
            r = requests.get(url, headers=headers)
            if r.status_code != 200:
                msg = f"Pronunciation request returned non-200 status code ({r.status_code}) on word: {word}"
                raise Exception(msg)
            mp3.write(r.content)
    
    return pron

In [11]:
def get_definitions(word, define_soup):
    single_soup = define_soup.find('ol', class_='sense_single')
    multi_soup = define_soup.find('ol', class_='senses_multiple')
    definition = None
    # Single Definition
    if single_soup:
        definition = single_soup.find('span', class_='def').text
    elif multi_soup:
        definition = 
        
    

### Main scrape functions

In [12]:
def get_all_pos(word):
    word_dict = {}
    
    url = base_url.format(word=word)
    r = requests.get(url, headers=headers)
    
    if r.status_code != 200:
        msg = f"Definition request returned non-200 status code ({r.status_code}) on word: {word}"
        raise Exception(msg)
        
    soup = BeautifulSoup(r.content, 'html.parser')
    
    # Enumerate all parts-of-speech
    all_results = [url]
    related_soup = soup.find('div', id_='relatedentries')
    if related_soup:
        
    
        
        

def scrape(word_soup):
    oxford_word = {}
    oxford_word['word'] = word
    
    # Part of Speech
    oxford_word['part of speech'] = word_soup.find('span', class_='pos').text
    
    # Pronunciation
    phon = word_soup.find('span', class_='phonetics')
    pron = get_pronunciation(word, phon)
    oxford_word = {**oxford_word, **pron}
    
    # Definitions and Phrases
    define = soup.find('span', class_='def')
    oxford_word['definition'] = get_definitions(word, define)
    
    oxford_words.append(oxford_word)
    
    
    return oxford_words

In [16]:
print(scrape('glass_2'))

{'word': 'glass_2', 'part of speech': 'verb', 'pronunciation - british': '/ɡlɑːs/', 'pronunciation - north america': '/ɡlæs/', 'definitions': None}
