In [1]:
import re
import sqlite3

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Get Vocab List

In [2]:
# Load the raw note rows from the collection.anki2 SQLite database.
conn = sqlite3.connect('vocab/collection.anki2')
try:
    curs = conn.cursor()
    curs.execute('SELECT flds FROM notes')

    # Each fetched row is a 1-tuple holding the `flds` column of one note.
    words = curs.fetchall()
finally:
    # Always release the database handle, even if the query above fails.
    conn.close()

In [3]:
# Show what the raw rows look like now: each row is a 1-tuple holding a single
# string in which the note's fields are joined by '\x1f'.
print(words[:5])

[('abbreviate [sound:abbreviate.mp3]\x1f\x1f\x1f\x1fBecause we were running out of time, the lecturer had to abbreviate her speech.\x1fshorten',), ('abate [sound:abate.mp3]\x1f\x1f\x1f\x1fRather than leaving immediately, they waited for the storm to abate.\x1fsubside or moderate',), ('abortive [sound:abortive.mp3]\x1f\x1f\x1f\x1fWe had to abandon our abortive attempts.\x1funsuccessful; fruitless',), ('absolute [sound:absolute.mp3]\x1f\x1f\x1f\x1fAlthough the King of Siam was an absolute monarch, he did not want to behead his unfaithful wife without absolute evidence of her infidelity.\x1fcomplete; totally unlimited; certain',), ('abstract\x1f/ˈæbstrækt/\x1f<span style="font-family: Georgia;font-size: 12px">&nbsp;[sound:abstract.mp3]</span>\x1fadj\x1fTo him, hunger was an abstract concept; he had never missed a meal.<div><br /></div><div>The research shows that pre-school children are capable of thinking in abstract terms.&nbsp;</div>\x1ftheoretical; not concrete; non-representational',

In [4]:
# Break up the raw rows so they can be loaded into a DataFrame.
# Each fetched row is a 1-tuple whose only element is the note's field string,
# with the individual fields separated by '\x1f'. Rows that are no longer
# tuples (i.e. this cell has already run) are left untouched, so re-running
# the cell does not corrupt `words`.
words = [
    row[0].split('\x1f') if isinstance(row, tuple) else row
    for row in words
]

In [5]:
# Show the current state of the data and confirm that the data is uniform
print(words[4])

# Every note must split into exactly 6 fields. Collect the offending row
# indices so a failure says which notes are malformed instead of raising a
# bare AssertionError with no context.
bad_rows = [idx for idx, fields in enumerate(words) if len(fields) != 6]
assert not bad_rows, (
    f"{len(bad_rows)} notes do not have exactly 6 fields; "
    f"first offending indices: {bad_rows[:10]}"
)

['abstract', '/ˈæbstrækt/', '<span style="font-family: Georgia;font-size: 12px">&nbsp;[sound:abstract.mp3]</span>', 'adj', 'To him, hunger was an abstract concept; he had never missed a meal.<div><br /></div><div>The research shows that pre-school children are capable of thinking in abstract terms.&nbsp;</div>', 'theoretical; not concrete; non-representational']


In [6]:
# Build the vocabulary DataFrame: one row per note, one column per field.
columns = [
    'word',
    'pronunciation',
    'sound',
    'part of speech',
    'sentence',
    'definition',
]
vocab = pd.DataFrame(data=words, columns=columns)

In [7]:
# Preview the first five rows of the freshly built DataFrame.
display(vocab.head(n=5))

Unnamed: 0,word,pronunciation,sound,part of speech,sentence,definition
0,abbreviate [sound:abbreviate.mp3],,,,"Because we were running out of time, the lectu...",shorten
1,abate [sound:abate.mp3],,,,"Rather than leaving immediately, they waited f...",subside or moderate
2,abortive [sound:abortive.mp3],,,,We had to abandon our abortive attempts.,unsuccessful; fruitless
3,absolute [sound:absolute.mp3],,,,Although the King of Siam was an absolute mona...,complete; totally unlimited; certain
4,abstract,/ˈæbstrækt/,"<span style=""font-family: Georgia;font-size: 1...",adj,"To him, hunger was an abstract concept; he had...",theoretical; not concrete; non-representational


## Clean up the word list

In [14]:
# Pattern for sound tags such as '[sound:abate.mp3]' in the word column.
# The dot before "mp3" is escaped so it only matches a literal '.', and the
# tag body is restricted to non-']' characters so one match can never run
# past the closing bracket of a single tag.
re_sound = r'\[sound:[^\]]*\.mp3\]'

# Pattern for everything from the first '&nbsp' onward, plus any whitespace
# immediately before it.
re_stuff_after = r'\s*&nbsp.*'

vocab.loc[:, 'word'] = (
    vocab['word']
    .str.replace(re_sound, '', regex=True)
    .str.replace(re_stuff_after, '', regex=True)
    # Removing the tag from e.g. 'abate [sound:abate.mp3]' leaves 'abate ';
    # strip so stray whitespace cannot hide duplicates in the check below.
    .str.strip()
)

# Check for uniqueness: keep=False marks every copy of a repeated word, so
# all rows sharing a word are shown together.
display(vocab[vocab.duplicated(subset='word', keep=False)])

Unnamed: 0,word,pronunciation,sound,part of speech,sentence,definition
780,countenance,/kaʊntənəns/,[sound:countenance.mp3],verb,He refused to countenance such rude behavior o...,"approve; tolerate, consent to"
781,countenance,/ˈkaʊntənəns/,[sound:countenance.mp3],noun,"Whe Jose saw his newborn daughter, a proud smi...",face
1292,exploit,&nbsp;/ɪkˈsplɔɪt/,"[sound:exploit verb - Definition, pictures, pr...",noun,Raoul Wallenberg was noted for his exploits in...,"deed or action, particularly a brave deed"
1293,exploit,/ɪkˈsplɔɪt/,"<span style=""font-family: Georgia;font-size: 1...",verb,Caesar Chavez fought attempts to exploit migra...,"make use of, sometimes unjustly"
1359,fester,&nbsp;/ˈfestər/,"[sound:fester verb - Definition, pictures, pro...",verb,"When her finger began to fester, the doctor la...",generate pus
1360,fester,/ˈfestər/,"<span style=""font-family: Georgia;font-size: 1...",verb - feeling,"Joe's insult festered in Anne's mind for days,...","rankle, produce irritation or resentment"
1394,fleece,/fliːs/,"[sound:fleece noun - Definition, pictures, pro...",noun,"They shear sheep of their fleece, which they t...",wool coat of a sheep
1395,fleece,&nbsp;/fliːs/,"[sound:fleece noun - Definition, pictures, pro...",verb,The tricksters fleeced him of his inheritance....,rob; plunder;&nbsp;to take a lot of money from...
1414,foil,&nbsp;/fɔɪl/,"[sound:foil noun - Definition, pictures, pronu...",noun,"In ""Star Wars,"" dark, evil Darth Vader is a pe...",contrast
1415,foil,&nbsp;/fɔɪl/,"[sound:foil verb - Definition, pictures, pronu...",verb,"In the end, Skywalker is able to foil Vader's ...","defeat; frustrate;<font color=""#0000ff""><b> th..."


In [9]:
# Show the DataFrame after the word-column cleanup (the rendered output
# elides the middle rows).
display(vocab)

Unnamed: 0,word,pronunciation,sound,part of speech,sentence,definition
0,abbreviate,,,,"Because we were running out of time, the lectu...",shorten
1,abate,,,,"Rather than leaving immediately, they waited f...",subside or moderate
2,abortive,,,,We had to abandon our abortive attempts.,unsuccessful; fruitless
3,absolute,,,,Although the King of Siam was an absolute mona...,complete; totally unlimited; certain
4,abstract,/ˈæbstrækt/,"<span style=""font-family: Georgia;font-size: 1...",adj,"To him, hunger was an abstract concept; he had...",theoretical; not concrete; non-representational
...,...,...,...,...,...,...
3753,zany,/ˈzeɪni/,[sound:zany.mp3],adjective,I can watch the Marx brothers' zany antics for...,"crazy; comic,<b><font color=""#0000ff""> wacky</..."
3754,zeal,/ziːl/,"<span style=""font-family: Georgia;font-size: 1...",noun,Wang's zeal was contagious; soon all his fello...,eager enthusiasm
3755,zealot,/ˈzelət/,&nbsp;[sound:zealot.mp3],noun-person,It is good to have a few zealots in our group ...,"<b><font color=""#0000ff"">fanatic</font></b>; p..."
3756,zenith,/ˈzenɪθ/,"<span style=""font-family: Georgia;font-size: 1...",noun,"When the sun was at its zenith, the glare was ...",point directly overhead in the sky; summit
