In [40]:
import pickle
import pandas as pd
import numpy as np
import os
import json
import random

In [59]:
# Load the word records from 'all_words_with_freq_and_meaning.pkl' and tabulate them.
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
# NOTE(security): pickle.load can execute arbitrary code; only load pickles
# produced by this notebook.
with open('all_words_with_freq_and_meaning.pkl', 'rb') as pkl_file:
    refined_words = pickle.load(pkl_file)
word_frame = pd.DataFrame(refined_words)

In [66]:
# Raise the row-display cap so the 800-row preview below is shown in full.
pd.set_option('display.max_rows', 10000)
# Show the first 800 rows whose `freq` value is below 100.
word_frame.loc[
    word_frame['freq'] < 100
].head(800)

Unnamed: 0,freq,word,meaning
26,64.82,absolute,perfect or complete or pure
48,24.93,accord,concurrence of opinion
77,43.1,address,the place where a person or organization can b...
264,56.65,apparent,clearly revealed to the mind or the senses or ...
422,29.83,bank,financial institution that accepts deposits an...
433,58.69,base,lowest support of a structure
468,72.06,benefit,something that aids or promotes well-being
494,66.21,bid,propose a payment
505,67.86,bitter,causing a sharp and acrid taste experience
548,88.04,bore,"make a hole, especially with a pointed power o..."


In [57]:
# Bucket the words into difficulty levels by `freq` octile and build a
# {difficulty (as str): [{'word': ..., 'meaning': ...}, ...]} mapping.

# Octile code (0-7, from pd.qcut with 8 bins) -> difficulty level.
# Every code is listed explicitly: the earlier mapping omitted code 1 and
# only worked because `replace` passes unmapped values through unchanged.
code_to_difficulty = {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 2, 6: 2, 7: 3}

pickle_storage = {}
code_series = pd.qcut(word_frame.freq, 8).cat.codes
# Words are stored capitalized (first letter upper-case, rest lower-case).
word_frame.word = word_frame.word.str.capitalize()
word_frame['code'] = code_series
word_frame['difficulty'] = word_frame.code.replace(code_to_difficulty)
for difficulty, df in word_frame.groupby('difficulty'):
    # Keys are strings so the mapping has uniform string keys once pickled.
    pickle_storage[str(difficulty)] = df[['word', 'meaning']].to_dict(orient='records')
pickle_storage

{'1': [{'word': 'Abase', 'meaning': 'cause to feel shame'},
  {'word': 'Abash', 'meaning': 'cause to be embarrassed'},
  {'word': 'Abate', 'meaning': 'become less in amount or intensity'},
  {'word': 'Abbreviate', 'meaning': 'shorten'},
  {'word': 'Aberration',
   'meaning': 'a state or condition markedly different from the norm'},
  {'word': 'Abet',
   'meaning': 'assist or encourage, usually in some wrongdoing'},
  {'word': 'Abhor', 'meaning': 'find repugnant'},
  {'word': 'Abide', 'meaning': 'dwell'},
  {'word': 'Abject', 'meaning': 'of the most contemptible kind'},
  {'word': 'Abjure',
   'meaning': 'formally reject or disavow a formerly held belief'},
  {'word': 'Abode',
   'meaning': 'any address at which you dwell more than temporarily'},
  {'word': 'Abolish', 'meaning': 'do away with'},
  {'word': 'Abominable', 'meaning': 'unequivocally detestable'},
  {'word': 'Abominate', 'meaning': 'find repugnant'},
  {'word': 'Aboriginal', 'meaning': 'having existed from the beginning'},
 

In [58]:
# Persist the difficulty -> word-records mapping. The context manager
# guarantees the file is flushed and closed once the dump finishes.
with open('difficulty_to_words_map.pkl', 'wb') as pkl_file:
    pickle.dump(pickle_storage, pkl_file)

In [2]:
# Resume from the previously saved word list if one exists; otherwise start
# from an empty list. The context manager closes the file handle promptly.
# NOTE(security): pickle.load can execute arbitrary code; only load pickles
# produced by this notebook.
try:
    with open('really_all_words.pkl', 'rb') as pkl_file:
        all_words = pickle.load(pkl_file)
except FileNotFoundError:
    all_words = []
len(all_words)

3626

In [3]:
# Directory holding the raw word-list files parsed by the cells below.
base_dir = "./raw_words"
# Names of the entries in that directory; shown as the cell output.
file_list = os.listdir(path=base_dir)
file_list

['List01.txt',
 'word_list_magoosh.txt',
 'word_list_pylike.txt',
 'List05.txt',
 'List09.txt',
 'List08.txt',
 'List04.txt',
 'List07.txt',
 'List02.txt',
 'word_list_essential.txt',
 'words_with_meanings.json',
 'List06.txt',
 'List03.txt',
 'word_freq.json',
 'word_list.txt']

In [4]:
# List0X.txt: every file whose name contains 'List'
list_type_files = [name for name in file_list if 'List' in name]
for file in list_type_files:
    with open(os.path.join(base_dir, file)) as f:
        output = []
        for raw_line in f.read().splitlines():
            # Skip blank lines and lines that begin with '/'.
            if raw_line == '' or raw_line.startswith('/'):
                continue
            output.append(raw_line)
    # Lines containing a period are dropped; the rest are trimmed and lower-cased.
    words = [kept.strip().lower() for kept in output if '.' not in kept]
    all_words.extend(words)

In [5]:
# word_list_magoosh.txt: every line is taken as a word (trimmed, lower-cased)
with open(os.path.join(base_dir, 'word_list_magoosh.txt')) as f:
    output = f.read().splitlines()
    magoosh_words = []
    for raw_line in output:
        magoosh_words.append(raw_line.strip().lower())
all_words.extend(magoosh_words)

In [6]:
# word_list.txt
with open(os.path.join(base_dir, 'word_list.txt')) as f:
    my_new_word_list = []
    for raw_line in f.read().splitlines():
        # Keep only lines made of exactly one whitespace-delimited token.
        if len(raw_line.strip().split()) != 1:
            continue
        # Reject lines containing any of: ( ) - * .
        if any(symbol in raw_line for symbol in '()-*.'):
            continue
        # Reject lines for which str.isupper() is true (all cased characters upper-case).
        if raw_line.isupper():
            continue
        my_new_word_list.append(raw_line.strip().lower())
all_words.extend(my_new_word_list)

In [7]:
# words_with_meanings.json: each top-level key maps to a list of records
# that carry a 'word' field; collect those words in lower case.
with open(os.path.join(base_dir, 'words_with_meanings.json')) as f:
    output = json.load(f)
for key, entries in output.items():
    lowered = [entry['word'].lower() for entry in entries]
    all_words.extend(lowered)
    

In [8]:
# word_freq.json: the keys of the top-level object are the candidate words
with open(os.path.join(base_dir, 'word_freq.json')) as f:
    output = json.load(f)
# Keep only keys consisting of a single whitespace-delimited token.
fresher_word_list = []
for term in output.keys():
    if len(term.split()) == 1:
        fresher_word_list.append(term)
all_words.extend(fresher_word_list)

In [9]:
# word_list_essential.txt
with open(os.path.join(base_dir, 'word_list_essential.txt')) as f:
    # A line is kept when it is a single token, contains none of ( ) - * .
    # and str.isupper() is false for it; kept lines are trimmed and lower-cased.
    my_new_word_list_essential = [
        raw_line.strip().lower()
        for raw_line in f.read().splitlines()
        if len(raw_line.strip().split()) == 1
        and all(symbol not in raw_line for symbol in ('(', ')', '-', '*', '.'))
        and not raw_line.isupper()
    ]
all_words.extend(my_new_word_list_essential)

In [10]:
# word_list_pylike.txt: lines containing "Word(" hold the word as the first
# single-quoted token.
with open(os.path.join(base_dir, 'word_list_pylike.txt')) as f:
    pylike_word_list = []
    for raw_line in f.read().splitlines():
        quote_parts = raw_line.strip().split("'")
        # Need at least one quote on the line so that index 1 exists.
        if 'Word(' in raw_line and len(quote_parts) > 1:
            pylike_word_list.append(quote_parts[1])
all_words.extend([token.strip().lower() for token in pylike_word_list])

In [11]:
# Scrape the vocabulary.com word list and add its words to `all_words`.
# TODO: this page also carries frequency data (extracted from `html_list`
# further down in the notebook).

import requests
# Only bs4 is imported: the code below relies on the bs4 API (`find_all`,
# explicit parser argument), which the legacy BeautifulSoup 3 package lacks.
from bs4 import BeautifulSoup

# A timeout prevents the cell from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page as data.
response = requests.get('https://www.vocabulary.com/lists/128536', timeout=30)
response.raise_for_status()
request_text = response.text
html = request_text
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed (and to avoid bs4's warning).
parsed_html = BeautifulSoup(html, 'html.parser')
html_list = list(parsed_html.body.find_all('li', attrs={'class': 'entry learnable'}))

# The word is taken as the 10th double-quote-delimited field of each <li>'s
# markup. NOTE(review): this is positional and breaks if the page's
# attribute layout changes.
large_af_word_list = [str(x).split('"')[9] for x in html_list]
all_words.extend(large_af_word_list)

In [12]:
# De-duplicate (case-insensitively, by lower-casing) and persist the word
# list, then reload it so `all_words` reflects exactly what is on disk.
# Each handle is closed by its context manager, so the write is fully
# flushed before the same file is read back.
with open('really_all_words.pkl', 'wb') as pkl_file:
    pickle.dump(list(set([word.lower() for word in all_words])), pkl_file)
with open('really_all_words.pkl', 'rb') as pkl_file:
    all_words = pickle.load(pkl_file)
len(all_words)

6024

In [29]:
# Build one {'freq', 'word', 'meaning'} record per scraped <li> entry,
# skipping (and reporting) words that were already seen, then pickle the
# records.
# NOTE(review): fields are picked by position in the double-quote-split
# markup (3 = freq, 9 = word, 16 = element containing the meaning text);
# this depends on the page's current attribute layout.
split_list = [str(x).split("\"") for x in html_list]
data = []
set_of_words = set()
for item in split_list:
    datapoint = {
        'freq': float(item[3]),
        'word': item[9].strip().lower(),
        # Text between the first '>' and the following '<' of field 16.
        'meaning': item[16].split('>')[1].split('<')[0],
    }
    if datapoint['word'] in set_of_words:
        # Same report format as before: the trailing flag is always True.
        print('Duplicate', datapoint['word'], True)
        continue
    set_of_words.add(datapoint['word'])
    data.append(datapoint)
# The context manager makes sure the pickle is flushed and the handle closed.
with open('all_words_with_freq_and_meaning.pkl', 'wb') as pkl_file:
    pickle.dump(data, pkl_file)

Duplicate appurtenance True
Duplicate bloat True
Duplicate cavil True
Duplicate loquacious True
Duplicate purport True
Duplicate qualm True
Duplicate vie True


In [None]:
# Commented-out leftover code (incomplete fragment; nothing below is executed)
#     word_dict = {}
#     value = []

#         word_dict[line] = tuple(value)
#         value = []
#         continue
#     tokens = line.split('.')
#     value.extend([x.strip() for x in tokens])