Import files

In [1]:
import collections
import json
import re
import string
import nltk
import http.client
import aiohttp
import asyncio, xmlrpc.client
path = "in/freakonomics.txt"
out_path = "freq_table.json"
definitions_path = "defs.json"
sentences_path = "sentences.txt"
datayze_url = "https://datayze.com"
extra_defs = "extra_defs.txt"

wordCount = 10
thresholdFreq = 5
word_definitions = {}
tasks = []
wordsToLookup = set()
wordsLeftToLookup = set()
extra_words = []
server = xmlrpc.client.ServerProxy(
    "http://localhost:{}".format(8000))

Preprocess

In [2]:

f = open(path, "r")
content = f.read()
sentences = nltk.sent_tokenize(content.replace("\n", " "))

res = [x.lower().encode("ascii", "ignore").decode("ascii") for x in re.sub('['+string.punctuation+']', '', content).split() if x.strip() != '']
freq = dict(collections.Counter(res))


Word lookup

In [3]:

def getWord(k, v):
    global wordCount
    if v > thresholdFreq: return
    word_rank = server.predict(k)
    print(f"Looking {k}: #{word_rank}")
    if word_rank < 0.4: return
    wordsToLookup.add(k)
async def getWordDefinition(session : aiohttp.ClientSession, k):
    async with session.get(f"https://api.dictionaryapi.dev/api/v2/entries/en/{k}") as response:
        print("Status: {} and reason: {}".format(response.status, response.reason))
        res = await response.json()
        if (response.status == 404): return
        res_def = res[0]
        word_definitions[k] = res_def
        wordsLeftToLookup.remove(k)

def generateLookupWords():
    for k, v in freq.items():
        getWord(k, v)

def resetLookup():
    global wordsLeftToLookup
    wordsLeftToLookup = set(wordsToLookup)
async def getFromWeb():
    try:
        connector = aiohttp.TCPConnector(limit=3, limit_per_host=2)
        mysession = aiohttp.ClientSession(connector=connector)
        async with mysession as session:
            for k in wordsLeftToLookup:
                tasks.append(asyncio.ensure_future(getWordDefinition(session, k)))
            res = await asyncio.gather(*tasks)
    except Exception as e:
        print(e)
    # Save it
    dictf = open(definitions_path, "w")
    dictf.write(json.dumps(word_definitions))
    dictf.close()

In [4]:
generateLookupWords()

Looking grave: #0.029892692927184404
Looking doubts: #0.029892692927184404
Looking itand: #0.44736842110294095
Looking envisioned: #0.04552793030061346
Looking expanded: #0.029892692927184404
Looking grateful: #0.029892692927184404
Looking breathing: #0.029892692927184404
Looking changing: #0.008615845672597844
Looking whereas: #0.029892692927184404
Looking manuscript: #0.029892692927184404
Looking finished: #0.029892692927184404
Looking sits: #0.008615845672597844
Looking publisher: #0.029892692927184404
Looking debut: #0.029892692927184404
Looking pose: #0.029892692927184404
Looking punic: #0.44736842110294095
Looking explores: #0.04552793030061346
Looking sorts: #0.029892692927184404
Looking realworld: #0.23355263168749998
Looking minor: #0.029892692927184404
Looking updates: #0.04552793030061346
Looking mistakes: #0.029892692927184404
Looking mistake: #0.008615845672597844
Looking appreciate: #0.008615845672597844
Looking input: #0.029892692927184404
Looking aggressively: #0.045527

In [5]:
def addWordsRoutine():
    global extra_words
    extra_words = open(extra_defs, "r").read().split(",")
    map(wordsLeftToLookup.add, extra_words)
    word=input("Add a word (q to quit): ")
    while word != "q":
        wordsToLookup.add(word)
        wordsLeftToLookup.add(word)
        extra_words.append(word)
        word=input("Add a word (q to quit): ")
    with open(extra_defs, "w") as w:
        w.write(",".join(extra_words))
addWordsRoutine()

In [6]:
resetLookup()

In [7]:
# import openai
# import random
# SECRET_KEY = open("secret/openai_key.txt", "r").read()
# openai.api_key = SECRET_KEY
# wordsPrompt = " ".join(random.sample(wordsLeftToLookup, 50))
# completion = openai.ChatCompletion.create(
#   model="gpt-3.5-turbo",
#   messages=[
#     {"role": "system", "content": "You are an english advanced words extractor. \
#      You should take in a list of words and remove any simple words (like 'this', 'rust', etc.), leaving only the advanced ones (e.g. 'arpeggio', 'voracity', etc.). \
#      You should convert all words into singular present simple tense but maintaining the original word \
#      and remove any duplicate words. Do not convert words into their meanings. \
#      You should then output the processed words separated by space. \
#      You should not elaborate on anything and only output words separated by space \
#      without any punctuations like comma or period."},
#     {"role": "user", "content": wordsPrompt}
#   ]
# )
# print(wordsPrompt)
# print(completion.choices[0].message.content)


In [8]:
await getFromWeb()

Status: 200 and reason: OK
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 200 and reason: OK
Status: 200 and reason: OK
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 200 and reason: OK
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 200 and reason: OK
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 200 and reason: OK
Status: 404 and reason: Not Found
Status: 404 and reason: Not Found
Status: 200 and reason: OK
Status: 200 and reason: OK
St

Load cached file

In [9]:
dictf = open(definitions_path, "r")
word_definitions = json.loads(dictf.read())
dictf.close()

In [10]:
def trunc(s: str, target: str, l: int):
    if len(s) <= l: return s
    i = s.find(target)
    if i == -1: return s
    left = s.rfind(' ', 0, i - l) if i - l >= 0 else 0
    right = s.find(' ', i + l) if i + l <= len(s) else len(s)
    # print(i, left, right)
    if left == -1: left = 0
    if right == -1: right = len(s)
    return f"{'... ' if left != 0 else ''}{s[left:right]}{' ...' if right != len(s) else ''}"
print(trunc("Hello world! This is a target word test.", "is", 10))
def example_sentence(s: str, k: str):
    return trunc(s, k, 50).replace(k, f'*{k}*')
def get_page(k: str):
    containedSentence = [(i, s) for i, s in enumerate(sentences) if k in s.lower()]
    if len(containedSentence) > 0:
        return int(containedSentence[0][0] / 10) + 1
    return -1

Hello world! This is a target ...


Generate

In [11]:
sortedDefs = sorted(word_definitions.items(), key=lambda x: get_page(x[0]))
for k, v in sortedDefs:
    containedSentence = [s for s in sentences if k in s.lower()]
    phonetic = f" *{v['phonetic']}*" if "phonetic" in v.keys() else ""
    page_num = get_page(k)
    page = f" Page {page_num}" if page_num != -1 else ""
    print(f"**{k}**{phonetic}{page}:  ")
    if len(containedSentence) > 0:
        print(f'* "{example_sentence(containedSentence[0], k)}"  ')
    for meaning in v['meanings']:
        maxMeanings = 3
        form = meaning["partOfSpeech"]
        print(f"* {form}:  ")
        for definition in meaning["definitions"]:
            print(f"  * {definition['definition']}  ")
            if "example" in definition.keys():
                print(f"    E.g. \"{example_sentence(definition['example'], k)}\"  ")
            maxMeanings -= 1
            if maxMeanings <= 0: break
    print()


**lifesize**:  
* adjective:  
  * Of any representation, especially pictures, statues: The same size as the real thing.  
    E.g. "Many consider the life-size painting of King Peter to be one of the artist's best works."  

**semiskilled**:  
* adjective:  
  * Requiring only minimal levels of training.  
    E.g. "Semiskilled workers are cheaper than skilled workers but more productive than unskilled drudges."  

**setpoint** */ˈsɛtpɔɪnt/*:  
* noun:  
  * The command signal or value which is fed into a controller to establish the target or desired position or state of the controlled device or process.  

**tipoff**:  
* noun:  
  * An obvious clue or indication.  
    E.g. "The broken window and overturned plant pots were a tip-off that something was wrong."  
  * A report of suspicious behaviour, especially to an authority.  
    E.g. "The police received a tip-off about a recent bank robbery."  

**knowhow**:  
* noun:  
  * The knowledge and skill to be able to (do something cor