In [2]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 1. Reformatting data to fit our purposes

## Cleaning NYT data

In [3]:
nyt_df = pd.read_csv("data/nytcrosswords.csv", encoding='ISO-8859-1')

nyt_df

Unnamed: 0,Date,Word,Clue
0,10/31/2021,PAT,"Action done while saying ""Good dog"""
1,10/31/2021,RASCALS,Mischief-makers
2,10/31/2021,PEN,It might click for a writer
3,10/31/2021,SEP,Fall mo.
4,10/31/2021,ECO,Kind to Mother Nature
...,...,...,...
781568,11/21/1993,NAT,Actor Pendleton
781569,11/21/1993,SHRED,Bit
781570,11/21/1993,NEA,Teachers' org.
781571,11/21/1993,BEG,Petition


In [4]:
nyt_df = nyt_df.drop(labels=["Date"], axis=1)

In [5]:
nyt_df

Unnamed: 0,Word,Clue
0,PAT,"Action done while saying ""Good dog"""
1,RASCALS,Mischief-makers
2,PEN,It might click for a writer
3,SEP,Fall mo.
4,ECO,Kind to Mother Nature
...,...,...
781568,NAT,Actor Pendleton
781569,SHRED,Bit
781570,NEA,Teachers' org.
781571,BEG,Petition


## Cleaning Urban Dictionary

In [6]:
urb_df = pd.read_csv("data/urban_dictionary.csv")

urb_df

Unnamed: 0,definition,word,author,tags,up,down,date
0,When a city or town specifically zones an area...,Jizzneyland,whocaresaboutNY,"[u'#jisneyland', u'#gizzneyland', u'#adult zon...",6267,6382,"April 05, 2013"
1,A toilet bowl.,trump basket,jknightx,[],729,634,"March 23, 2017"
2,Excellent health care reserved exclusively for...,wealth care,Davis Finch,"[u'#insurance', u'#rich', u'#america', u'#capi...",1072,625,"May 16, 2008"
3,A dick thing you say to a tall person to piss ...,hows the weather up there,Jackalfu,[],1428,580,"March 20, 2017"
4,"The insane, narcissistic, outrageous asshat wh...",ratfucker,FedupAngryLiberal,"[u'#trump', u'#asshat', u'#scum', u'#asshole',...",1580,2351,"March 18, 2017"
...,...,...,...,...,...,...,...
4267,Verb. Etymology: coming from the modern car tu...,Trick Out,DJ FoxPhyre,[],1553,463,"March 26, 2003"
4268,"(adjective, adverb, interjection) awesome; coi...",jawsome,Diggs,[],1199,1903,"February 17, 2003"
4269,Something socially unacceptable done in a soci...,party foul,Stroll,[],4392,824,"June 28, 2004"
4270,A night when groups of adults get drunk and us...,new year's eve,j-hi,[],3130,1251,"January 07, 2004"


In [7]:
urb_df = urb_df[(urb_df["up"] > 1000) & (any(chr.isdigit() for chr in urb_df["word"]))]

urb_df

Unnamed: 0,definition,word,author,tags,up,down,date
0,When a city or town specifically zones an area...,Jizzneyland,whocaresaboutNY,"[u'#jisneyland', u'#gizzneyland', u'#adult zon...",6267,6382,"April 05, 2013"
2,Excellent health care reserved exclusively for...,wealth care,Davis Finch,"[u'#insurance', u'#rich', u'#america', u'#capi...",1072,625,"May 16, 2008"
3,A dick thing you say to a tall person to piss ...,hows the weather up there,Jackalfu,[],1428,580,"March 20, 2017"
4,"The insane, narcissistic, outrageous asshat wh...",ratfucker,FedupAngryLiberal,"[u'#trump', u'#asshat', u'#scum', u'#asshole',...",1580,2351,"March 18, 2017"
5,From the beginning.,from the giddy up,Chefpatrick,[],1344,685,"March 20, 2017"
...,...,...,...,...,...,...,...
4267,Verb. Etymology: coming from the modern car tu...,Trick Out,DJ FoxPhyre,[],1553,463,"March 26, 2003"
4268,"(adjective, adverb, interjection) awesome; coi...",jawsome,Diggs,[],1199,1903,"February 17, 2003"
4269,Something socially unacceptable done in a soci...,party foul,Stroll,[],4392,824,"June 28, 2004"
4270,A night when groups of adults get drunk and us...,new year's eve,j-hi,[],3130,1251,"January 07, 2004"


In [8]:
urb_df = urb_df.drop(labels=["author", "tags", "up", "down", "date"], axis=1)

urb_df

Unnamed: 0,definition,word
0,When a city or town specifically zones an area...,Jizzneyland
2,Excellent health care reserved exclusively for...,wealth care
3,A dick thing you say to a tall person to piss ...,hows the weather up there
4,"The insane, narcissistic, outrageous asshat wh...",ratfucker
5,From the beginning.,from the giddy up
...,...,...
4267,Verb. Etymology: coming from the modern car tu...,Trick Out
4268,"(adjective, adverb, interjection) awesome; coi...",jawsome
4269,Something socially unacceptable done in a soci...,party foul
4270,A night when groups of adults get drunk and us...,new year's eve


In [9]:
def fix_word(word):
    word = word.upper()
    word = re.sub('[^A-Z]', '', word)
    
    return word

urb_df["word"] = urb_df["word"].apply(fix_word)
urb_df = urb_df[(urb_df["word"].str.len() >= 3)]

urb_df

Unnamed: 0,definition,word
0,When a city or town specifically zones an area...,JIZZNEYLAND
2,Excellent health care reserved exclusively for...,WEALTHCARE
3,A dick thing you say to a tall person to piss ...,HOWSTHEWEATHERUPTHERE
4,"The insane, narcissistic, outrageous asshat wh...",RATFUCKER
5,From the beginning.,FROMTHEGIDDYUP
...,...,...
4267,Verb. Etymology: coming from the modern car tu...,TRICKOUT
4268,"(adjective, adverb, interjection) awesome; coi...",JAWSOME
4269,Something socially unacceptable done in a soci...,PARTYFOUL
4270,A night when groups of adults get drunk and us...,NEWYEARSEVE


In [10]:
urb_df["Word"] = urb_df["word"]
urb_df = urb_df.drop(labels=["word"], axis=1)

urb_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  urb_df["Word"] = urb_df["word"]


Unnamed: 0,definition,Word
0,When a city or town specifically zones an area...,JIZZNEYLAND
2,Excellent health care reserved exclusively for...,WEALTHCARE
3,A dick thing you say to a tall person to piss ...,HOWSTHEWEATHERUPTHERE
4,"The insane, narcissistic, outrageous asshat wh...",RATFUCKER
5,From the beginning.,FROMTHEGIDDYUP
...,...,...
4267,Verb. Etymology: coming from the modern car tu...,TRICKOUT
4268,"(adjective, adverb, interjection) awesome; coi...",JAWSOME
4269,Something socially unacceptable done in a soci...,PARTYFOUL
4270,A night when groups of adults get drunk and us...,NEWYEARSEVE


## Cleaning Dictionary

In [11]:
dict_df = pd.read_csv("data/dictionary.csv")

dict_df

Unnamed: 0,Word,POS,Definition
0,A,,The first letter of the English and of many ot...
1,A,,The name of the sixth tone in the model major ...
2,A,,An adjective commonly called the indefinite ar...
3,A,,"In each; to or for each; as """"""""twenty leagues..."
4,A,prep.,In; on; at; by.
...,...,...,...
175718,Zymotic,a.,Of pertaining to or caused by fermentation.
175719,Zymotic,a.,Designating or pertaining to a certain class o...
175720,Zythem,n.,See Zythum.
175721,Zythepsary,n.,A brewery.


In [12]:
dict_df = dict_df.drop(axis=1, labels=["POS"])
dict_df = dict_df.dropna()

dict_df

Unnamed: 0,Word,Definition
0,A,The first letter of the English and of many ot...
1,A,The name of the sixth tone in the model major ...
2,A,An adjective commonly called the indefinite ar...
3,A,"In each; to or for each; as """"""""twenty leagues..."
4,A,In; on; at; by.
...,...,...
175718,Zymotic,Of pertaining to or caused by fermentation.
175719,Zymotic,Designating or pertaining to a certain class o...
175720,Zythem,See Zythum.
175721,Zythepsary,A brewery.


In [13]:
dict_df["Word"] = dict_df["Word"].apply(fix_word)
dict_df = dict_df[dict_df["Word"].str.len() >= 3]

dict_df

Unnamed: 0,Word,Definition
11,AAM,A Dutch and German measure of liquids varying ...
12,AARDVARK,An edentate mammal of the genus Orycteropus so...
13,AARDWOLF,A carnivorous quadruped (Proteles Lalandii) of...
14,AARONIC,Alt. of Aaronical
15,AARONICAL,Pertaining to Aaron the first high priest of t...
...,...,...
175718,ZYMOTIC,Of pertaining to or caused by fermentation.
175719,ZYMOTIC,Designating or pertaining to a certain class o...
175720,ZYTHEM,See Zythum.
175721,ZYTHEPSARY,A brewery.


# Classify Words

In [14]:
words = pd.unique(pd.concat([nyt_df["Word"], dict_df["Word"], urb_df["Word"]], axis=0))

word_dict = {}

for i, word in enumerate(words):
    word_dict[word] = i
    
def find_in_dict(word):
    if word in word_dict:
        return word_dict[word]
    return -1
    
nyt_df["target"] = nyt_df["Word"].apply(find_in_dict)
urb_df["target"] = urb_df["Word"].apply(find_in_dict)
dict_df["target"] = dict_df["Word"].apply(find_in_dict)

dict_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_df["target"] = dict_df["Word"].apply(find_in_dict)


Unnamed: 0,Word,Definition,target
11,AAM,A Dutch and German measure of liquids varying ...,63314
12,AARDVARK,An edentate mammal of the genus Orycteropus so...,63315
13,AARDWOLF,A carnivorous quadruped (Proteles Lalandii) of...,63316
14,AARONIC,Alt. of Aaronical,63317
15,AARONICAL,Pertaining to Aaron the first high priest of t...,63318
...,...,...,...
175718,ZYMOTIC,Of pertaining to or caused by fermentation.,155027
175719,ZYMOTIC,Designating or pertaining to a certain class o...,155027
175720,ZYTHEM,See Zythum.,155028
175721,ZYTHEPSARY,A brewery.,155029


# Vectorize clues/definitions

In [15]:
port_stem = PorterStemmer()
stop = stopwords.words("english")

In [16]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content) # remove non alphabetical characters
    stemmed_content = stemmed_content.lower() # convert all to lower
    stemmed_content = stemmed_content.split() # convert to word list
    
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stop] # stem all words except those that are stopwords
    
    return ' '.join(stemmed_content) # return reconstructed list as a string

In [17]:
nyt_df["stemmed_clue"] = nyt_df["Clue"].apply(stemming)

nyt_df

Unnamed: 0,Word,Clue,target,stemmed_clue
0,PAT,"Action done while saying ""Good dog""",0,action done say good dog
1,RASCALS,Mischief-makers,1,mischief maker
2,PEN,It might click for a writer,2,might click writer
3,SEP,Fall mo.,3,fall mo
4,ECO,Kind to Mother Nature,4,kind mother natur
...,...,...,...,...
781568,NAT,Actor Pendleton,4369,actor pendleton
781569,SHRED,Bit,5871,bit
781570,NEA,Teachers' org.,2962,teacher org
781571,BEG,Petition,13115,petit


In [18]:
urb_df["stemmed_clue"] = urb_df["definition"].apply(stemming)

urb_df

Unnamed: 0,definition,Word,target,stemmed_clue
0,When a city or town specifically zones an area...,JIZZNEYLAND,155031,citi town specif zone area adult busi
2,Excellent health care reserved exclusively for...,WEALTHCARE,155032,excel health care reserv exclus wealthi
3,A dick thing you say to a tall person to piss ...,HOWSTHEWEATHERUPTHERE,155033,dick thing say tall person piss
4,"The insane, narcissistic, outrageous asshat wh...",RATFUCKER,155034,insan narcissist outrag asshat ratfuck way whi...
5,From the beginning.,FROMTHEGIDDYUP,155035,begin
...,...,...,...,...
4267,Verb. Etymology: coming from the modern car tu...,TRICKOUT,158006,verb etymolog come modern car tune scene phras...
4268,"(adjective, adverb, interjection) awesome; coi...",JAWSOME,158035,adject adverb interject awesom coin street sha...
4269,Something socially unacceptable done in a soci...,PARTYFOUL,29788,someth social unaccept done social gather
4270,A night when groups of adults get drunk and us...,NEWYEARSEVE,157545,night group adult get drunk use explos intox c...


In [19]:
dict_df["stemmed_clue"] = dict_df["Definition"].apply(stemming)

dict_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dict_df["stemmed_clue"] = dict_df["Definition"].apply(stemming)


Unnamed: 0,Word,Definition,target,stemmed_clue
11,AAM,A Dutch and German measure of liquids varying ...,63314,dutch german measur liquid vari differ citi am...
12,AARDVARK,An edentate mammal of the genus Orycteropus so...,63315,edent mammal genu orycteropu somewhat resembl ...
13,AARDWOLF,A carnivorous quadruped (Proteles Lalandii) of...,63316,carnivor quadrup protel lalandii south africa ...
14,AARONIC,Alt. of Aaronical,63317,alt aaron
15,AARONICAL,Pertaining to Aaron the first high priest of t...,63318,pertain aaron first high priest jew
...,...,...,...,...
175718,ZYMOTIC,Of pertaining to or caused by fermentation.,155027,pertain caus ferment
175719,ZYMOTIC,Designating or pertaining to a certain class o...,155027,design pertain certain class diseas see zymot ...
175720,ZYTHEM,See Zythum.,155028,see zythum
175721,ZYTHEPSARY,A brewery.,155029,breweri


In [20]:
vectorizer = TfidfVectorizer()

In [21]:
nyt_df_vecs = vectorizer.fit_transform(nyt_df["stemmed_clue"])
dict_df_vecs = vectorizer.transform(dict_df["stemmed_clue"])
urb_df_vecs = vectorizer.transform(urb_df["stemmed_clue"])