In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 1. Reformatting data to fit our purposes

## Cleaning NYT data

In [7]:
nyt_df = pd.read_csv("data/nytcrosswords.csv", encoding='ISO-8859-1')

nyt_df

Unnamed: 0,Date,Word,Clue
0,10/31/2021,PAT,"Action done while saying ""Good dog"""
1,10/31/2021,RASCALS,Mischief-makers
2,10/31/2021,PEN,It might click for a writer
3,10/31/2021,SEP,Fall mo.
4,10/31/2021,ECO,Kind to Mother Nature
...,...,...,...
781568,11/21/1993,NAT,Actor Pendleton
781569,11/21/1993,SHRED,Bit
781570,11/21/1993,NEA,Teachers' org.
781571,11/21/1993,BEG,Petition


In [15]:
nyt_df = nyt_df.drop(labels=["Date"], axis=1)

In [14]:
nyt_df

Unnamed: 0,Date,Word,Clue
0,10/31/2021,PAT,"Action done while saying ""Good dog"""
1,10/31/2021,RASCALS,Mischief-makers
2,10/31/2021,PEN,It might click for a writer
3,10/31/2021,SEP,Fall mo.
4,10/31/2021,ECO,Kind to Mother Nature
...,...,...,...
781568,11/21/1993,NAT,Actor Pendleton
781569,11/21/1993,SHRED,Bit
781570,11/21/1993,NEA,Teachers' org.
781571,11/21/1993,BEG,Petition


## Cleaning Urban Dictionary

In [62]:
urb_df = pd.read_csv("data/urban_dictionary.csv")

urb_df

Unnamed: 0,definition,word,author,tags,up,down,date
0,When a city or town specifically zones an area...,Jizzneyland,whocaresaboutNY,"[u'#jisneyland', u'#gizzneyland', u'#adult zon...",6267,6382,"April 05, 2013"
1,A toilet bowl.,trump basket,jknightx,[],729,634,"March 23, 2017"
2,Excellent health care reserved exclusively for...,wealth care,Davis Finch,"[u'#insurance', u'#rich', u'#america', u'#capi...",1072,625,"May 16, 2008"
3,A dick thing you say to a tall person to piss ...,hows the weather up there,Jackalfu,[],1428,580,"March 20, 2017"
4,"The insane, narcissistic, outrageous asshat wh...",ratfucker,FedupAngryLiberal,"[u'#trump', u'#asshat', u'#scum', u'#asshole',...",1580,2351,"March 18, 2017"
...,...,...,...,...,...,...,...
4267,Verb. Etymology: coming from the modern car tu...,Trick Out,DJ FoxPhyre,[],1553,463,"March 26, 2003"
4268,"(adjective, adverb, interjection) awesome; coi...",jawsome,Diggs,[],1199,1903,"February 17, 2003"
4269,Something socially unacceptable done in a soci...,party foul,Stroll,[],4392,824,"June 28, 2004"
4270,A night when groups of adults get drunk and us...,new year's eve,j-hi,[],3130,1251,"January 07, 2004"


In [63]:
urb_df = urb_df[(urb_df["up"] > 1000) & (len(urb_df["word"]) >= 3) & (any(chr.isdigit() for chr in urb_df["word"]))]

urb_df

Unnamed: 0,definition,word,author,tags,up,down,date
0,When a city or town specifically zones an area...,Jizzneyland,whocaresaboutNY,"[u'#jisneyland', u'#gizzneyland', u'#adult zon...",6267,6382,"April 05, 2013"
2,Excellent health care reserved exclusively for...,wealth care,Davis Finch,"[u'#insurance', u'#rich', u'#america', u'#capi...",1072,625,"May 16, 2008"
3,A dick thing you say to a tall person to piss ...,hows the weather up there,Jackalfu,[],1428,580,"March 20, 2017"
4,"The insane, narcissistic, outrageous asshat wh...",ratfucker,FedupAngryLiberal,"[u'#trump', u'#asshat', u'#scum', u'#asshole',...",1580,2351,"March 18, 2017"
5,From the beginning.,from the giddy up,Chefpatrick,[],1344,685,"March 20, 2017"
...,...,...,...,...,...,...,...
4267,Verb. Etymology: coming from the modern car tu...,Trick Out,DJ FoxPhyre,[],1553,463,"March 26, 2003"
4268,"(adjective, adverb, interjection) awesome; coi...",jawsome,Diggs,[],1199,1903,"February 17, 2003"
4269,Something socially unacceptable done in a soci...,party foul,Stroll,[],4392,824,"June 28, 2004"
4270,A night when groups of adults get drunk and us...,new year's eve,j-hi,[],3130,1251,"January 07, 2004"


In [64]:
urb_df = urb_df.drop(labels=["author", "tags", "up", "down", "date"], axis=1)

urb_df

Unnamed: 0,definition,word
0,When a city or town specifically zones an area...,Jizzneyland
2,Excellent health care reserved exclusively for...,wealth care
3,A dick thing you say to a tall person to piss ...,hows the weather up there
4,"The insane, narcissistic, outrageous asshat wh...",ratfucker
5,From the beginning.,from the giddy up
...,...,...
4267,Verb. Etymology: coming from the modern car tu...,Trick Out
4268,"(adjective, adverb, interjection) awesome; coi...",jawsome
4269,Something socially unacceptable done in a soci...,party foul
4270,A night when groups of adults get drunk and us...,new year's eve


In [65]:
def fix_word(word):
    word = word.upper()
    word = re.sub('[^A-Z]', '', word)
    
    return word

urb_df["word"] = urb_df["word"].apply(fix_word)

urb_df

Unnamed: 0,definition,word
0,When a city or town specifically zones an area...,JIZZNEYLAND
2,Excellent health care reserved exclusively for...,WEALTHCARE
3,A dick thing you say to a tall person to piss ...,HOWSTHEWEATHERUPTHERE
4,"The insane, narcissistic, outrageous asshat wh...",RATFUCKER
5,From the beginning.,FROMTHEGIDDYUP
...,...,...
4267,Verb. Etymology: coming from the modern car tu...,TRICKOUT
4268,"(adjective, adverb, interjection) awesome; coi...",JAWSOME
4269,Something socially unacceptable done in a soci...,PARTYFOUL
4270,A night when groups of adults get drunk and us...,NEWYEARSEVE


## Cleaning Dictionary

In [77]:
dict_df = pd.read_csv("data/dictionary.csv")

urb_df.dtypes

definition    object
word          object
dtype: object

In [80]:
dict_df = dict_df[(len(dict_df["Word"]) >= 3)]

dict_df

ValueError: "len" is not a supported function