In [83]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
import math
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity

In [84]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vincentzhao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [85]:
dict_df = pd.read_csv("data/dictionary.csv")

dict_df

Unnamed: 0,Word,POS,Definition
0,A,,The first letter of the English and of many ot...
1,A,,The name of the sixth tone in the model major ...
2,A,,An adjective commonly called the indefinite ar...
3,A,,"In each; to or for each; as """"""""twenty leagues..."
4,A,prep.,In; on; at; by.
...,...,...,...
175718,Zymotic,a.,Of pertaining to or caused by fermentation.
175719,Zymotic,a.,Designating or pertaining to a certain class o...
175720,Zythem,n.,See Zythum.
175721,Zythepsary,n.,A brewery.


In [86]:
words = [
    "CRAB", "RAY", "SPONGE", "SQUID", 
    "CIRCLE", "DIAMOND", "SQUARE", "TRIANGLE",
    "BOB", "CROSS", "HOOK", "WEAVE",
    "FEAST", "FREE", "PANTS", "THAT"
]

In [87]:
dict_df["Word"] = dict_df["Word"].str.upper()

In [88]:
dict_df = dict_df[dict_df["Word"].isin(words)]

dict_df = dict_df.reset_index()
dict_df

Unnamed: 0,index,Word,POS,Definition
0,17498,BOB,n.,Anything that hangs so as to play loosely or w...
1,17499,BOB,n.,A knot of worms or of rags on a string used in...
2,17500,BOB,n.,A small piece of cork or light wood attached t...
3,17501,BOB,n.,The ball or heavy part of a pendulum; also the...
4,17502,BOB,n.,A small wheel made of leather with rounded edg...
...,...,...,...,...
202,171769,WEAVE,v. t.,To unite as threads of any kind in such a mann...
203,171770,WEAVE,v. t.,To form as cloth by interlacing threads; to co...
204,171771,WEAVE,v. i.,To practice weaving; to work with a loom.
205,171772,WEAVE,v. i.,To become woven or interwoven.


In [89]:
port_stem = PorterStemmer()
lemma = nltk.wordnet.WordNetLemmatizer()
sno = nltk.stem.SnowballStemmer('english')
stop = stopwords.words("english")

In [90]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content) # remove non alphabetical characters
    stemmed_content = stemmed_content.lower() # convert all to lower
    stemmed_content = stemmed_content.split() # convert to word list
    
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stop] # stem all words except those that are stopwords
    
    return ' '.join(stemmed_content) # return reconstructed list as a string

In [91]:
dict_df["stemmed"] = dict_df["Definition"].apply(stemming)

dict_df

Unnamed: 0,index,Word,POS,Definition,stemmed
0,17498,BOB,n.,Anything that hangs so as to play loosely or w...,anyth hang play loos short abrupt motion end s...
1,17499,BOB,n.,A knot of worms or of rags on a string used in...,knot worm rag string use angl eel formerli wor...
2,17500,BOB,n.,A small piece of cork or light wood attached t...,small piec cork light wood attach fish line sh...
3,17501,BOB,n.,The ball or heavy part of a pendulum; also the...,ball heavi part pendulum also ball weight end ...
4,17502,BOB,n.,A small wheel made of leather with rounded edg...,small wheel made leather round edg use polish ...
...,...,...,...,...,...
202,171769,WEAVE,v. t.,To unite as threads of any kind in such a mann...,unit thread kind manner form textur entwin int...
203,171770,WEAVE,v. t.,To form as cloth by interlacing threads; to co...,form cloth interlac thread compos textur kind ...
204,171771,WEAVE,v. i.,To practice weaving; to work with a loom.,practic weav work loom
205,171772,WEAVE,v. i.,To become woven or interwoven.,becom woven interwoven


In [92]:
dict_df[dict_df["Word"] == "CROSS"]

Unnamed: 0,index,Word,POS,Definition,stemmed
46,36503,CROSS,n.,A gibbet consisting of two pieces of timber pl...,gibbet consist two piec timber place transvers...
47,36504,CROSS,n.,The sign or mark of the cross made with the fi...,sign mark cross made finger ink etc actual rep...
48,36505,CROSS,n.,Affiction regarded as a test of patience or vi...,affict regard test patienc virtu trial disappo...
49,36506,CROSS,n.,A piece of money stamped with the figure of a ...,piec money stamp figur cross also side piec cr...
50,36507,CROSS,n.,An appendage or ornament or anything in the fo...,appendag ornament anyth form cross badg orname...
51,36508,CROSS,n.,A monument in the form of a cross or surmounte...,monument form cross surmount cross set public ...
52,36509,CROSS,n.,A common heraldic bearing of which there are m...,common herald bear mani varieti see illustr
53,36510,CROSS,n.,The crosslike mark or symbol used instead of a...,crosslik mark symbol use instead signatur unab...
54,36511,CROSS,n.,Church lands.,church land
55,36512,CROSS,n.,A line drawn across or through another line.,line drawn across anoth line


In [93]:
vectorizer = TfidfVectorizer()

In [94]:
dict_df_vecs = vectorizer.fit_transform(dict_df["stemmed"])

dict_df_vecs.shape

(207, 954)

In [95]:
matrix = dict_df_vecs.toarray()

In [96]:
def cosine_similarity(a, b):
    return np.dot(a,b)/(norm(a)*norm(b))

In [103]:
similarities = []

for i in range(len(matrix)):
    a = matrix[i]
    for j in range(i, len(matrix)):
        b = matrix[j]
        if dict_df.iloc[i]["Word"] != dict_df.iloc[j]["Word"]:
            similarities.append([dict_df.iloc[i]["Word"], dict_df.iloc[j]["Word"], cosine_similarity(a, b)/math.dist(a, b)])
            
df = pd.DataFrame(similarities, columns=["word_1", "word_2", "similarity"])

df

Unnamed: 0,word_1,word_2,similarity
0,BOB,CIRCLE,0.0
1,BOB,CIRCLE,0.0
2,BOB,CIRCLE,0.0
3,BOB,CIRCLE,0.0
4,BOB,CIRCLE,0.0
...,...,...,...
19235,TRIANGLE,WEAVE,0.0
19236,TRIANGLE,WEAVE,0.0
19237,TRIANGLE,WEAVE,0.0
19238,TRIANGLE,WEAVE,0.0


In [98]:
df = df.groupby(['word_1', 'word_2'])['similarity'].max().reset_index()

df

Unnamed: 0,word_1,word_2,similarity
0,BOB,CIRCLE,0.210082
1,BOB,CRAB,0.117265
2,BOB,CROSS,0.180934
3,BOB,DIAMOND,0.183026
4,BOB,FEAST,0.000000
...,...,...,...
86,SQUARE,TRIANGLE,0.540394
87,SQUARE,WEAVE,0.178597
88,SQUID,TRIANGLE,0.078060
89,SQUID,WEAVE,0.000000


In [104]:
df = df.drop_duplicates()
df = df.dropna()
df = df.sort_values(by="similarity", ascending=False)

df

Unnamed: 0,word_1,word_2,similarity
18958,SQUARE,TRIANGLE,0.540394
9760,CROSS,SQUARE,0.449716
4075,CIRCLE,TRIANGLE,0.321006
9752,CROSS,SQUARE,0.305237
9736,CROSS,SQUARE,0.300649
...,...,...,...
3752,CIRCLE,CROSS,0.000000
16021,HOOK,SPONGE,0.000000
3740,CIRCLE,CRAB,0.000000
16041,HOOK,SQUARE,0.000000


In [105]:
relation_dict = {}

for i, n in df.iterrows():
    word1 = n["word_1"]
    word2 = n["word_2"]
    
    key1 = (word1, word2)
    key2 = (word2, word1)
    
    relation_dict[key1] = n["similarity"]
    relation_dict[key2] = n["similarity"]

relation_dict

{('SQUARE', 'TRIANGLE'): 0.0,
 ('TRIANGLE', 'SQUARE'): 0.0,
 ('CROSS', 'SQUARE'): 0.0,
 ('SQUARE', 'CROSS'): 0.0,
 ('CIRCLE', 'TRIANGLE'): 0.0,
 ('TRIANGLE', 'CIRCLE'): 0.0,
 ('DIAMOND', 'SQUARE'): 0.0,
 ('SQUARE', 'DIAMOND'): 0.0,
 ('CROSS', 'HOOK'): 0.0,
 ('HOOK', 'CROSS'): 0.0,
 ('CROSS', 'DIAMOND'): 0.0,
 ('DIAMOND', 'CROSS'): 0.0,
 ('BOB', 'WEAVE'): 0.0,
 ('WEAVE', 'BOB'): 0.0,
 ('BOB', 'CIRCLE'): 0.0,
 ('CIRCLE', 'BOB'): 0.0,
 ('CROSS', 'TRIANGLE'): 0.0,
 ('TRIANGLE', 'CROSS'): 0.0,
 ('BOB', 'HOOK'): 0.0,
 ('HOOK', 'BOB'): 0.0,
 ('SPONGE', 'SQUID'): 0.0,
 ('SQUID', 'SPONGE'): 0.0,
 ('CROSS', 'RAY'): 0.0,
 ('RAY', 'CROSS'): 0.0,
 ('BOB', 'DIAMOND'): 0.0,
 ('DIAMOND', 'BOB'): 0.0,
 ('CIRCLE', 'SQUARE'): 0.0,
 ('SQUARE', 'CIRCLE'): 0.0,
 ('BOB', 'CROSS'): 0.0,
 ('CROSS', 'BOB'): 0.0,
 ('SQUARE', 'WEAVE'): 0.0,
 ('WEAVE', 'SQUARE'): 0.0,
 ('CRAB', 'TRIANGLE'): 0.0,
 ('TRIANGLE', 'CRAB'): 0.0,
 ('CIRCLE', 'CROSS'): 0.0,
 ('CROSS', 'CIRCLE'): 0.0,
 ('DIAMOND', 'RAY'): 0.0,
 ('RAY', 'DI

In [102]:
sim_4 = []

def similarity_4(a, b, c, d):
    return relation_dict[(a, b)] + relation_dict[(a, c)] + relation_dict[(a, d)] + relation_dict[(b, c)] + relation_dict[(b, d)] + relation_dict[(c, d)]

for i, a in enumerate(words):
    for j in range(i + 1, len(words)):
        b = words[j]
        for k in range(j + 1, len(words)):
            c = words[k]
            for l in range(k + 1, len(words)):
                d = words[l]
                try:
                    score = similarity_4(a, b, c, d)
                        
                    index = 0
                    
                    while index < len(sim_4) and score < sim_4[index][1]:
                        index += 1
                    sim_4.insert(index, ([a, b, c, d], score))
                except:
                    pass
                    

sim_4

[(['CIRCLE', 'SQUARE', 'TRIANGLE', 'CROSS'], 1.867036201821259),
 (['DIAMOND', 'SQUARE', 'TRIANGLE', 'CROSS'], 1.8402101499083239),
 (['SQUARE', 'TRIANGLE', 'CROSS', 'HOOK'], 1.670987426315047),
 (['CIRCLE', 'DIAMOND', 'SQUARE', 'TRIANGLE'], 1.6147300731682868),
 (['SQUARE', 'TRIANGLE', 'CROSS', 'WEAVE'], 1.6123646806757708),
 (['SQUARE', 'TRIANGLE', 'BOB', 'CROSS'], 1.61142246823822),
 (['CRAB', 'SQUARE', 'TRIANGLE', 'CROSS'], 1.5886868535209695),
 (['RAY', 'SQUARE', 'TRIANGLE', 'CROSS'], 1.5864838350449735),
 (['SQUARE', 'TRIANGLE', 'CROSS', 'FREE'], 1.5433953138319085),
 (['SPONGE', 'SQUARE', 'TRIANGLE', 'CROSS'], 1.5217226393735364),
 (['CIRCLE', 'SQUARE', 'TRIANGLE', 'BOB'], 1.4909264818061763),
 (['DIAMOND', 'SQUARE', 'BOB', 'CROSS'], 1.4806607950431139),
 (['DIAMOND', 'SQUARE', 'CROSS', 'HOOK'], 1.4738107747578741),
 (['CRAB', 'CIRCLE', 'SQUARE', 'TRIANGLE'], 1.4644996988505776),
 (['SQUID', 'SQUARE', 'TRIANGLE', 'CROSS'], 1.4628007090902488),
 (['CIRCLE', 'DIAMOND', 'SQUARE', '