In [132]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [82]:
# Emma, already split into sentences of token strings by the NLTK corpus reader.
emma = nltk.corpus.gutenberg.sents('austen-emma.txt')

# One token list per sentence: alphabetic tokens only, lower-cased.
emma_sentences = [
    [token.lower() for token in raw_sentence if token.isalpha()]
    for raw_sentence in emma
]

# Vocabulary: every distinct normalised token that appears in any sentence.
emma_word_set = {token for tokens in emma_sentences for token in tokens}

In [105]:
def TermFreq(document, word):
    """Return the relative frequency of ``word`` within ``document``.

    Parameters
    ----------
    document : sequence of str
        Tokens of a single document (must support ``len``).
    word : str
        Token to count; tokens are compared with ``==``.

    Returns
    -------
    float
        Number of occurrences of ``word`` divided by ``len(document)``,
        or ``0.0`` when ``document`` is empty.
    """
    doc_length = len(document)
    if doc_length == 0:
        # An empty token list (e.g. a sentence with no alphabetic tokens)
        # would otherwise raise ZeroDivisionError.
        return 0.0
    occurrences = sum(1 for token in document if token == word)
    return occurrences / doc_length


# Other cells in this notebook call this function by the short name ``TF``;
# keep that name bound so a fresh-kernel run does not fail with NameError.
TF = TermFreq

# Spot check: term frequency of 'ago' in sentence 5 (call the function by its defined name).
TermFreq(emma_sentences[5], 'ago')

0.024390243902439025

In [85]:
def build_DF_dict(documents=None, vocabulary=None):
    """Count, for every vocabulary word, how many documents contain it.

    Parameters
    ----------
    documents : iterable of iterable of str, optional
        Tokenised documents. Defaults to the notebook-level ``emma_sentences``.
    vocabulary : iterable of str, optional
        Words to report on. Defaults to the notebook-level ``emma_word_set``.

    Returns
    -------
    dict
        Maps each vocabulary word to its document frequency. Words that occur
        in no document map to ``0``; tokens outside the vocabulary are ignored.
    """
    if documents is None:
        documents = emma_sentences
    if vocabulary is None:
        vocabulary = emma_word_set

    output = dict.fromkeys(vocabulary, 0)
    # Single pass over the corpus instead of rescanning every document for
    # each vocabulary word.
    for doc in documents:
        # set(doc): a word repeated inside one document still counts once.
        for word in set(doc):
            if word in output:
                output[word] += 1
    return output
        
# Build the document-frequency table once; InverseDocumentFrequency reads it as a global.
df_dict = build_DF_dict()

# Spot check: document frequency recorded for 'ago'.
df_dict['ago']

In [86]:
def InverseDocumentFrequency(word):
    """Return the smoothed inverse document frequency of ``word``.

    Computes ``log(N / (df + 1))``, where ``N`` is the number of documents in
    the notebook-level ``emma_sentences`` and ``df`` is the word's entry in
    the notebook-level ``df_dict``. A word missing from ``df_dict`` is treated
    as having ``df = 0``.

    Parameters
    ----------
    word : str
        Word to look up.

    Returns
    -------
    numpy.float64
        The natural-log IDF value.
    """
    N = len(emma_sentences)
    # .get() covers only the "unseen word" case. Unlike a bare ``except`` it
    # does not silently turn a missing ``df_dict`` (NameError) or any other
    # genuine error into df = 1.
    df = df_dict.get(word, 0) + 1
    return np.log(N / df)

# Spot check: IDF of a single word.
InverseDocumentFrequency('ago')

In [113]:
def TFIDF(doc, word):
    """Return the TF-IDF weight of ``word`` in ``doc``.

    Parameters
    ----------
    doc : sequence of str
        Tokens of the document, passed through to ``TermFreq``.
    word : str
        Word to score.

    Returns
    -------
    float
        ``TermFreq(doc, word) * InverseDocumentFrequency(word)``.
    """
    # Call the function by the name it is defined under in this notebook
    # (``TermFreq``); the previous ``TF`` was never defined.
    tf = TermFreq(doc, word)
    idf = InverseDocumentFrequency(word)
    return tf * idf

# Show the TF-IDF weight of two words from sentence 5, one per line.
for term in ('ago', 'indistinct'):
    print(f'{term} - {TFIDF(emma_sentences[5], term)}')

ago - 0.13315118517327126
indistinct - 0.20152582861001603


In [114]:
def loadGlove(path):
    """Load whitespace-separated text embeddings into a ``{word: vector}`` dict.

    Each non-blank line is split on whitespace: the first field is the word,
    the remaining fields are parsed as floats.

    Parameters
    ----------
    path : str or path-like
        Location of the embeddings text file.

    Returns
    -------
    dict
        Maps each word to a 1-D ``numpy`` array of its values. If a word
        appears on several lines, the last one wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a value field cannot be parsed as a float.
    """
    model = {}
    # The context manager guarantees the handle is closed. The encoding is
    # pinned to UTF-8 (the encoding of the published GloVe files) so loading
    # does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as file:
        for l in file:
            line = l.split()
            if not line:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            model[line[0]] = np.array([float(val) for val in line[1:]])
    return model

# Relative path: the embeddings file is looked up in the current working directory.
glove = loadGlove('glove.6B.50d.txt')

In [146]:
# Look up the GloVe vector of every token in sentence 5 (raises KeyError for
# a token that is missing from ``glove``).
embeddings = [glove[token] for token in emma_sentences[5]]

# Unweighted average over tokens, reshaped to a single-row 2-D array.
mean_embedding = np.mean(embeddings, axis=0).reshape(1, -1)

print(mean_embedding)

[[ 3.32575634e-01  3.16596488e-01 -1.80050732e-01 -3.82070951e-01
   4.98493527e-01  5.33804805e-01 -5.46517073e-01  9.12476195e-02
  -1.31538483e-01 -2.71967805e-02  2.99867317e-02  2.64278024e-02
  -2.06519756e-01 -1.54796634e-01  4.28036366e-01 -5.74977317e-02
  -2.65928778e-01  1.60373902e-02 -2.84913561e-01 -2.01252268e-01
  -5.96390732e-02  5.72458220e-01  2.06195927e-01 -1.54312293e-01
   2.52049805e-01 -1.64638200e+00 -3.42686049e-01  1.02592522e-01
   1.42848000e-01 -1.09779902e-01  2.89345488e+00  7.36985634e-02
  -3.73648780e-03 -2.76292784e-01  1.50580049e-01  9.80399951e-02
   2.24408780e-03  2.83664024e-01  3.92979024e-02 -2.98091634e-01
  -1.17309171e-01  2.08815776e-01  6.89953902e-03  2.92777244e-02
   5.54180122e-02 -2.20519707e-01 -2.82007805e-01 -4.34917439e-01
  -9.69051537e-02 -1.67569878e-01]]


In [163]:
# Scale each token's GloVe vector by that token's TF-IDF weight in sentence 5.
embeddings = [
    glove[token] * TFIDF(emma_sentences[5], token)
    for token in emma_sentences[5]
]

# Average of the scaled vectors, reshaped to a single-row 2-D array.
tfidf_weighted_embedding = np.mean(embeddings, axis=0).reshape(1, -1)

print(tfidf_weighted_embedding)

[[ 0.03390627  0.04567951 -0.02513047 -0.05553374  0.06523389  0.07031937
  -0.06309126  0.02674499 -0.01073998 -0.00509068  0.00518551  0.00818713
  -0.01610237 -0.01486281  0.04954961 -0.0107796  -0.05029558  0.00039276
  -0.0192399  -0.01344365 -0.01123742  0.08506534  0.02145731 -0.0159164
   0.04411737 -0.17889813 -0.04006272  0.01603446  0.02090289 -0.01344211
   0.28346797  0.00696015  0.00484046 -0.02637939  0.01537125  0.01611019
   0.00316879  0.0324516   0.00829024 -0.04200008 -0.0058922   0.01996137
  -0.00305491 -0.00355021  0.01175475 -0.03423196 -0.02943769 -0.06810232
  -0.00775695 -0.0181068 ]]


In [164]:
# Compare the plain mean embedding with the TF-IDF-weighted one (both are single-row 2-D arrays).
cosine_similarity(mean_embedding, tfidf_weighted_embedding)

array([[0.98653879]])