In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import defaultdict
import numpy as np
from numpy.linalg import norm
import pandas as pd

# Term-Document Matrix

In [4]:
documents = ["This is the first document.",
             "This document is the second document.",
             "And this is the third one.",
             "Is this the first document?"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

df = pd.DataFrame(X.T.todense(),
                  index=vectorizer.get_feature_names_out(),
                  columns=[n for n in documents])

df

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names_out'

# Term-Term Matrix

In [None]:
def tokenize(text):
    
    if isinstance(text, list):
        text = " ".join(text)
    
    for char in """1234567890.,:;!?()~*\-"'\n""":
        text = text.replace(char," ")
    
    text = text.lower().split()
    
    return text

def term_term_matrix(documents, window_size):
    
    d = defaultdict(int)
    V = set()
    
    text = tokenize(documents)
    
    for i in range(len(text)):
        token = text[i]
        V.add(token)
        next_token = text[i+1 : i+1+window_size]
        for t in next_token:
            key = tuple(sorted([t,token]))
            d[key] += 1
    
    V = sorted(V)
    df = pd.DataFrame(data=np.zeros((len(V), len(V)), dtype=np.int16),
                      index=V,
                      columns=V)
    
    for key, value in d.items():
        df.at[key[0], key[1]] = value
        df.at[key[1], key[0]] = value
        
    return df

In [None]:
documents = ["This is the first document.",
             "This document is the second document.",
             "And this is the third one.",
             "Is this the first document?"]

term_term_matrix(documents, 4)

# Cosine Similarity

In [None]:
bible = open("bible.txt").read()

df = term_term_matrix(bible, 4)

df

In [None]:
god = df["god"]/norm(df["god" ])
glory = df["glory"]/norm(df["glory"])
slave = df["slave"]/norm(df["slave"])
devil = df["devil"]/norm(df["devil"])

print(god @ glory)
print(god @ slave)
print(god @ devil)

# Term Frequency - Inverse Document Frequency

In [None]:
documents = ["This is the first document.",
             "This document is the second document.",
             "And this is the third one.",
             "Is this the first document?"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

df = pd.DataFrame(X.T.todense(),
                  index=vectorizer.get_feature_names_out(),
                  columns=[n for n in documents])

df

# Document Similarity

In [None]:
documents = [open("harrypotter1.txt").read(),
             open("harrypotter3.txt").read(),
             open("bible.txt").read()]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

df = pd.DataFrame(X.T.todense(),
                  index=[vectorizer.get_feature_names_out()],
                  columns=["HP 1", "HP 3", "Bible"])

df

In [None]:
print(df["HP 1"] @ df["HP 3"]/(norm(df["HP 1"])*norm(df["HP 3"])))
print(df["HP 1"] @ df["Bible"]/(norm(df["HP 1"])*norm(df["Bible"])))
print(df["HP 3"] @ df["Bible"]/(norm(df["HP 3"])*norm(df["Bible"])))

# Positive Pointwise Mutual Information

In [None]:
def PPMI(df,alpha=1):
    pij = df/df.sum().sum()
    pi  = np.sum(df.to_numpy(),axis=1)/df.sum().sum()
    pj  = np.sum(df.to_numpy(),axis=0)**alpha/(df.sum()**alpha).sum()
    A = np.log2(pij/(pi.reshape(pi.shape[0],1) @ pj.reshape(1,pj.shape[0])))
    #A[A<0] = 0
    return A

In [None]:
documents = ["This is the first document.",
             "This document is the second document.",
             "And this is the third one.",
             "Is this the first document?"]

PPMI(term_term_matrix(documents, 4))

In [12]:
PPMI(term_term_matrix(documents, 4),alpha=0.75)

  result = func(self.values, **kwargs)


Unnamed: 0,and,document,first,is,one,second,the,third,this
and,0.0,0.0,0.0,0.587819,0.0,0.943335,0.587819,0.943335,0.0
document,0.0,0.0,0.389136,0.472342,0.0,0.242896,0.694735,0.0,0.248658
first,0.0,0.668006,0.0,0.587819,0.358373,0.0,0.002857,0.0,0.62717
is,0.135981,0.445613,0.282221,0.0,0.135981,0.135981,0.102392,0.135981,0.141743
one,0.0,0.0,0.504614,0.587819,0.0,0.0,0.587819,0.943335,0.62717
second,0.943335,0.668006,0.0,0.587819,0.0,0.0,0.0,0.0,0.62717
the,0.135981,0.668006,0.0,0.102392,0.135981,0.0,0.0,0.135981,0.62717
third,0.943335,0.0,0.0,0.587819,0.943335,0.0,0.587819,0.0,0.62717
this,0.0,0.235046,0.334689,0.15486,0.188448,0.188448,0.640287,0.188448,0.0


# Word Similarity

In [13]:
text = open("harrypotter1.txt").read()
ppmi = PPMI(term_term_matrix(text, 4),alpha=0.75)
cosine = np.zeros(len(ppmi))

  result = func(self.values, **kwargs)


In [14]:
target = "fred"
for i,word in enumerate(ppmi.index):
    cosine[i] = ppmi.loc[target] @ ppmi.loc[word]/(norm(ppmi.loc[target])*norm(ppmi.loc[word]))

for i in np.argsort(cosine)[-10:][::-1]:
    print(ppmi.index[i],":",cosine[i])

fred : 0.9999999999999999
george : 0.44162828783040325
weasley : 0.25733323660990437
misters : 0.23413829925206553
party : 0.1799598040111192
prefect : 0.1732174028120061
responsible : 0.17117505499384783
mere : 0.17102011235897505
oliver : 0.16714629752360366
chase : 0.16573116102229024


# Latent Semantic Analysis

In [15]:
text = open("harrypotter1.txt").read()
ppmi = PPMI(term_term_matrix(text, 4),alpha=0.75)

V,S,D = np.linalg.svd(ppmi.to_numpy())

K = 200
df = pd.DataFrame(data=V[:,:K],
                  index=ppmi.index)

df

  result = func(self.values, **kwargs)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
a,-0.024513,-0.002009,0.053175,0.029953,-0.067987,0.000886,-0.001711,0.009179,0.007623,-0.000178,...,-0.015731,0.024517,0.011897,-0.015370,-0.012497,0.009113,0.008741,-0.003935,0.001203,0.014117
aaaaaaaaaargh,-0.006467,-0.007308,-0.004428,0.003676,-0.016272,-0.000561,0.002014,0.013575,0.002206,-0.001904,...,0.017097,-0.011797,0.013328,-0.001343,0.001773,0.014691,0.008925,-0.006075,-0.013140,-0.004045
aaaargh,-0.005044,-0.001409,-0.001799,-0.017806,-0.001677,-0.005613,0.011318,0.007801,0.011171,0.000204,...,0.011257,0.002292,-0.003620,0.010465,-0.002174,0.016402,0.008784,-0.007508,-0.013993,-0.000825
aaah,-0.003107,0.003959,-0.004221,-0.004086,-0.002184,-0.002632,0.003947,-0.008891,-0.001784,0.001551,...,-0.017995,0.024705,-0.004634,-0.002292,-0.012402,-0.021749,-0.034486,0.005221,-0.029446,0.006735
aargh,-0.008412,-0.007081,-0.012034,-0.012007,0.007467,0.006550,0.021455,-0.006598,-0.001580,-0.012125,...,-0.001520,0.006101,-0.000880,-0.007075,-0.008577,-0.014793,0.015152,-0.007009,-0.001453,0.002674
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zombie,-0.010668,-0.012984,-0.010473,0.008511,0.008265,0.000282,-0.023759,0.008861,0.017075,0.008577,...,0.003949,0.009656,0.009043,0.010214,0.035160,0.008141,0.002263,-0.001989,-0.000343,-0.001825
zoo,-0.016196,-0.005896,-0.005843,-0.004352,0.009394,0.011178,-0.002830,0.001623,-0.019821,0.013153,...,0.007291,0.016538,-0.009724,0.013578,0.003233,-0.016376,-0.011147,0.015449,0.002270,-0.033980
zoom,-0.006122,-0.004927,-0.001071,-0.006782,0.005180,0.004628,0.020436,0.010169,0.005768,-0.002991,...,-0.008257,-0.009619,0.008511,-0.007289,0.006009,0.000276,-0.004071,-0.004469,0.009864,-0.002568
zoomed,-0.008075,-0.019652,0.002937,-0.003357,0.018906,-0.009339,-0.010859,0.004267,0.001732,-0.017905,...,-0.004394,-0.008420,0.002378,0.009651,-0.004060,0.005364,-0.002502,-0.000765,0.004880,0.011093


In [16]:
cosine = np.zeros(len(df))

target = "fred"
for i,word in enumerate(df.index):
    cosine[i] = df.loc[target] @ df.loc[word]/(norm(df.loc[target])*norm(df.loc[word]))

for i in np.argsort(cosine)[-10:][::-1]:
    print(df.index[i],":",cosine[i])

fred : 1.0000000000000002
george : 0.8147000895756878
misters : 0.676451414136882
chase : 0.6188418461492329
weasley : 0.5794638326579802
party : 0.49461908258651904
bounded : 0.48968289427468004
tripe : 0.4580777913311184
ourselves : 0.4561894266632011
jokes : 0.4526611098833193
