In [1]:
import math
import numpy as np
from numpy import dot
from numpy.linalg import norm
from collections import Counter
from numpy import linalg as LA

np.set_printoptions(suppress=True)

In [2]:
# helper methods # 
def preprocess(filename):
    # open file
    file = open(filename, "rb")
    lines = file.read() # convert all to lowercase; type = str
    # tokenize
    tokens = lines.split()
    # pre-processing
    ct = Counter()
    for token in tokens:
        ct[token]+= 1
    return ct

def get_TFIDF_vector(doc, templ, corp_length, file_count):
    max_c = doc.most_common(1)[0][1]
    # print(max_c)
    v = [0] * corp_length
    for w in doc:
        c = doc.get(w)
        tf_w = c / max_c
        idf_w = math.log(file_count / docs_contain_W(w), 10)
        v[templ.index(w)] = tf_w * idf_w 
    return v

def docs_contain_W(w):
    count = 0
    for doc in docs:
        if doc.__contains__(w):
            count += 1
    return count


def build_corpora(docs):
    corp_temp = Counter()
    for doc in docs:
        corp_temp += doc
    occurrance = sum(corp_temp.values())
    wordType =  len(corp_temp)
    corpora = []
    for x, y in list(corp_temp.most_common()):
        corpora.append(x)
    return corpora

# Question 1 Warm up

## 1a)

In [3]:
# read files
doc1 = preprocess('./q1/doc1.txt')
doc2 = preprocess('./q1/doc2.txt')
doc3 = preprocess('./q1/doc3.txt')
file_count = 3
docs = [doc1, doc2, doc3]

# build corpora
corpora = build_corpora(docs)
corp_length = len(corpora)

## 1b) bulid TF-IDF expressions

In [4]:
tf_idf = np.zeros((corp_length, file_count))
for i in range(file_count):
    tf_idf[:, i] = get_TFIDF_vector(docs[i], corpora, corp_length, file_count)
    
titles = ["doc1      ", "doc2      ", "doc3"]
print(titles)
print(tf_idf)

['doc1      ', 'doc2      ', 'doc3']
[[0.         0.47712125 0.        ]
 [0.47712125 0.         0.        ]
 [0.47712125 0.         0.        ]
 [0.         0.31808084 0.        ]
 [0.         0.31808084 0.        ]
 [0.         0.31808084 0.        ]
 [0.23856063 0.         0.        ]
 [0.23856063 0.         0.        ]
 [0.23856063 0.         0.        ]
 [0.23856063 0.         0.        ]
 [0.         0.15904042 0.        ]
 [0.         0.15904042 0.        ]
 [0.         0.15904042 0.        ]
 [0.         0.15904042 0.        ]
 [0.         0.         0.47712125]
 [0.         0.         0.47712125]
 [0.         0.         0.47712125]
 [0.         0.         0.47712125]
 [0.         0.         0.47712125]
 [0.         0.         0.47712125]
 [0.         0.         0.47712125]
 [0.         0.         0.47712125]
 [0.         0.         0.47712125]
 [0.         0.         0.47712125]
 [0.         0.         0.47712125]
 [0.         0.         0.47712125]
 [0.         0.         0.4

## 1c) SVD

In [5]:
U,s,VT = np.linalg.svd(tf_idf,full_matrices=False)
topic_doc = s * VT
topic_term = s * U
print("topic_doc matrix")
print(topic_doc)
print("\n")
print("topic_term matrix")
print(topic_term)

topic_doc matrix
[[-0.         -0.         -0.79520209]
 [-1.96722133 -0.         -0.        ]
 [ 0.          0.82639825  0.        ]]


topic_term matrix
[[ 0.          0.          0.47712125]
 [ 0.         -0.47712125  0.        ]
 [ 0.         -0.47712125 -0.        ]
 [ 0.          0.          0.31808084]
 [ 0.          0.          0.31808084]
 [ 0.          0.          0.31808084]
 [ 0.         -0.23856063 -0.        ]
 [ 0.         -0.23856063 -0.        ]
 [ 0.         -0.23856063 -0.        ]
 [ 0.         -0.23856063 -0.        ]
 [ 0.          0.          0.15904042]
 [ 0.          0.          0.15904042]
 [ 0.          0.          0.15904042]
 [ 0.          0.          0.15904042]
 [-0.47712125  0.          0.        ]
 [-0.47712125  0.          0.        ]
 [-0.47712125  0.          0.        ]
 [-0.47712125  0.          0.        ]
 [-0.47712125  0.          0.        ]
 [-0.47712125  0.          0.        ]
 [-0.47712125  0.          0.        ]
 [-0.47712125  0.         

# Question 2  Cosine Similarity 

## 2a) build TF-IDF expressions

In [6]:
# read files
doc1 = preprocess('./q2/doc1.txt')
doc2 = preprocess('./q2/doc2.txt')
doc3 = preprocess('./q2/doc3.txt')
file_count = 3
docs = [doc1, doc2, doc3]

# build corpora
corpora = build_corpora(docs)
corp_length = len(corpora)

# build tf-itf expressions
tf_idf = np.zeros((corp_length, file_count))
for i in range(file_count):
    tf_idf[:, i] = get_TFIDF_vector(docs[i], corpora, corp_length, file_count)
    
print(tf_idf)

[[0.         0.         0.        ]
 [0.         0.         0.        ]
 [0.         0.         0.        ]
 ...
 [0.         0.         0.00745502]
 [0.         0.         0.00745502]
 [0.         0.         0.00745502]]


## 2b) cosine similarity 

In [7]:
def cosine_sim(v1, v2):
    cos_sim = dot(v1, v2)/(norm(v1)*norm(v2))
    return cos_sim

In [8]:
# cosine similarity of 2 related files
sim1 = cosine_sim(tf_idf[:, 0], tf_idf[:, 1])
print("the cosine similarity between doc 1 and doc 2 files = ", sim1)

# cosine similarity of 2 unrelated files
sim2 = cosine_sim(tf_idf[:, 0], tf_idf[:, 2])
print("the cosine similarity between doc 1 and doc 3 files = ", sim2)

the cosine similarity between doc 1 and doc 2 files =  0.04619458366523629
the cosine similarity between doc 1 and doc 3 files =  0.005158837754693584


# Question 3: SVD in LSA

## 3a) build SVD

In [11]:
# read files 
list_of_names = np.arange(1, 31)
group1 = np.arange(0, 20)
group2 = np.arange(10, 20)
group3 = np.arange(20, 30)
file_count = len(list_of_names)
docs = []
for i in range(file_count):
    doc = preprocess("./q3/doc" + str(list_of_names[i]) + ".txt")
    docs.append(doc)

# build corpora
corpora = build_corpora(docs)
corp_length = len(corpora)

# build TF-IDF expressions
tf_idf = np.zeros((corp_length, file_count))
for i in range(file_count):
    tf_idf[:, i] = get_TFIDF_vector(docs[i], corpora, corp_length, file_count)

# regular SVD
U,s,VT = np.linalg.svd(tf_idf,full_matrices=False)
topic_doc = s * VT.T
topic_term = s * U

topic_doc = np.absolute(topic_doc)
topic_doc = topic_doc / sum(topic_doc)
np.set_printoptions(suppress=True)
np.savetxt("topic_doc.csv", topic_doc, delimiter=",")

## 3b) Cosine Similarity 

In [10]:
# intra-group similarity for a given doc
ref_file = topic_doc[:, 0]  # doc1.txt
intra_sims = [];
for i in group1:
    sim = cosine_sim(topic_doc[:,i], ref_file)
    intra_sims.append(sim)
print("intra-group similarities = \n", intra_sims)
intra_sims = np.delete(intra_sims, [0])
avg_intra_sim = np.average(intra_sims)
print("average intra-group similarity = ", avg_intra_sim)

# inter-group similarity for the same doc
inter_sims = [];
for i in range(file_count):
    if i in group1:
        continue
    sim = cosine_sim(topic_doc[:,i], ref_file)
    inter_sims.append(sim)
print("inter-group similarities = \n", inter_sims)
avg_inter_sim = np.average(inter_sims)
print("average inter-group similarity = ", avg_inter_sim)

intra-group similarities = 
 [1.0000000000000002, 0.46119291712467914, 0.3128200817977986, 0.351194160450727, 0.20652809802211392, 0.5497952825846238, 0.103929701466062, 0.15978044041613068, 0.24160164015197202, 0.12674644914591957, 0.1423517131625161, 0.15977641506823348, 0.22995066374834, 0.16623590913703315, 0.16084885971778035, 0.4101032674780667, 0.15033430454526567, 0.3151120636210163, 0.13278481390004498, 0.1539124493776992]
average intra-group similarity =  0.23868417004821174
inter-group similarities = 
 [0.11888892678785296, 0.07532405121376341, 0.10620201007276478, 0.2640757139552464, 0.22834167732503063, 0.24132929233602696, 0.1359301239014523, 0.05638144038980467, 0.06530661970879373, 0.04361743876548238]
average inter-group similarity =  0.1335397294456218
