In [1]:
import nltk
import string
from collections import Counter
import re
import pandas as pd

In [2]:
def pre_process(text):
    """Normalise raw text ahead of tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. delete single-line HTML comment blocks (``<!-- ... -->``);
      3. replace every run of digits and/or non-word characters with one space.

    Returns the cleaned string.
    """
    lowered = text.lower()
    # '.' does not cross newlines here, so only comments that open and close
    # on the same line are removed.
    without_comments = re.sub("<!--?.*?-->", "", lowered)
    # Underscores count as word characters and are therefore kept.
    return re.sub("(\\d|\\W)+", " ", without_comments)

In [3]:
def get_tokens(sentence):
    """Tokenise ``sentence`` with ``nltk.word_tokenize`` and return the result."""
    return nltk.word_tokenize(sentence)

In [4]:
# Make sure the NLTK stop-word corpus is available locally before importing
# the reader that `get_filtered_tokens` relies on.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pawanjeetkaur/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def get_filtered_tokens(tokens):
    """Return the tokens that are not English stop words, keeping their order.

    Args:
        tokens: iterable of token strings.

    Returns:
        A new list containing every token that is absent from NLTK's English
        stop-word list.
    """
    # Load the stop-word list once per call and keep it in a set, rather than
    # re-reading it and scanning it linearly for every single token.
    english_stopwords = set(stopwords.words('english'))
    return [w for w in tokens if w not in english_stopwords]

In [6]:
from nltk.stem.porter import *

In [7]:
def stem_tokens(tokens, stemmer):
    """Apply ``stemmer.stem`` to each token and return the results as a new list."""
    return [stemmer.stem(token) for token in tokens]

In [8]:
def processtext(text, stemmer):
    """Run the whole cleaning pipeline on one document.

    The text is cleaned with ``pre_process``, tokenised with ``get_tokens``,
    filtered with ``get_filtered_tokens`` and finally stemmed with
    ``stem_tokens`` using the supplied ``stemmer``.

    Returns the list of stemmed tokens.
    """
    cleaned = pre_process(text)
    kept = get_filtered_tokens(get_tokens(cleaned))
    return stem_tokens(kept, stemmer)

Run the extraction/cleaning pipeline over a list of files

In [9]:
def get_words_in_files(files):
    """Read every file and return its processed token list.

    Args:
        files: iterable of file paths to read as text.

    Returns:
        A list with one entry per file, in the same order as ``files``; each
        entry is whatever ``processtext`` returns for that file's contents.
    """
    # The stemmer is built without any per-file arguments, so one instance
    # is shared across all files instead of being rebuilt in the loop.
    stemmer = PorterStemmer()
    text_arr = []
    for path in files:
        # Files are opened with the platform's default text encoding.
        with open(path, 'r') as handle:
            text = handle.read()
        # Processing happens after the file has been closed.
        text_arr.append(processtext(text, stemmer))
    return text_arr

Create word-frequency records for every document

In [10]:
def count_words(word, words):
    """Return how many items of ``words`` compare equal to ``word``."""
    return sum(1 for candidate in words if candidate == word)
    

def get_word_count_total(text_each_doc):
    """Build a word-frequency record for every document.

    Args:
        text_each_doc: iterable of token lists, one list per document.

    Returns:
        A list with one dict per document, in input order:
          * ``"doc_id"``     - zero-based position of the document,
          * ``"freq"``       - plain dict mapping each distinct token to its
                               number of occurrences (first-seen order),
          * ``"doc_length"`` - total number of tokens in the document.
    """
    freq_per_doc = []
    for doc_id, words in enumerate(text_each_doc):
        freq_per_doc.append({
            "doc_id": doc_id,
            # Counter tallies all tokens in a single pass; convert back to a
            # plain dict so the record keeps the type callers already expect.
            "freq": dict(Counter(words)),
            "doc_length": len(words),
        })
    return freq_per_doc

In [11]:
# calculating tf

def cal_tf(docs_info):
    """Compute the term frequency of every word in every document.

    For each record in ``docs_info`` (with ``"doc_id"``, ``"freq"`` and
    ``"doc_length"`` keys) the score of a word is its count divided by the
    document length.

    Returns a flat list of ``{'doc_id', 'word_key', 'tf_score'}`` dicts.
    """
    scores = []
    for record in docs_info:
        identifier = record["doc_id"]
        counts = record["freq"]
        scores.extend(
            {
                'doc_id': identifier,
                'word_key': term,
                "tf_score": counts[term] / record["doc_length"],
            }
            for term in counts
        )
    return scores



In [12]:
# calculating idf = ln(total number of docs / (1 + number of docs with term in it))
import math
def cal_idf(docs_info):
    """Compute an inverse-document-frequency score for every (doc, word) pair.

    The score is ``ln(N / (1 + df))`` where ``N`` is the number of documents
    and ``df`` is the number of documents whose ``"freq"`` mapping contains
    the word. Because of the ``1 +`` term, a word that occurs in every
    document gets a negative score and a word that occurs in all but one
    document gets exactly 0.

    Args:
        docs_info: list of records with ``"doc_id"`` and ``"freq"`` keys.

    Returns:
        A flat list of ``{'doc_id', 'word_key', 'idf_score'}`` dicts, one per
        word of each document, in document order.
    """
    total_docs = len(docs_info)
    # Document frequency of each word, computed once up front instead of
    # rescanning every document for every (doc, word) pair.
    doc_freq = Counter(word for doc in docs_info for word in doc["freq"])

    idf_scores = []
    for doc in docs_info:
        doc_id = doc["doc_id"]
        for word in doc["freq"].keys():
            score = math.log(total_docs / (1 + doc_freq[word]))
            idf_scores.append({'doc_id': doc_id, 'word_key': word, "idf_score": score})

    return idf_scores
            
            

In [13]:
#calculate tfidf 
def cal_tf_idf(tf_scores, idf_scores):
    """Combine TF and IDF scores into per-document TF-IDF vectors.

    Args:
        tf_scores: list of ``{'doc_id', 'word_key', 'tf_score'}`` dicts.
        idf_scores: list of ``{'doc_id', 'word_key', 'idf_score'}`` dicts.

    Returns:
        ``{doc_id: {word: tf_score * idf_score}}``. A TF entry with no IDF
        entry for the same (doc_id, word_key) is skipped.
    """
    # Index the IDF scores by (doc, word) so each TF entry is matched with a
    # single lookup instead of a scan over the whole IDF list.
    idf_lookup = {
        (entry["doc_id"], entry["word_key"]): entry["idf_score"]
        for entry in idf_scores
    }

    tfidf = {}
    for entry in tf_scores:
        doc_id = entry["doc_id"]
        word = entry["word_key"]
        if (doc_id, word) not in idf_lookup:
            continue
        tfidf.setdefault(doc_id, {})[word] = entry["tf_score"] * idf_lookup[(doc_id, word)]
    return tfidf
    

cosine similarity 

In [14]:
from math import sqrt

def cosine_sim(doc1, doc2):
    """Cosine similarity between two sparse vectors.

    Args:
        doc1: dict mapping word -> weight for the first document.
        doc2: dict mapping word -> weight for the second document.

    Returns:
        ``dot(doc1, doc2) / (||doc1|| * ||doc2||)`` as a float, or ``0.0``
        when either vector has zero length (for example an empty document),
        so that disjoint or empty inputs no longer raise ZeroDivisionError.
    """
    # Only words present in both vectors contribute to the dot product.
    dot = sum(weight * doc2[word] for word, weight in doc1.items() if word in doc2)
    # Each norm is taken over the document's full vector, not just the
    # shared words; restricting it to shared words inflates the score.
    norm_1 = sqrt(sum(weight * weight for weight in doc1.values()))
    norm_2 = sqrt(sum(weight * weight for weight in doc2.values()))
    if norm_1 == 0 or norm_2 == 0:
        return 0.0
    return dot / (norm_1 * norm_2)

In [15]:
# Placeholder pair of input paths; `files` is reassigned further down in this
# cell before `process_everything` is first called.
files = ['../test_1.txt', '../test_2.txt']

def process_everything(files):
    """Return the TF-IDF cosine similarity of the first two files in ``files``.

    The files are read and tokenised, word counts are turned into TF and IDF
    scores, the two are combined into per-document TF-IDF vectors, and the
    vectors for documents 0 and 1 are compared with ``cosine_sim``.
    """
    tokens_per_doc = get_words_in_files(files)
    doc_stats = get_word_count_total(tokens_per_doc)
    tfidf_by_doc = cal_tf_idf(cal_tf(doc_stats), cal_idf(doc_stats))
    return cosine_sim(tfidf_by_doc[0], tfidf_by_doc[1])


# Compare the UIC document against each other document with TF-IDF cosine
# similarity. Each target is (label used in the printed heading, file path).
comparison_targets = [
    ("mit", '../MIT.txt'),
    ("UIUC", '../UIUC.txt'),
    ("standford", '../Standford.txt'),
    ("tesla", '../Tesla.txt'),
    ("uis", '../UIS.txt'),
]

cosine_scores = []
for label, other_file in comparison_targets:
    files = ['../UIC.txt', other_file]
    score = process_everything(files)
    print("between uic and " + label)
    print(score)
    cosine_scores.append(score)

# Keep the per-comparison names defined for any cell that reads them.
sim_1, sim_2, sim_3, sim_4, sim_5 = cosine_scores


between uic and mit
0.6533202324707292
between uic and UIUC
0.7845752749179364
between uic and standford
0.778343948284832
between uic and tesla
0.4919936036377691
between uic and uis
0.7759275725537244


Jaccard similarity = |A ∩ B| / |A ∪ B|  (size of the intersection divided by size of the union)

In [16]:
def get_intersection(A , B):
    """Return how many items of ``A`` are also contained in ``B``.

    Every item of ``A`` is tested individually, so repeated items in ``A``
    are counted once per occurrence.
    """
    return sum(1 for item in A if item in B)

In [17]:
def get_jaccard_sim(files):
    """Return the Jaccard similarity of the vocabularies of the first two files.

    The files are processed with ``get_words_in_files`` and
    ``get_word_count_total``; the distinct words of documents 0 and 1 are
    then compared as ``|A intersection B| / |A union B|``.
    """
    doc_stats = get_word_count_total(get_words_in_files(files))

    vocab_a = doc_stats[0]["freq"].keys()
    vocab_b = doc_stats[1]["freq"].keys()

    # Dict key views support set algebra directly.
    shared = len(vocab_a & vocab_b)
    combined = len(vocab_a | vocab_b)
    return shared / combined


In [18]:
# Compare the UIC document against each other document with Jaccard
# similarity. Each target is (label used in the printed heading, file path).
comparison_targets = [
    ("mit", '../MIT.txt'),
    ("UIUC", '../UIUC.txt'),
    ("standford", '../Standford.txt'),
    ("tesla", '../Tesla.txt'),
    ("uis", '../UIS.txt'),
]

jaccard_scores = []
for label, other_file in comparison_targets:
    files = ['../UIC.txt', other_file]
    score = get_jaccard_sim(files)
    print("between uic and " + label)
    print(score)
    jaccard_scores.append(score)

# Keep the per-comparison names defined for any cell that reads them.
sim_1, sim_2, sim_3, sim_4, sim_5 = jaccard_scores

between uic and mit
0.2160023446658851
between uic and UIUC
0.28739130434782606
between uic and standford
0.2661625708884688
between uic and tesla
0.1937479648323022
between uic and uis
0.2473604826546003
