### Submitted by Sara Sameer, 19k-1255
#### Assignment 02: Vector Space Model
##### Abstract:This is an implementation of vector space model using tf-idf weighing scheme. The output shows retrieved results in ranked order based on their cosine similarility. 
    

In [21]:
import re
from nltk.stem import PorterStemmer
import string
import numpy as np

### Data Cleaning

In [22]:
sw= []
with open ("Stopword-List.txt","r") as file:
    for line in file:
        sw+=line.split()
            
def stopwords_removal(line):
    line_without_sw = [word for word in line.split() if not word in sw]
    return line_without_sw
  

In [23]:
def cleaning(term):
        ps = PorterStemmer()
        term = term.lower()  #Normalize text
        term = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", term) #Remove Unicode
        term = re.sub(r'[^\w\s]','',term)
        term = re.sub(r'[0-9]', '', term)
        term = ps.stem(term)  #Stemming
        return term

### Inverted Index

In [24]:
tokens_dict = dict()

for i in range(1,449):
    doc_no = i
    list_of_words = []
    with open ("Abstracts/" + str(doc_no) + ".txt","r") as file:
        # Removing punctations manually because some conventions are modified.
        file = file.read().replace(".","").replace("n't"," not").replace("'","").replace("]"," ").replace("[","").replace(","," ").replace("?","").replace("\n"," ").replace("-"," ").replace('/'," ").split()
        for line in file: 
            line = stopwords_removal(line)
            for word in line:
                word = cleaning(word)
                if word in tokens_dict:
                    tokens_dict[word].add(i)
                else:
                    tokens_dict[word]= {i}  #To add more docs
                

### Inverse Document Frequency (IDF)

In [25]:
# idf_dict = dict()
# idf_dict
#IDF is universal value
def IDF(word):
    len_docs = 448 # Number of documents
    for k,v in tokens_dict.items():
        if word == k:
            return 1+ np.log(len_docs / len(v))
#             return round(np.log(len_docs / len(v)),5)

    

### Term Frequency (TF)

In [26]:
#TF varies from doc to doc
#Assuming a cleaned doc
def normalized_TF(word,doc, len_doc):
    word_count = 0
    for term in doc:
        if word == term:
            word_count +=1  
    #print(round((word_count/len_doc),2))
    return word_count
    #return round((word_count/len_doc),2)
            
    


### TF-IDF Scoring

In [27]:
def tf_idf(doc):
    vec = list()
    for word in doc:
        tf = normalized_TF(word,doc,len(doc))
        idf = IDF(word)
        score = tf*idf
        vec.append(score)
        
    return vec
        

In [28]:
cleaned_docs = []

def text_cleaning():
    for i in range(1,449):
        #print(i)
        temp_doc = []
        with open ("Abstracts/" + str(i) + ".txt","r") as file:
        # Removing punctations manually because some conventions are modified.
            file = file.read().replace(".","").replace("n't"," not").replace("'","").replace("]"," ").replace("[","").replace(","," ").replace("?","").replace("\n"," ").replace("-"," ").replace('/'," ").split()
            for line in file: 
                line = stopwords_removal(line)
                for word in line:
                    word = cleaning(word)
                    if word != "":
                        temp_doc.append(word)
        cleaned_docs.append(temp_doc)
        


### Document Vectors

In [29]:
text_cleaning()
# cleaned_docs[448]

In [34]:
doc_vec = []

def doc2vector(): 
    for index,doc in enumerate(cleaned_docs):
        vec =tf_idf(doc)
        doc_vec.append(vec)
doc2vector() 

# doc_vec[447]

### Cosine Similarity

In [31]:
#This function will make doc_vector that have tf_idf calculated for each term of every document
def cosine_similarity(query_score, doc_score):
#     print(np.dot(query_score,doc_score))
    dot_prod = np.dot(query_score,doc_score)
    query_vec_len = np.square(query_score).sum()
    doc_vec_len = np.square(doc_score).sum()
#     print(query_vec_len)
#     print(doc_vec_len)
    
    cosine_sim = dot_prod / (query_vec_len * doc_vec_len)
#     print(cosine_sim)
    return cosine_sim
    
    
        
        
            
            
            
        
        
        
    

### Query Processing

In [32]:
def main():
    query = input()
    query = stopwords_removal(query)
    query_list = []
    tf_idf_query_score = []
    cos_sim = []
    
    for terms in query:
        terms = cleaning(terms)
        query_list.append(terms)   
    tf_idf_query_score = tf_idf(query_list)
    

    docs_intersect_set = set()
    for index, terms in enumerate(query_list):
        for k, v in tokens_dict.items():
            if terms == k:
                for docs in v:
                    docs_intersect_set.add(docs)  
    
    docs_intersect_set = sorted(docs_intersect_set)
    for items in docs_intersect_set:
        docs_score = list()
        for terms in query_list:
#             print(terms)
#             print(items)
            try:
                index_ = cleaned_docs[items-1].index(terms)
                docs_score.append(doc_vec[items-1][index_])
                
            except ValueError:
                docs_score.append(0)   #For terms that are present in query, but not in document
                
        cos_sim.append(cosine_similarity(tf_idf_query_score, docs_score))
        cos_sim = sorted(cos_sim, reverse = True)
        
    
    total_doc = 0
    for index, value in enumerate(cos_sim):
            if value >= 0.001:  # Aplha value is 0.001
                print (f'Doc No: {docs_intersect_set[index]} is retrieved with cosine similarity value of : {value}')
#                 print()
                total_doc = total_doc +1
    
    print(f'Total documents retrieved: {total_doc}')
#     print(str(vec))

# Check Markov term in 110 with brackets markov(k) do cleaning

        

In [33]:
main()

weak heuristic
Doc No: 1 is retrieved with cosine similarity value of : 0.019497156334481312
Doc No: 35 is retrieved with cosine similarity value of : 0.019497156334481312
Doc No: 93 is retrieved with cosine similarity value of : 0.019497156334481312
Doc No: 101 is retrieved with cosine similarity value of : 0.019497156334481312
Doc No: 136 is retrieved with cosine similarity value of : 0.019497156334481312
Doc No: 172 is retrieved with cosine similarity value of : 0.019497156334481312
Doc No: 174 is retrieved with cosine similarity value of : 0.019497156334481312
Doc No: 213 is retrieved with cosine similarity value of : 0.019497156334481312
Doc No: 257 is retrieved with cosine similarity value of : 0.019497156334481312
Doc No: 299 is retrieved with cosine similarity value of : 0.019497156334481312
Doc No: 306 is retrieved with cosine similarity value of : 0.019497156334481312
Doc No: 361 is retrieved with cosine similarity value of : 0.019497156334481312
Doc No: 413 is retrieved with

In [4]:
dict_ = {'one': 221, 'two': 235, 'three': 544}
data = list(dict_.keys())
numbers = list(dict_.values())
numbers

[221, 235, 544]