In [1]:
import re
from nltk.stem import PorterStemmer
import string
import numpy as np

### Data Cleaning

In [2]:
sw= []
with open ("Stopword-List.txt","r") as file:
    for line in file:
        sw+=line.split()
            
def stopwords_removal(line):
    line_without_sw = [word for word in line.split() if not word in sw]
    return line_without_sw
  

In [3]:
def cleaning(term):
        ps = PorterStemmer()
        term = term.lower()  #Normalize text
        term = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", term) #Remove Unicode
        term = re.sub(r'[^\w\s]','',term)
        term = re.sub(r'[0-9]', '', term)
        term = ps.stem(term)  #Stemming
        return term

### Inverted Index

In [4]:
tokens_dict = dict()

for i in range(1,448):
    doc_no = i
    list_of_words = []
    with open ("Abstracts/" + str(doc_no) + ".txt","r") as file:
        # Removing punctations manually because some conventions are modified.
        file = file.read().replace(".","").replace("n't"," not").replace("'","").replace("]"," ").replace("[","").replace(","," ").replace("?","").replace("\n"," ").replace("-"," ").replace('/'," ").split()
        for line in file: 
            line = stopwords_removal(line)
            for word in line:
                word = cleaning(word)
                if word in tokens_dict:
                    tokens_dict[word].add(i)
                else:
                    tokens_dict[word]= {i}  #To add more docs
                

In [5]:
# with open('your_file.txt', 'w') as f:
#     for item in my_list:
#         f.write("%s\n" % item)
# For saving cleaned version



tokens_dict['bootstrap']

{181, 193, 379}

### Inverse Document Frequency (IDF)

In [185]:
# idf_dict = dict()
# idf_dict
#IDF is universal value
def IDF(word):
    len_docs = 448 # Number of documents
    for k,v in tokens_dict.items():
        if word == k:
            #print(round(1+ np.log(len_docs / len(v)),5))
            return round(1+ np.log(len_docs / len(v)),5)
#             return round(np.log(len_docs / len(v)),5)

    

### Term Frequency (TF)

In [194]:
#TF varies from doc to doc
#Assuming a cleaned doc
def normalized_TF(word,doc, len_doc):
    word_count = 0
    for term in doc:
        if word == term:
            word_count +=1  
    #print(round((word_count/len_doc),2))
    return word_count
    #return round((word_count/len_doc),2)
            
    
# normalized_TF('love', 'love elan',2) 

### TF-IDF Scoring

In [195]:
def tf_idf(doc):
    vec = list()
    for word in doc:
        tf = normalized_TF(word,doc,len(doc))
        idf = IDF(word)
        score = round((tf*idf),5)
        vec.append(score)
        
    return vec
        
# tf_idf('curs phenomena often linear curs avoid')

In [196]:
cleaned_docs = []

def text_cleaning():
    for i in range(1,448):
        temp_doc = []
        with open ("Abstracts/" + str(i) + ".txt","r") as file:
        # Removing punctations manually because some conventions are modified.
            file = file.read().replace(".","").replace("n't"," not").replace("'","").replace("]"," ").replace("[","").replace(","," ").replace("?","").replace("\n"," ").replace("-"," ").replace('/'," ").split()
            for line in file: 
                line = stopwords_removal(line)
                for word in line:
                    word = cleaning(word)
                    if word != "":
                        temp_doc.append(word)
            cleaned_docs.append(temp_doc)


### Document Vectors

In [197]:
text_cleaning()

In [198]:
doc_vec = []

def doc2vector(): 
    for doc in cleaned_docs:
        vec =tf_idf(doc)
        doc_vec.append(vec)
doc2vector() 

doc_vec[378][124]

5.00618

### Cosine Similarity

In [223]:
#This function will make doc_vector that have tf_idf calculated for each term of every document
def cosine_similarity(query_score, doc_score):
#     print(np.dot(query_score,doc_score))
    dot_prod = np.dot(query_score,doc_score)
    query_vec_len = np.square(query_score).sum()
    doc_vec_len = np.square(doc_score).sum()
#     print(query_vec_len)
#     print(doc_vec_len)
    
    cosine_sim = dot_prod / (query_vec_len * doc_vec_len)
#     print(cosine_sim)
    return round(cosine_sim,8)
    
    
        
        
            
            
            
        
        
        
    

### Query Processing

In [224]:
def main():
    query = input()
    query = stopwords_removal(query)
    query_list = []
    tf_idf_query_score = []
    cos_sim = []
    
    for terms in query:
        terms = cleaning(terms)
        query_list.append(terms)   
    tf_idf_query_score = tf_idf(query_list)
    

    docs_intersect_set = set()
    for index, terms in enumerate(query_list):
        for k, v in tokens_dict.items():
            if terms == k:
                for docs in v:
                    docs_intersect_set.add(docs)  
    
    docs_intersect_set = sorted(docs_intersect_set)
    for items in docs_intersect_set:
        docs_score = list()
        for terms in query_list:
#             print(terms)
#             print(items)
            try:
                index_ = cleaned_docs[items-1].index(terms)
#                 print(f'index is {index_}')
#                 print(index_)
#                 print(items-1)
#                 print(doc_vec[items-1][index_])
                docs_score.append(doc_vec[items-1][index_])
                
            except ValueError:
                docs_score.append(0)
                
#         print(docs_score)
#         print(tf_idf_query_score)
#         print("Cosine Similarity is ")
        print(cosine_similarity(tf_idf_query_score, docs_score))
        cos_sim.append(cosine_similarity(tf_idf_query_score, docs_score))
        
    
    
    
    for index, value in enumerate(cos_sim):
            if value > 0.001:  #Keep round to 5
                print (docs_intersect_set[index])
        
#     print(str(vec))
# Check Markov term in 110 with brackets markov(k) do cleaning

        

In [225]:
main()

markov
0.10012203
0.02503051
0.10012203
0.03337401
0.10012203
0.02503051
0.02503051
0.10012203
0.10012203
0.03337401
0.05006101
0.05006101
0.10012203
0.10012203
0.05006101
0.03337401
0.10012203
0.10012203
0.10012203
11
16
22
69
129
149
197
230
251
257
260
289
305
312
323
335
381
439
445


In [175]:
a = [0.702753576, 0.702753576]
b = [0.140550715, 0.140550715]
print(np.dot(a,b))

b = [3.5524, 3.5524]
a = [0.07105, 0.42629]
print(np.dot(b,a))

np.square(b).sum()

0.19754503515121366
1.766750616


25.23909152

In [81]:
doc_vec

[{'ensembl': 0.15877,
  'statist': 0.09764,
  'heurist': 0.14407,
  'model': 0.08788,
  'unsupervis': 0.07939,
  'word': 0.21661,
  'align': 0.55279,
  'learn': 0.03005,
  'need': 0.03276,
  'larg': 0.02856,
  'amount': 0.03847,
  'train': 0.04635,
  'data': 0.03104,
  'while': 0.02994,
  'they': 0.03344,
  'weak': 0.05313,
  'small': 0.03847,
  'size': 0.03773,
  'corpora': 0.05159,
  'thi': 0.04443,
  'paper': 0.01503,
  'propos': 0.01675,
  'new': 0.0252,
  'approach': 0.03903,
  'hybrid': 0.03969,
  'techniqu': 0.02317,
  'use': 0.04931,
  'method': 0.03623,
  'algorithm': 0.05434,
  'three': 0.03255,
  'base': 0.04738,
  'sever': 0.02962,
  'round': 0.07105,
  'gener': 0.02351,
  'the': 0.02769,
  'weigh': 0.07105,
  'scheme': 0.04014,
  'resampl': 0.05718,
  'vote': 0.05313,
  'score': 0.03886,
  'consid': 0.03027,
  'aggreg': 0.04707,
  'underli': 0.0462,
  'studi': 0.02309,
  'includ': 0.03044,
  'ibm': 0.07105,
  'dice': 0.06412,
  'measur': 0.02978,
  'our': 0.01975,
  'exper