## Import Libraries

In [None]:
import FileManager
import WordCleaner
import Indexer
import Matcher
%load_ext autoreload
%autoreload 2

## Load Files

In [None]:
# Keep every preprocessing stage in one list: index 0 is the corpus as loaded
# from the CSV, and later cells append their output so that datasets[-1] is
# always the most recent stage.
datasets = [FileManager.csvToDict('testing.csv')]
dataset = datasets[0]

## Stemming

In [None]:
# Stem every row of the latest stage with the 'porter' option and push the
# result onto the stage list as a new dataset.
dataset = datasets[-1]
stemmed_dataset = {
    doc_id: WordCleaner.stem(dataset[doc_id], 'porter') for doc_id in dataset
}
datasets.append(stemmed_dataset)

## OR Lemmatization (alternative to stemming: run only one of the two cells)

In [None]:
# Lemmatize every row of the latest stage and push the result onto the stage
# list as a new dataset.
# NOTE(review): this reads datasets[-1], so if the stemming cell was run first
# the lemmatizer receives already-stemmed rows. Presumably only one of the two
# cells is meant to be executed; confirm.
dataset = datasets[-1]
lemmad_dataset = {
    doc_id: WordCleaner.lemmatize(dataset[doc_id]) for doc_id in dataset
}
datasets.append(lemmad_dataset)

## Removing Stopwords

In [None]:
# Strip stop words from every row of the latest stage and record the filtered
# corpus as a new stage.
# NOTE(review): this runs after stemming/lemmatization, whereas the query cells
# remove stop words before stemming. Confirm the stop-word list still matches
# the normalized tokens.
dataset = datasets[-1]
filtered_dataset = {
    doc_id: WordCleaner.removeStopWords(dataset[doc_id]) for doc_id in dataset
}
datasets.append(filtered_dataset)

## Creating the inverted index

In [None]:
# Build the inverted index from the most recent preprocessing stage. The bare
# name on the last line makes the notebook display the result.
inverted_index = Indexer.getInvertedIndex(datasets[-1])
inverted_index

## Calculating tf-idf for the document

### Using scikit-learn

In [None]:
# Compute TF-IDF over the most recent preprocessing stage. The call returns a
# pair that is unpacked into tfidf_matrix and df; df is the object the
# similarity cells below operate on.
# NOTE(review): presumably the scikit-learn-backed implementation, per the
# section heading; confirm in Indexer.
(tfidf_matrix, df) = Indexer.calculateTF_IDF(datasets[-1])

df

### Manually

In [None]:
# Compute TF-IDF over the most recent preprocessing stage with the manual
# routine. This rebinds the same tfidf_matrix and df names as the previous
# TF-IDF cell, so whichever of the two cells ran last decides what the
# similarity cells see.
(tfidf_matrix, df) = Indexer.calculateManualTF_IDF(datasets[-1])

df

## Calculate Cosine Similarity

In [None]:
# Use one corpus row (positional index 8 of df) as the reference vector and
# compare it against the whole corpus.
reference_row = df.iloc[8]
related_docs = Matcher.calcCosSimWithCorpus(df, reference_row)

# Report every (doc, similarity) pair the matcher returned.
# NOTE(review): the 0.5 cutoff named in the header and any sorting are
# presumably applied inside Matcher; confirm.
print("Related Docs (similarity > 0.5):")
for doc, sim in related_docs:
    print(f"Doc {doc}: Similarity = {sim:.4f}")

## Query Manipulation 

### Enter the Query

In [None]:
from nltk.tokenize import word_tokenize
# Tokenize the free-text query so the following cells can clean it token by token.
query = word_tokenize('government acquired the cherokee outlet under congressional')

### Removing Stop Words

In [None]:
# Filter the tokenized query through the same stop-word helper used on the
# corpus, then display the remaining tokens.
query = WordCleaner.removeStopWords(query)
query

### Stem or Lemmatize

In [None]:
# Stem the query tokens with the same 'porter' option the corpus stemming cell
# uses, then display them.
# NOTE(review): if the corpus was lemmatized instead of stemmed, the query
# presumably needs WordCleaner.lemmatize here to stay comparable; confirm.
query = WordCleaner.stem(query, 'porter')
query

### Calculate TF-IDF

In [None]:
# Collect the corpus vocabulary: every distinct token of the latest stage, in
# first-seen order. dict.fromkeys de-duplicates while preserving insertion
# order, which avoids the quadratic cost of testing membership against a
# growing list (assumes tokens are hashable, e.g. strings).
corpus = datasets[-1]
all_tokens = list(
    dict.fromkeys(token for key in corpus for token in corpus[key])
)

# Compute TF-IDF for the query against that vocabulary. The call returns a
# pair that is unpacked into query_tfidf_matrix and qdf; qdf is what the
# similarity cell below reads.
(query_tfidf_matrix, qdf) = Indexer.calculateDocTF_IDF(corpus, all_tokens, query)

qdf
# query_tfidf_matrix

### Calculate Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Compare the query against the corpus: the first row of qdf is passed as the
# query vector alongside the corpus TF-IDF frame df.
query_vector = qdf.iloc[0]
related_docs = Matcher.calcCosSimWithCorpus(df, query_vector)

# Report every (doc, similarity) pair the matcher returned.
# NOTE(review): the 0.5 cutoff named in the header and any sorting are
# presumably applied inside Matcher; confirm.
print("Related Docs (similarity > 0.5):")
for doc, sim in related_docs:
    print(f"Doc {doc}: Similarity = {sim:.4f}")

## Write To File

In [None]:
# Persist the most recent preprocessing stage to 'stemmed.csv', one row per
# document with the columns 'id' and 'text'.
file_writer, file = FileManager.openCSVWriter('stemmed.csv', ['id', 'text'])
try:
    for key in datasets[-1]:
        file_writer.writerow({'id': key, 'text': datasets[-1][key]})
finally:
    # Close the handle even if a row fails to serialize, so the file is not
    # left open for the rest of the notebook session.
    file.close()