## Import Libraries

In [None]:
import FileManager
import WordCleaner
import Indexer
import Matcher
from sklearn.feature_extraction.text import TfidfVectorizer
%load_ext autoreload
%autoreload 2

## Load Files

In [None]:
# One shared vectorizer instance: it is passed to both the corpus TF-IDF step
# and the query TF-IDF step later in the notebook.
vectorizer = TfidfVectorizer()

# Load the corpus from CSV. Presumably a dict keyed by row id (later cells
# iterate its keys and index it with them) — confirm against FileManager.
dataset = FileManager.csvToDict('testing.csv')
# `datasets` is a history of processing stages; every later step reads
# datasets[-1] and appends its own result.
datasets = [dataset]

## Stemming

In [None]:
# Start from the latest processing stage and stem every row with the
# 'porter' stemmer, keeping the same row keys.
dataset = datasets[-1]
stemmed_dataset = {
    row_id: WordCleaner.stem(dataset[row_id], 'porter')
    for row_id in dataset
}
# Record the stemmed corpus as the newest stage.
datasets.append(stemmed_dataset)

## OR Lemmatization (alternative to stemming)

In [None]:
# Start from the latest processing stage and lemmatize every row,
# keeping the same row keys.
dataset = datasets[-1]
lemmad_dataset = {
    row_id: WordCleaner.lemmatize(dataset[row_id])
    for row_id in dataset
}
# Record the lemmatized corpus as the newest stage.
datasets.append(lemmad_dataset)

## Removing Stopwords

In [None]:
# Strip stop words from every row of the latest processing stage,
# keeping the same row keys.
dataset = datasets[-1]
filtered_dataset = {
    row_id: WordCleaner.removeStopWords(dataset[row_id])
    for row_id in dataset
}
# Record the filtered corpus as the newest stage.
datasets.append(filtered_dataset)

# getsizeof(datasets[-1]) #15379200

## Creating the inverted index

In [None]:
# Build the inverted index from the most recent processing stage.
inverted_index = Indexer.getInvertedIndex(datasets[-1])
# Bare expression so the notebook displays the index as the cell output.
inverted_index

## Calculating tf-idf for the document

### using Scikit Learn

In [None]:
# Compute TF-IDF for the latest processing stage using the shared vectorizer.
# The helper returns a pair: the TF-IDF matrix and `df`.
# NOTE(review): `df` is presumably a pandas DataFrame (it is indexed with
# `.iloc` in the cosine-similarity cell) — confirm against Indexer.
(tfidf_matrix, df) = Indexer.calculateTF_IDF(datasets[-1], vectorizer)

# Bare expression so the notebook displays `df` as the cell output.
df

## Calculate Cosine Similarity

In [None]:
# Compare one corpus row (positional index 8, hard-coded) against the whole
# TF-IDF frame. The result is iterated below as (doc, similarity) pairs.
related_docs = Matcher.calcCosSimWithCorpus(df,df.iloc[8])

# Print the related documents.
# NOTE(review): the "> 0.5" in the label and any sorting are not visible in
# this cell (no threshold is passed here) — presumably applied inside
# Matcher.calcCosSimWithCorpus; confirm so the label stays accurate.
print("Related Docs (similarity > 0.5):")
for doc, sim in related_docs:
    print(f"Doc {doc}: Similarity = {sim:.4f}")

## Query Manipulation 

### Enter the Query

In [None]:
from nltk.tokenize import word_tokenize
# Free-text query to match against the corpus (edit this string to search
# for something else).
query = 'played college football university tennessee frost attended franklin high school accepting football scholarship university tennessee held team sophomore 1960 teammates voted outstanding lineman 1961 suspended meeting school academic requirements left school college eligibility join nfl frost signed undrafted free agent dallas cowboys 1961 nfl draft time seen coup skills compared first round draft choice bob lilly although backup named nfl rookie team became starter defensive tackle second season suffered torn ligaments right knee third game los angeles rams lost year injury would eventually end career may 13 1963 traded cleveland browns exchange third round draft choice 39 roger pillath frost waived cleveland browns september 3 1963 1963,'
# Replace the raw string with its token list; the following cells operate
# on `query` as a list of tokens.
query = word_tokenize(query)

### Removing Stop Words

In [None]:
# Apply the same stop-word removal helper that was used on the corpus rows.
query = WordCleaner.removeStopWords(query)
# Bare expression so the notebook displays the filtered tokens.
query

### Stem or Lemmatize

In [None]:
# Stem the query tokens with the 'porter' stemmer, the same call used in the
# corpus stemming cell.
# NOTE(review): if the corpus was lemmatized instead, the query should
# presumably go through WordCleaner.lemmatize here — confirm.
query = WordCleaner.stem(query, 'porter')

### Calculate TF-IDF

In [None]:
# Join the query tokens back into a single space-separated string and pass it
# as a one-element list, together with the same vectorizer instance that was
# used for the corpus.
# NOTE(review): presumably the helper reuses the already-fitted vocabulary so
# the query columns line up with `df` — confirm in Indexer.calculateDocTF_IDF.
(query_tfidf_matrix, qdf) = Indexer.calculateDocTF_IDF([' '.join(query)],vectorizer)

# Bare expression so the notebook displays `qdf` as the cell output.
qdf

### Calculate Cosine Similarity

In [None]:
# Match the query TF-IDF frame against the corpus TF-IDF frame.
# NOTE(review): 0.1 is presumably the minimum similarity threshold — confirm
# against Matcher.getSimilarRows.
similar_rows = Matcher.getSimilarRows(df,qdf,0.1)

# `similar_rows` is iterated via .items(), so each printed value is a
# (key, value) tuple.
for row in similar_rows.items():
    print(row)

## Write To File

In [None]:
# Persist the most recent processing stage to CSV with columns 'id' and 'text'.
# NOTE(review): the output is always named 'stemmed.csv', even when the last
# stage was lemmatization or stop-word removal — confirm this is intended.
file_writer, file = FileManager.openCSVWriter('stemmed.csv',['id','text'])
try:
    for key in datasets[-1]:
        file_writer.writerow({'id': key, 'text': datasets[-1][key]})
finally:
    # Always release the file handle, even if a row fails to serialize;
    # otherwise the handle leaks and buffered rows may never be flushed.
    file.close()