# Import Libraries

In [None]:
import FileManager
import WordCleaner
import Indexer
import Matcher
import Evaluater
from sklearn.feature_extraction.text import TfidfVectorizer
%load_ext autoreload
%autoreload 2

# Dataset Manipulation 

## Load Files

In [None]:
# A single TfidfVectorizer instance is created here and handed to every
# Indexer call in the cells below (documents and queries alike).
vectorizer = TfidfVectorizer()

# `datasets` acts as a stack of processing stages: each later cell reads
# datasets[-1] and appends its own result.
datasets = [FileManager.csv_to_dict('wikir/stemmed4.csv')]
dataset = datasets[0]

## Stemming

In [None]:
# Run WordCleaner.stem (with the 'Snowball' option) over every entry of the
# most recent stage and push the result as a new stage.
dataset = datasets[-1]
stemmed_dataset = {
    doc_id: WordCleaner.stem(content, 'Snowball')
    for doc_id, content in dataset.items()
}
datasets.append(stemmed_dataset)

## Or Lemmatization (alternative to Stemming)

In [None]:
# Run WordCleaner.lemmatize over every entry of the most recent stage and
# push the result as a new stage.
dataset = datasets[-1]
lemmad_dataset = {
    doc_id: WordCleaner.lemmatize(content)
    for doc_id, content in dataset.items()
}
datasets.append(lemmad_dataset)

## Removing Stopwords

In [None]:
# Pass every entry of the most recent stage through
# WordCleaner.remove_stop_words and push the result as a new stage.
dataset = datasets[-1]
filtered_dataset = {
    doc_id: WordCleaner.remove_stop_words(content)
    for doc_id, content in dataset.items()
}
datasets.append(filtered_dataset)

# getsizeof(datasets[-1]) #15379200

## Calculating TF-IDF for the Documents

### using Scikit Learn

In [None]:
# TF-IDF representation of the latest dataset stage, computed through the
# shared vectorizer.
tfidf_matrix = Indexer.calculate_tf_idf(datasets[-1], vectorizer)

# Dataset keys in iteration order; passed to Matcher.get_query_answers in
# the file-based similarity cell.
dataset_key = list(datasets[-1])

# Disabled inspection aid: builds a labelled pandas DataFrame `df` from the
# matrix. NOTE(review): the "Calculate Cosine Sim" cell references `df`, which
# exists only if this snippet is re-enabled. `.toarray()` presumably
# densifies a sparse matrix, which may be why it is switched off -- confirm.
# import pandas as pd
# tfidf_matrix
# df = pd.DataFrame(
#     tfidf_matrix.toarray(),
#     columns=vectorizer.get_feature_names_out(),
#     index=datasets[-1].keys(),
# )
#
# df

## Calculate Cosine Similarity

In [None]:
# NOTE(review): `df` is not assigned by any active code in this notebook; its
# only construction is the commented-out pandas DataFrame in the TF-IDF cell,
# so this cell raises NameError unless that snippet has been run first.
# Compares row 8 of `df` (df.iloc[8]) against `df` through the Matcher helper.
related_docs = Matcher.calculate_cos_similarity(df,df.iloc[8])

# Print the sorted related documents
# NOTE(review): neither the 0.5 threshold nor the sorting is visible in the
# call above; presumably both happen inside Matcher.calculate_cos_similarity
# -- confirm.
print("Related Docs (similarity > 0.5):")
# Each element of related_docs is unpacked as a (doc, similarity) pair.
for doc, sim in related_docs:
    print(f"Doc {doc}: Similarity = {sim:.4f}")

# Query Manipulation 

## Enter query

### Manual

In [None]:
from nltk.tokenize import word_tokenize

# Hand-written sample query, split into tokens with NLTK so the cleaning
# cells below receive a token list.
query = word_tokenize('hello sister where is your hijab')

### From File

In [None]:
# Load the query set from CSV. Later cells iterate it with .keys() and index
# it by key, so it is used as a mapping of query key -> query content.
queries = FileManager.csv_to_dict('wikir/queries.csv')

## Removing Stop Words

In [None]:
# Filter the manually entered query through the same
# WordCleaner.remove_stop_words helper that is applied to the documents.
query = WordCleaner.remove_stop_words(query)

## Stem or Lemmatize

In [None]:
# Stem the manual query with the same WordCleaner.stem(..., 'Snowball') call
# used in the dataset "Stemming" cell.
query = WordCleaner.stem(query, 'Snowball')

### From File

In [None]:
# Overwrite each file-based query, in place, with its stemmed form
# ('Snowball' option). Only values are reassigned, so iterating the dict
# while updating it is safe.
for key, raw_query in queries.items():
    queries[key] = WordCleaner.stem(raw_query, 'Snowball')

## Calculate TF-IDF

In [None]:
# The query tokens are re-joined into one space-separated string and wrapped
# in a single-element list before being vectorized with the shared vectorizer.
query_text = ' '.join(query)
qdf = Indexer.calculate_doc_tf_idf([query_text], vectorizer)

# Bare expression: displayed as the cell output in the notebook.
qdf

### From File

In [None]:
# One TF-IDF representation per file-based query, keyed like `queries`.
# Each query's tokens are joined with spaces and vectorized as a
# single-element list, mirroring the manual-query cell.
queries_matrices = {
    key: Indexer.calculate_doc_tf_idf([' '.join(terms)], vectorizer)
    for key, terms in queries.items()
}

## Calculate Cosine Similarity

In [None]:
# Retrieve the matches for the manual query.
# `dataset_key` is passed as the third argument so this call has the same
# (matrix, query, keys, threshold) shape as the file-based cell; previously
# the 0.25 threshold was passed in the position where that cell passes the
# dataset keys.
similar_rows = Matcher.get_query_answers(tfidf_matrix, qdf, dataset_key, 0.25)

# Print each (key, value) pair of the result mapping on its own line.
for row in similar_rows.items():
    print(row)

### From File

In [None]:
# Answers for every file-based query, keyed like `queries`. Each lookup uses
# that query's precomputed matrix, the dataset keys and a 0.25 threshold.
queriesAnswers = {
    key: Matcher.get_query_answers(
        tfidf_matrix, queries_matrices[key], dataset_key, 0.25
    )
    for key in queries
}

# Evaluation

In [None]:
# Score the run named "testrun" against "wikir/qrels" for the measures listed
# in the third argument (presumably a qrels path, a run-file path and a
# measure list -- confirm against Evaluater.evaluate).
# NOTE(review): "testrun" is the name written by the "Run File" cell further
# down (FileManager.write_runfile_to_file); on a fresh top-to-bottom run that
# cell has not executed yet -- confirm the intended execution order.
Evaluater.evaluate("wikir/qrels","testrun","[nDCG @ 10, P @ 5, P(rel=2) @ 5, Judged @ 10]")

# Write To Files

## Dataset

In [None]:
# Persist the most recent dataset stage (datasets[-1]) to 'stemmed255.csv'.
# NOTE(review): the load cell reads 'wikir/stemmed4.csv'; this writes a
# different name with no 'wikir/' prefix -- confirm that is intended.
FileManager.write_dataset_to_file('stemmed255.csv',datasets[-1])

## Run File

In [None]:
# Write the file-based queries and their answers to the run file 'testrun',
# the same name the Evaluation cell passes to Evaluater.evaluate.
FileManager.write_runfile_to_file('testrun',queries,queriesAnswers)

## Model

In [None]:
# Persist the document TF-IDF matrix to 'model.pickle'.
# NOTE(review): only `tfidf_matrix` is written here; the `vectorizer` used to
# build it is not saved by this cell, so a reloaded matrix presumably cannot
# be queried without recreating the vectorizer -- confirm.
FileManager.write_model_to_file('model.pickle',tfidf_matrix)

# Disabled round-trip check: reload the saved model and display it.
# ddf = FileManager.load_model_from_file("model.pickle")

# ddf