# Import Libraries

In [None]:
from python import FileManager
from python import WordCleaner
from python import Indexer
from python import Matcher
from python import Evaluater
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
%load_ext autoreload
%autoreload 2

# Dataset Manipulation 

## Load Files

In [None]:
# Quick-start loader: read the single file "testing.csv" and use it as the
# first entry of `datasets`, the list that later cells append stages to.
vectorizer = TfidfVectorizer()
datasets = [FileManager.csv_to_dict("testing.csv")]
dataset = datasets[0]

### The Ultimate Loader

In [None]:
# Full loader: merge the shards wikir/stemmed0.csv .. wikir/stemmed3.csv into
# one dict. If a key appears in more than one shard, the later shard wins.
vectorizer = TfidfVectorizer()

dataset = {}
for shard in range(4):
    dataset.update(FileManager.csv_to_dict(f"wikir/stemmed{shard}.csv"))
# Start the stage history over with the merged corpus as its only entry.
datasets = [dataset]

## Remove stop words

In [None]:
# Apply WordCleaner.remove_stop_words to every entry of the latest stage and
# append the result as a new stage.
dataset = datasets[-1]
filtered_dataset = {}
for doc_id, doc in dataset.items():
    # Progress trace: echo each key that is a multiple of 100.
    # Keys must be convertible with int().
    if not int(doc_id) % 100:
        print(doc_id)
    filtered_dataset[doc_id] = WordCleaner.remove_stop_words(doc)
datasets.append(filtered_dataset)

In [None]:
# Apply WordCleaner.process_capital_punctuation to every entry of the latest
# stage and append the result as a new stage.
dataset = datasets[-1]
processed_dataset = {
    doc_id: WordCleaner.process_capital_punctuation(doc)
    for doc_id, doc in dataset.items()
}
datasets.append(processed_dataset)

## Stem

In [None]:
# Stem every entry of the latest stage with the "Snowball" option of
# WordCleaner.stem and append the result as a new stage.
dataset = datasets[-1]
stemmed_dataset = {
    doc_id: WordCleaner.stem(doc, "Snowball")
    for doc_id, doc in dataset.items()
}
datasets.append(stemmed_dataset)

## Lemmatize

In [None]:
# Apply WordCleaner.lemmatize to every entry of the latest stage and append
# the result as a new stage.
dataset = datasets[-1]
lemmad_dataset = {
    doc_id: WordCleaner.lemmatize(doc)
    for doc_id, doc in dataset.items()
}
datasets.append(lemmad_dataset)

## Synonym Map

In [None]:
# Unlike the per-entry stages above, the synonym mapping is one call that
# receives the whole latest stage; its result becomes the new latest stage.
dataset = datasets[-1]
datasets.append(WordCleaner.synonym_map_corpus(dataset))
mapped_dataset = datasets[-1]

## Calculating TF-IDF for the Documents

### Using scikit-learn

In [None]:
# Compute the corpus TF-IDF matrix from the latest stage with the shared
# vectorizer, and keep the corpus keys in iteration order.
# NOTE(review): presumably row i of tfidf_matrix corresponds to
# dataset_key[i] -- confirm against Indexer/Matcher.
corpus = datasets[-1]
tfidf_matrix = Indexer.calculate_tf_idf(corpus, vectorizer)
dataset_key = list(corpus)

# Query Manipulation 

## Manual Query

In [None]:
# Prepare a hand-written query: tokenize, drop stop words, Snowball-stem,
# then replace each word with its unified synonym.
raw_query = "hello sister where is your hijab"
tokens = word_tokenize(raw_query)
query = WordCleaner.stem(WordCleaner.remove_stop_words(tokens), "Snowball")
# Disabled alternative step, kept for reference:
# query = WordCleaner.lemmatize(query)
query = list(map(WordCleaner.get_unified_synonym, query))
print(query)

### Calculate TF-IDF

In [None]:
# Join the processed query words with spaces and pass them, as a one-element
# list, to Indexer.calculate_doc_tf_idf together with the shared vectorizer.
query_text = " ".join(query)
matrix = Indexer.calculate_doc_tf_idf([query_text], vectorizer)

### Calculate Cosine Similarity

In [None]:
# Match the manual query against the corpus matrix and print each
# (key, value) pair of the result on its own line.
# NOTE(review): 0.25 looks like a similarity cut-off -- confirm in Matcher.
similar_rows = Matcher.get_query_answers(tfidf_matrix, matrix, dataset_key, 0.25)

for match in similar_rows.items():
    print(match)

## Evaluation Queries

In [None]:
# Load the evaluation queries from wikir/queries.csv.
queries_path = "wikir/queries.csv"
queries = FileManager.csv_to_dict(queries_path)

In [None]:
# Snowball-stem every evaluation query in place. The commented-out loops are
# disabled alternative steps kept for reference.
# TODO get unified synonym needs testing
# for key in queries.keys():
#     queries[key] = [WordCleaner.get_unified_synonym(word) for word in queries[key]]
for query_id, words in queries.items():
    # Only existing keys are reassigned, so the dict size is unchanged while
    # iterating.
    queries[query_id] = WordCleaner.stem(words, "Snowball")
# for key in queries.keys():
#     queries[key] = WordCleaner.lemmatize(queries[key])

### Calculate TF-IDF

In [None]:
# For each evaluation query, join its words with spaces and pass the text,
# as a one-element list, to Indexer.calculate_doc_tf_idf with the shared
# vectorizer. Results are keyed by query id.
queries_matrices = {
    query_id: Indexer.calculate_doc_tf_idf([" ".join(words)], vectorizer)
    for query_id, words in queries.items()
}

### Calculate Cosine Similarity

In [None]:
# Match every evaluation query against the corpus matrix, using the same
# 0.25 argument as the manual query. Results are keyed by query id.
queriesAnswers = {
    query_id: Matcher.get_query_answers(
        tfidf_matrix, queries_matrices[query_id], dataset_key, 0.25
    )
    for query_id in queries
}

# Evaluation

In [None]:
# Evaluate the run named "testrun" against wikir/qrels with the listed
# measures.
# NOTE(review): "testrun" is the name written by the "Run File" cell further
# down; presumably that cell must be executed before this one -- confirm.
Evaluater.evaluate(
    "wikir/qrels",
    "testrun",
    "[nDCG @ 10, P @ 5, P(rel=2) @ 5, Judged @ 10]",
)

# Write To Files

## Dataset

In [None]:
# Write the latest stage to p.csv.
FileManager.write_dataset_to_file("p.csv", datasets[-1])

## Run File

In [None]:
# Write the evaluation queries and their answers to the run file "testrun".
FileManager.write_runfile_to_file("testrun", queries, queriesAnswers)

## Model

### Write

In [None]:
# Save the corpus TF-IDF matrix to model.npz.
FileManager.write_model_to_file("model.npz", tfidf_matrix)

### Read

In [None]:
# Load the saved model back from model.npz.
ddf = FileManager.load_model_from_file("model.npz")
# The bare expression makes the notebook show the loaded object as the cell
# output.
ddf