# Import Libraries

In [None]:
# %pip install -U torch-2.3.0-cp311-cp311-win_amd64.whl
# %pip install -U sentence-transformers

In [None]:
from python import FileManager
from python import WordCleaner
from python import Indexer
from python import Matcher
from python import Evaluater
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
%load_ext autoreload
%autoreload 2

# Dataset Manipulation 

## Load Files

In [None]:
# Load the corpus stored in wikir/RL3.csv and start the preprocessing history
# with it; later cells always read the newest stage via datasets[-1].
dataset = FileManager.csv_to_dict("wikir/RL3.csv")
datasets = [dataset]

# Fresh, unfitted models shared by the indexing and query cells.
vectorizer = TfidfVectorizer()
svd = TruncatedSVD(algorithm="arpack", n_components=100)

### The Ultimate Loader

In [None]:
# Fresh, unfitted models shared by the indexing and query cells.
vectorizer = TfidfVectorizer()
svd = TruncatedSVD(algorithm="arpack", n_components=50)

# Merge the four Lotte shards (lemlot0.csv .. lemlot3.csv) into one mapping.
# On a duplicate key the later shard wins, as with the dict union operator.
dataset = {}
for shard in range(4):
    dataset |= FileManager.csv_to_dict(f"lotte/lemlot{shard}.csv")

# Start the preprocessing history; later cells read the newest stage via datasets[-1].
datasets = [dataset]

## Remove stop words

In [None]:
# Strip stop words from every document of the newest stage and push the
# result as a new stage.
dataset = datasets[-1]
filtered_dataset = {}
for doc_id, doc in dataset.items():
    # Progress indicator: echo every id that is a multiple of 100
    # (ids must be convertible with int()).
    if not int(doc_id) % 100:
        print(doc_id)
    filtered_dataset[doc_id] = WordCleaner.remove_stop_words(doc)
datasets.append(filtered_dataset)

In [None]:
# Run WordCleaner.process_capital_punctuation over every document of the
# newest stage and push the result as a new stage.
dataset = datasets[-1]
processed_dataset = {
    doc_id: WordCleaner.process_capital_punctuation(doc)
    for doc_id, doc in dataset.items()
}
datasets.append(processed_dataset)

## Stem

In [None]:
# Stem every document of the newest stage with the "Snowball" stemmer option
# and push the result as a new stage.
dataset = datasets[-1]
stemmed_dataset = {
    doc_id: WordCleaner.stem(doc, "Snowball")
    for doc_id, doc in dataset.items()
}
datasets.append(stemmed_dataset)

## Lemmatize

In [None]:
# Lemmatize every document of the newest stage and push the result as a new
# stage.
dataset = datasets[-1]
lemmad_dataset = {
    doc_id: WordCleaner.lemmatize(doc)
    for doc_id, doc in dataset.items()
}
datasets.append(lemmad_dataset)

## Synonym Map

In [None]:
# Run the whole newest stage through WordCleaner.synonym_map_corpus (one call
# for the full corpus, not per document) and push the result as a new stage.
# NOTE(review): presumably this maps synonyms onto a single representative
# term, like get_unified_synonym does for queries — confirm in WordCleaner.
dataset = datasets[-1]
mapped_dataset = WordCleaner.synonym_map_corpus(dataset)
datasets.append(mapped_dataset)

## Calculating TF-IDF for the documents

### Using scikit-learn

In [None]:
# Build the corpus TF-IDF matrix from the newest preprocessing stage using the
# shared vectorizer.
tfidf_matrix = Indexer.calculate_tf_idf(datasets[-1], vectorizer)
# Document ids in the dict's iteration order; passed to the Matcher calls
# together with the matrix.
# NOTE(review): assumes the matrix rows follow this same key order — confirm
# in Indexer.calculate_tf_idf.
dataset_keys = list(datasets[-1].keys())

#### LSA

In [None]:
# Re-create the SVD model with 50 components. This replaces whatever `svd`
# was bound to before (e.g. one created by a loader cell or restored by
# load_model_from_drive).
svd = TruncatedSVD(n_components=50, algorithm="arpack")
# Reduce the corpus TF-IDF matrix with that model.
# NOTE(review): presumably this fits `svd` on tfidf_matrix as well as
# transforming it — confirm in Indexer.calculate_lsa.
lsa_matrix = Indexer.calculate_lsa(tfidf_matrix,svd)

# Query Manipulation 

## Manual Query

In [None]:
# Manual query pipeline: tokenize -> drop stop words -> Snowball stem.
# (Lemmatizing instead of stemming is the disabled alternative:
#  query = WordCleaner.lemmatize(query))
query = WordCleaner.stem(
    WordCleaner.remove_stop_words(
        word_tokenize("hello sister where is your hijab")
    ),
    'Snowball',
)
# Replace each remaining term with its unified synonym.
query = list(map(WordCleaner.get_unified_synonym, query))
print(query)

### Calculate TF-IDF

In [None]:
# Vectorize the manual query: the processed terms are re-joined into a single
# string and passed as a one-document list with the shared vectorizer.
# NOTE(review): presumably this transforms with the already-fitted vectorizer
# rather than refitting it — confirm in Indexer.calculate_doc_tf_idf.
matrix = Indexer.calculate_doc_tf_idf([" ".join(query)],vectorizer)

In [None]:
# Project the manual query into LSA space.
# The SVD model works on TF-IDF vectors, so the query is vectorized first and
# that vector (not the raw joined text) is handed to calculate_doc_lsa. This
# matches how the evaluation queries are projected further down. The TF-IDF
# step is computed inline so the cell gives the same result whether or not
# the TF-IDF cell was run before it, and however often it is re-run.
matrix = Indexer.calculate_doc_lsa(
    Indexer.calculate_doc_tf_idf([" ".join(query)], vectorizer),
    svd,
)


### Calculate Cosine Similarity

In [None]:
# Match the manual query against the LSA-reduced corpus, so `matrix` must be
# the query's LSA projection here.
# NOTE(review): the last argument (0.5) is presumably a similarity cutoff —
# confirm in Matcher.get_query_answers.
similar_rows = Matcher.get_query_answers(
    lsa_matrix,
    matrix,
    dataset_keys,
    0.5,
)

# Print each (key, value) pair of the result on its own line.
for entry in similar_rows.items():
    print(entry)

## Evaluation Queries

In [None]:
# Load the evaluation queries from wikir/queries.csv. The Lotte loader cell
# binds the same `queries` name, so run only the one matching the loaded corpus.
queries = FileManager.csv_to_dict("wikir/queries.csv")

### Lotte queries loader

In [None]:
# Load the Lotte forum questions. The search questions can be merged in by
# re-enabling the dict union in the trailing comment.
queries = FileManager.tsv_to_dict("lotte/questions.forum.tsv") #| FileManager.tsv_to_dict("lotte/questions.search.tsv")

In [None]:
# Convert the Lotte forum QA file (JSONL) into a TSV named "qrels0".
# NOTE(review): the evaluation cells read "lotte/qrels0" — confirm that
# jsonl_to_tsv writes its output into the lotte/ directory.
FileManager.jsonl_to_tsv("lotte/qas.forum.jsonl", "qrels0")

### Text Processing

In [None]:
# Clean every evaluation query in place. Each stage is applied to the whole
# query set before the next stage starts.
#
# TODO get unified synonym needs testing. Disabled synonym variant:
# for key in queries.keys():
#     queries[key] = WordCleaner.remove_stop_words(queries[key])
# for key in queries.keys():
#     queries[key] = [WordCleaner.get_unified_synonym(word) for word in queries[key]]
#
# Snowball stemming is disabled; lemmatization is used instead:
# for key in queries.keys():
#     queries[key] = WordCleaner.stem(queries[key], "Snowball")
for clean in (
    WordCleaner.remove_stop_words,
    WordCleaner.process_capital_punctuation,
    WordCleaner.lemmatize,
):
    for key in queries:
        queries[key] = clean(queries[key])

### Calculate TF-IDF

In [None]:
# TF-IDF vector per evaluation query: the processed terms are re-joined into
# one string and passed as a one-document list with the shared vectorizer.
queries_matrices = {
    key: Indexer.calculate_doc_tf_idf([" ".join(terms)], vectorizer)
    for key, terms in queries.items()
}

In [None]:
# LSA projection per evaluation query, computed from its TF-IDF vector with
# the shared SVD model.
queries_matrices2 = {
    key: Indexer.calculate_doc_lsa(tfidf_vector, svd)
    for key, tfidf_vector in queries_matrices.items()
}

### Calculate Cosine Similarity

In [None]:
# Retrieve answers for every query in TF-IDF space.
# NOTE(review): the last argument (0.9) is presumably a similarity cutoff —
# confirm in Matcher.get_query_answers_optimized.
queriesAnswers = {
    key: Matcher.get_query_answers_optimized(
        tfidf_matrix, queries_matrices[key], dataset_keys, 0.9
    )
    for key in queries
}

In [None]:
# Retrieve answers for every query in LSA space. This rebinds queriesAnswers,
# replacing the result of the TF-IDF variant if that cell was run.
# NOTE(review): the last argument (0.95) is presumably a similarity cutoff —
# confirm in Matcher.get_query_answers_optimized.
queriesAnswers = {
    key: Matcher.get_query_answers_optimized(
        lsa_matrix, queries_matrices2[key], dataset_keys, 0.95
    )
    for key in queries
}

# Evaluation

In [None]:
# Evaluate the run file lotte_rl_SVA3.run against the relevance judgements in
# lotte/qrels0. The run file must exist first; the "Run File" cell writes it.
Evaluater.evaluate("lotte/qrels0","lotte_rl_SVA3.run")

In [None]:
# Evaluate the run file lotte_rl_SVA1.run against lotte/qrels0.
# NOTE(review): no cell in this notebook writes this run file as-is — it is
# presumably left over from an earlier configuration; confirm it exists.
Evaluater.evaluate("lotte/qrels0","lotte_rl_SVA1.run")

In [None]:
# Evaluate the run file lotte_rl_SVA2.run against lotte/qrels0.
# NOTE(review): no cell in this notebook writes this run file as-is — it is
# presumably left over from an earlier configuration; confirm it exists.
Evaluater.evaluate("lotte/qrels0","lotte_rl_SVA2.run")

# Write To Files

## Dataset

In [None]:
# Persist the newest preprocessing stage as lemlot2.csv.
# NOTE(review): the Lotte loader reads lotte/lemlot0.csv .. lemlot3.csv —
# confirm where write_dataset_to_file places this file and that the shard
# number is the intended one before overwriting.
FileManager.write_dataset_to_file("lemlot2.csv",datasets[-1])

## Run File

In [None]:
# Write the current queriesAnswers (from whichever retrieval cell ran last)
# for the loaded queries to the run file lotte_rl_SVA3.run.
FileManager.write_runfile_to_file("lotte_rl_SVA3.run",queries,queriesAnswers)

## Model

### Write

In [None]:
# Save the vectorizer, SVD model, document keys and TF-IDF matrix under the
# model name "wikir".
# NOTE(review): the read cell loads the model named "lotte" — make sure the
# name used here matches the corpus currently in memory before saving.
FileManager.write_model_to_drive("wikir",vectorizer,svd, dataset_keys, tfidf_matrix)

### Read

In [None]:
# Restore the vectorizer, SVD model, document keys and TF-IDF matrix saved
# under the model name "lotte".
# lsa_matrix is not among the restored values, so it has to be recomputed
# before the LSA retrieval cells can be used.
vectorizer, svd, dataset_keys, tfidf_matrix = FileManager.load_model_from_drive("lotte")