# Import Libraries

In [None]:
from python import FileManager
from python import WordCleaner
from python import Indexer
from python import Matcher
from python import Evaluater
from python import Clusterer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import torch
import numpy as np
%load_ext autoreload
%autoreload 2

# Dataset Manipulation 

## Load Files

In [None]:
# Fresh vectorizer; later cells hand it to the Indexer helpers.
vectorizer = TfidfVectorizer()

# Read the wikir corpus from CSV into a dict.
dataset = FileManager.csv_to_dict("wikir/csv/wikir.csv")

# `datasets` accumulates one entry per processing stage; the newest stage is
# always the last element.
datasets = [dataset]
dataset_keys = list(dataset)

### The Ultimate Loader

In [None]:
vectorizer = TfidfVectorizer()

# Merge the shards lotte/lemlot0.csv .. lotte/lemlot3.csv into a single dict.
# On duplicate keys the entry from the later shard replaces the earlier one.
dataset = {}
for shard in range(4):
    dataset |= FileManager.csv_to_dict(f"lotte/lemlot{shard}.csv")

# Start the stage history with the merged corpus.
datasets = [dataset]
dataset_keys = list(dataset)

## Remove stop words

In [None]:
# New stage: run stop-word removal over every entry of the latest stage and
# push the result onto the stage history.
dataset = datasets[-1]
filtered_dataset = {
    doc_id: WordCleaner.remove_stop_words(content)
    for doc_id, content in dataset.items()
}
datasets.append(filtered_dataset)

In [None]:
# New stage: drop single letters from every entry of the latest stage
# (progress shown via tqdm) and push the result onto the stage history.
dataset = datasets[-1]
no_singles_dataset = {
    doc_id: WordCleaner.remove_single_letters(dataset[doc_id])
    for doc_id in tqdm(dataset)
}
datasets.append(no_singles_dataset)

In [None]:
# New stage: capitalization/punctuation processing, stored as joined strings.
# NOTE(review): this cell reads the FIRST stage (datasets[0]), unlike the other
# processing cells which read datasets[-1] -- confirm that skipping the earlier
# stages is intended.
dataset = datasets[0]
processed_dataset = {}
for doc_id in tqdm(dataset):
    pieces = WordCleaner.process_capital_punctuation(dataset[doc_id])
    joined = " ".join(pieces)
    # Results with fewer than two items get a literal " 0" appended.
    # NOTE(review): presumably a guard against (near-)empty documents
    # downstream -- confirm.
    processed_dataset[doc_id] = (joined + ' 0') if len(pieces) < 2 else joined
datasets.append(processed_dataset)

## Stem

In [None]:
# New stage: stem every entry of the latest stage with the "Snowball" option
# and push the result onto the stage history.
dataset = datasets[-1]
stemmed_dataset = {
    doc_id: WordCleaner.stem(content, "Snowball")
    for doc_id, content in dataset.items()
}
datasets.append(stemmed_dataset)

## Lemmatize

In [None]:
# New stage: lemmatize every entry of the latest stage and push the result
# onto the stage history.
dataset = datasets[-1]
lemmad_dataset = {}
for doc_id, content in dataset.items():
    lemmad_dataset[doc_id] = WordCleaner.lemmatize(content)
datasets.append(lemmad_dataset)

## Synonym Map

In [None]:
from multiprocessing import Pool
from tqdm import tqdm

dataset = datasets[-1]
mapped_2 = {}

# One worker pool is shared across all rows; each row's items are mapped
# through get_unified_synonym_2 in parallel while tqdm tracks row progress.
# NOTE(review): Pool.map iterates whatever dataset[row] is -- if a row is a
# plain string this maps per character, not per word; confirm rows are lists.
with Pool() as workers:
    for row, items in tqdm(dataset.items()):
        mapped_2[row] = workers.map(WordCleaner.get_unified_synonym_2, items)
datasets.append(mapped_2)

## Calculating tf-idf for the document

### Using scikit-learn

In [None]:
# Build the corpus matrix from the latest dataset stage with the shared
# vectorizer; the clustering and query-matching cells consume `tfidf_matrix`.
tfidf_matrix = Indexer.calculate_tf_idf(datasets[-1], vectorizer)

## Clustering

In [None]:
# Cluster the corpus matrix and plot the result with per-cluster topics.
# NOTE(review): the second argument (6) is presumably the cluster count -- confirm.
c = Clusterer.Clusterer(tfidf_matrix, 6)
topics = c.getTopics(datasets[-1], keys=dataset_keys)
c.plot(size=(21, 15), topics=topics)

In [None]:
# The `enthought.` namespace prefix is the legacy import path for Mayavi;
# current releases expose the same module as `mayavi.mlab`. Try the current
# path first and fall back to the legacy one so old installs keep working.
try:
    from mayavi import mlab as mylab
except ImportError:
    import enthought.mayavi.mlab as mylab

# Smoke test of the 3D plotting backend: 40 random points, with a fourth
# random array passed as the per-point scalar.
x, y, z, value = np.random.random((4, 40))
mylab.points3d(x, y, z, value)
mylab.show()

# Queries

In [None]:
# Load the wikir test queries from CSV; later cells iterate `queries` by key.
queries = FileManager.csv_to_dict("wikir/testing/queries.csv")

### Lotte queries loader

In [None]:
# Alternative to the wikir loader: read the Lotte queries from a
# tab-separated file. Running this cell replaces `queries`.
queries = FileManager.csv_to_dict("lotte/queries.tsv",delimiter="\t")

### Text Processing

In [None]:
from multiprocessing import Pool

# Pass 1: stop-word removal, then capitalization/punctuation processing.
# Only values of existing keys are reassigned, so iterating while updating is safe.
for query_id, raw_query in queries.items():
    without_stops = WordCleaner.remove_stop_words(raw_query)
    queries[query_id] = WordCleaner.process_capital_punctuation(without_stops)

# Pass 2: lemmatize the output of pass 1.
for query_id in queries:
    queries[query_id] = WordCleaner.lemmatize(queries[query_id])

# Disabled experiments, kept for reference:
#   queries[key] = [WordCleaner.get_unified_synonym(word) for word in queries[key]]
#   with Pool() as p:
#       for row in tqdm(queries):
#           queries[row] = p.map(WordCleaner.get_unified_synonym_2, queries[row])
#   queries[key] = WordCleaner.stem(queries[key], "Snowball")
#   queries[key] = WordCleaner.remove_single_letters(queries[key])

### Calculate TF-IDF

In [None]:
# Vectorize each processed query with the shared vectorizer. The query's
# items are joined into one space-separated string and passed as a
# one-element list.
queries_matrices = {
    query_id: Indexer.calculate_doc_tf_idf([" ".join(terms)], vectorizer)
    for query_id, terms in tqdm(queries.items())
}

### Calculate Cosine Similarity

In [None]:
# Match every query matrix against the corpus matrix.
# NOTE(review): 0.35 is presumably the similarity cut-off -- confirm in Matcher.
queriesAnswers = {}
for query_id, query_matrix in tqdm(queries_matrices.items()):
    queriesAnswers[query_id] = Matcher.get_query_answers(
        tfidf_matrix, query_matrix, dataset_keys, 0.35
    )

# Evaluation

In [None]:
# Evaluate a run file against the wikir relevance judgments.
# NOTE(review): the run file named here ("TwikirRML35.run") is not the one the
# "Run File" cell of this notebook writes ("Tlotte35.run") -- confirm that the
# intended run file exists before executing this cell.
Evaluater.evaluate("wikir/testing/qrels","TwikirRML35.run")

# Write To Files

## Dataset

In [None]:
# Persist the latest dataset stage to "wikirRML.csv".
FileManager.write_dataset_to_file("wikirRML.csv",datasets[-1])

## Run File

In [None]:
# Write the TF-IDF matching results (`queriesAnswers`) for `queries` to a run file.
# NOTE(review): the meaning of max_relevance=2 is defined in FileManager -- confirm there.
FileManager.write_runfile_to_file("Tlotte35.run",queries,queriesAnswers,max_relevance=2)

## Model

### Write

In [None]:
# Save the vectorizer, the dataset keys and the corpus matrix together under
# the name "wikir_RMLN".
FileManager.write_model_to_drive("wikir_RMLN",vectorizer, dataset_keys, tfidf_matrix)

### Read

In [None]:
# Restore a saved model named "lotteLSA". Running this cell replaces the
# current `vectorizer`, `dataset_keys` and `tfidf_matrix`.
vectorizer, dataset_keys, tfidf_matrix = FileManager.load_model_from_drive("lotteLSA")

# Test Sentence-Transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 sentence-embedding model; the cells below call
# model.encode on documents and queries.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Encode every entry of the latest dataset stage with the sentence model.
# The outer batching keeps each encode() call to a bounded slice and gives
# tqdm one tick per slice.
dataset = datasets[-1]
embeddings = []
keys = list(dataset.keys())
values = list(dataset.values())
batch_size = 10000  # Adjust this value based on your memory capacity

for start in tqdm(range(0, len(keys), batch_size)):
    # Only the values are encoded; `embeddings` ends up in the same order as `keys`.
    batch_values = values[start:start + batch_size]
    # extend() appends the batch's embeddings one by one, exactly like the
    # previous explicit append loop.
    embeddings.extend(model.encode(batch_values))

In [None]:
import numpy
# Disabled: build the matrix from the freshly computed `embeddings` and save it.
# corpus_matrix = np.array(embeddings)
# numpy.save("wikir_sent_embed",corpus_matrix)
# Active path: load a previously saved embedding matrix from disk instead.
corpus_matrix = numpy.load("lotte_sent_embed2.npy")

In [None]:
# Replace `dataset_keys` with the keys stored for "lotteLSA"; the matching cell
# below passes them alongside `corpus_matrix`.
# NOTE(review): presumably these keys must be in the same order as the rows of
# the loaded embedding matrix -- confirm.
dataset_keys = FileManager.load_keys("lotteLSA")

In [None]:
# Load the Lotte queries and embed each one with the sentence model.
queries = FileManager.csv_to_dict("lotte/queries1.tsv", delimiter="\t", skip=False)

embeddingsQ = {}
for query_id, terms in tqdm(queries.items()):
    sentence = " ".join(terms)
    # Reshape the encoded vector into a single-row 2D array.
    embeddingsQ[query_id] = model.encode(sentence).reshape(1, -1)

In [None]:
# Match every query embedding against the corpus embedding matrix.
# NOTE(review): 0.5 is presumably the similarity cut-off -- confirm in Matcher.
queries_answers = {
    query_id: Matcher.get_query_answers(
        corpus_matrix, embeddingsQ[query_id], dataset_keys, 0.5
    )
    for query_id in tqdm(queries)
}

In [None]:
# Write the sentence-embedding matching results (`queries_answers`) for
# `queries` to a run file.
FileManager.write_runfile_to_file('2Flottesent50.run', queries, queries_answers)

In [None]:
# Cluster the sentence-embedding matrix and plot it with per-cluster topics.
# NOTE(review): the second argument (6) is presumably the cluster count -- confirm.
c = Clusterer.Clusterer(corpus_matrix, 6)
topics = c.getTopics(datasets[-1], keys=dataset_keys)
c.plot(size=(21, 15), topics=topics)