# Import Libraries

In [None]:
from python import FileManager
from python import WordCleaner
from python import Indexer
from python import Matcher
from python import Evaluater
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import torch
import numpy as np
%load_ext autoreload
%autoreload 2

# Dataset Manipulation 

## Load Files

In [None]:
# Read the raw corpus from one CSV file and start the version history
# (`datasets`) with it; every later preprocessing cell appends a new version.
dataset = FileManager.csv_to_dict("./wikir/csv/wikir.csv")
datasets = [dataset]
dataset_keys = list(dataset.keys())

# Fresh, unfitted vectorizer shared by the indexing cells further down.
vectorizer = TfidfVectorizer()

### The Ultimate Loader

In [None]:
vectorizer = TfidfVectorizer()

# Merge the four RL<i>.csv parts into a single mapping; on duplicate keys the
# entry from the later file wins.
dataset = {}
for part in tqdm(range(4)):
    dataset |= FileManager.csv_to_dict(f"wikir/csv/RL{part}.csv")
datasets = [dataset]

## Remove stop words

In [None]:
# Run WordCleaner.remove_stop_words over every entry of the latest dataset
# version and push the result as a new version.
dataset = datasets[-1]
filtered_dataset = {
    doc_id: WordCleaner.remove_stop_words(words)
    for doc_id, words in dataset.items()
}
datasets.append(filtered_dataset)

In [None]:
# Run WordCleaner.process_capital_punctuation over every entry of the latest
# dataset version and push the result as a new version.
dataset = datasets[-1]
processed_dataset = {
    doc_id: WordCleaner.process_capital_punctuation(words)
    for doc_id, words in dataset.items()
}
datasets.append(processed_dataset)

## Stem

In [None]:
# Stem every entry of the latest dataset version with the "Snowball" option
# of WordCleaner.stem and push the result as a new version.
dataset = datasets[-1]
stemmed_dataset = {
    doc_id: WordCleaner.stem(words, "Snowball")
    for doc_id, words in dataset.items()
}
datasets.append(stemmed_dataset)

## Lemmatize

In [None]:
# Run WordCleaner.lemmatize over every entry of the latest dataset version
# and push the result as a new version.
dataset = datasets[-1]
lemmad_dataset = {
    doc_id: WordCleaner.lemmatize(words)
    for doc_id, words in dataset.items()
}
datasets.append(lemmad_dataset)

## Synonym Map

In [None]:
# Run WordCleaner.remove_single_letters over every entry of the latest
# dataset version (with a progress bar) and push the result as a new version.
dataset = datasets[-1]
no_singles_dataset = {
    doc_id: WordCleaner.remove_single_letters(words)
    for doc_id, words in tqdm(dataset.items())
}
datasets.append(no_singles_dataset)

In [None]:
from multiprocessing import Pool
from tqdm import tqdm

dataset = datasets[-1]
mapped_2 = {}

# One worker pool is shared by all rows. Rows are visited sequentially
# (tqdm tracks them); the words inside a row are mapped through
# WordCleaner.get_unified_synonym_2 in parallel.
with Pool() as workers:
    for doc_id, words in tqdm(dataset.items()):
        mapped_2[doc_id] = workers.map(WordCleaner.get_unified_synonym_2, words)
datasets.append(mapped_2)

## Calculating TF-IDF for the documents

### Using scikit-learn

In [None]:
# Build the TF-IDF matrix from the latest dataset version using the shared
# vectorizer.
tfidf_matrix = Indexer.calculate_tf_idf(datasets[-1], vectorizer)
# Document ids in dataset order; presumably these line up with the rows of
# tfidf_matrix -- TODO confirm against Indexer.calculate_tf_idf.
dataset_keys = list(datasets[-1].keys())

#### LSA

In [None]:
# Truncated SVD with 6 components and the ARPACK solver; the same `svd`
# object is reused later for the manual query and the topic-term listing.
svd = TruncatedSVD(n_components=6, algorithm="arpack")
# Reduce the TF-IDF matrix with that SVD (presumably fits `svd` as a side
# effect, since later cells read svd.components_ -- TODO confirm).
lsa_matrix = Indexer.calculate_lsa(tfidf_matrix,svd)

In [None]:
import pandas as pd
terms = vectorizer.get_feature_names_out()

# For every SVD component keep the ten highest-weighted vocabulary terms and
# store them as one space-separated string keyed by the component index.
topics_dict = {}
for i, comp in enumerate(tqdm(svd.components_)):
    ranked = sorted(zip(terms, comp), key=lambda pair: pair[1], reverse=True)
    top_terms = ranked[:10]
    # An empty selection is stored unchanged (as an empty list).
    topics_dict[i] = " ".join(term for term, _ in top_terms) if top_terms else top_terms

# df = pd.DataFrame(topics_dict)
# df

In [None]:
# import umap
# import matplotlib.pyplot as plt
import pandas as pd

# embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(lsa_matrix)


from sklearn.decomposition import PCA

# Project the LSA document vectors onto their first two principal components
# so they can be drawn on a 2-D scatter plot.
pca = PCA(2)
pca.fit(lsa_matrix)

# Transform once and reuse the array for the DataFrame view; the projection
# was previously computed twice.
pca_matrix = pca.transform(lsa_matrix)
pca_data = pd.DataFrame(pca_matrix)
# print(pca_data.head())


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np

# Cluster the 2-D PCA projection into six groups (no scaling is applied here).
model = KMeans(n_clusters=6).fit(pca_matrix)
cluster_ids = model.labels_.astype(float)

# One point per document, coloured by its cluster id.
fig, ax = plt.subplots(figsize=(28, 20))
scatter = ax.scatter(
    pca_matrix[:, 0],
    pca_matrix[:, 1],
    c=cluster_ids,
    s=20,  # marker size
    edgecolor='none',
)

# Legend with one entry per distinct colour in the scatter.
legend1 = ax.legend(
    *scatter.legend_elements(),
    loc="lower left",
    title="Topics",
)
ax.add_artist(legend1)

# Replace the numeric legend labels with the topic term strings, pairing them
# by position.
# NOTE(review): K-Means cluster ids and SVD component indices are numbered
# independently, so legend entry i is not guaranteed to describe cluster i --
# confirm this pairing is intended.
for legend_text, topic_terms in zip(legend1.texts, topics_dict.values()):
    legend_text.set_text(topic_terms)

plt.show()

In [None]:
import numpy as np
# Persist the LSA matrix to disk; np.save appends the ".npy" extension when the
# given file name does not already end with it.
np.save("lsa_matrixmapnew",lsa_matrix)
# embedding = np.load("embedding.np.npy")

# Query Manipulation 

## Manual Query

In [None]:
# Free-text query used for a manual retrieval check against the LSA index.
query = "she co founded the phillips collection with her husband duncan phillips she was born marjorie acker in bourbon indiana she was the sister to six other siblings her parents were charles ernest acker and alice beal she was raised in ossining new york phillips started drawing as a child her uncles were reynolds beal and gifford beal both men noticed phillips artistic ability and suggested she pursue art as a career path she began attending the art students league in 1915 and graduated in 1918 she studied under boardman robinson marjorie phillips has the unmistakable style of the born painter duncan phillips phillips is quoted as stating that she didn t want to paint depressing pictures she painted primarily landscapes and still life works despite living a socialite lifestyle alongside her husband phillips made the effort to paint every morning in her washington d c studio she attended an art exhibition for duncan phillips at the century association in january 1921 she met duncan and the two married in october of that year duncan was an art collector and the couple expanded their collecting phillips moved to washington d c and into duncan s dupont circle mansion duncan s mother"
# Split into word tokens, then apply the enabled cleaning steps in order.
# NOTE(review): the steps here should mirror whichever dataset preprocessing
# cells were actually run, otherwise query and index vocabularies diverge.
query = word_tokenize(query)
query = WordCleaner.remove_stop_words(query)
# query = WordCleaner.stem(query, 'Snowball')
query = WordCleaner.lemmatize(query)
# query = [WordCleaner.get_unified_synonym(word) for word in query]
print(query)

### Calculate TF-IDF

In [None]:
# Re-join the cleaned tokens into one string and vectorize it as a
# single-document list with the shared vectorizer.
matrix = Indexer.calculate_doc_tf_idf([" ".join(query)],vectorizer)

In [None]:
# Pass the query's TF-IDF vector through the same `svd` object that produced
# lsa_matrix, so the query can be compared against it.
matrix = Indexer.calculate_doc_lsa(matrix,svd)


### Calculate Cosine Similarity

In [None]:
# Match the query against the LSA matrix; 0.9 is the threshold argument
# passed to Matcher.get_query_answers.
similar_rows = Matcher.get_query_answers(lsa_matrix, matrix, dataset_keys, 0.9)

# Print each (key, value) pair of the result on its own line.
for match in similar_rows.items():
    print(match)

## Evaluation Queries

In [None]:
# Load the evaluation queries from the testing split.
queries = FileManager.csv_to_dict("wikir/testing/queries.csv")

### Lotte queries loader

In [None]:
# Alternative loader for a tab-delimited query file; running this cell
# overwrites `queries` from the cell above.
queries = FileManager.csv_to_dict("wikir/queries.csv",delimiter="\t")

### Text Processing

In [None]:
from multiprocessing import Pool

# Clean every evaluation query with the enabled word-level steps, applied in
# this order: stop-word removal -> lemmatisation -> single-letter removal.
# Optional steps that are currently switched off:
#     queries[key] = [WordCleaner.get_unified_synonym(word) for word in queries[key]]
#     queries[key] = WordCleaner.process_capital_punctuation(queries[key])
#     queries[key] = WordCleaner.stem(queries[key], "Snowball")
#     with Pool() as p:
#         for row in tqdm(queries):
#             queries[row] = p.map(WordCleaner.get_unified_synonym_2, queries[row])
for query_id, words in queries.items():
    words = WordCleaner.remove_stop_words(words)
    words = WordCleaner.lemmatize(words)
    queries[query_id] = WordCleaner.remove_single_letters(words)

### Calculate TF-IDF

In [None]:
# Vectorize each cleaned query (tokens re-joined into one string) with the
# shared vectorizer, keyed by query id.
queries_matrices = {
    query_id: Indexer.calculate_doc_tf_idf([" ".join(words)], vectorizer)
    for query_id, words in tqdm(queries.items())
}

### Calculate Cosine Similarity

In [None]:
# Match every query vector against the TF-IDF matrix; 0.35 is the threshold
# argument passed to Matcher.get_query_answers.
queriesAnswers = {
    query_id: Matcher.get_query_answers(tfidf_matrix, query_matrix, dataset_keys, 0.35)
    for query_id, query_matrix in tqdm(queries_matrices.items())
}

In [None]:
# Same computation as the cell directly above (kept as a separate cell):
# rebuild `queriesAnswers` by matching every query vector against the TF-IDF
# matrix with a threshold argument of 0.35.
queriesAnswers = {
    query_id: Matcher.get_query_answers(tfidf_matrix, query_matrix, dataset_keys, 0.35)
    for query_id, query_matrix in tqdm(queries_matrices.items())
}

# Evaluation

In [None]:
# Evaluate the run file "TwikirNRML35.run" against the testing qrels.
# The run file must already exist on disk; no cell in this notebook writes
# this exact name -- presumably produced by an earlier session.
Evaluater.evaluate("wikir/testing/qrels","TwikirNRML35.run")

In [None]:
# Evaluate the run file "TwikirRMLN35.run" against the testing qrels; this is
# the file name used by the "Run File" writer cell in this notebook.
Evaluater.evaluate("wikir/testing/qrels","TwikirRMLN35.run")


In [None]:
# Evaluate the run file "TwikirRML35.run" against the testing qrels.
# The run file must already exist on disk.
Evaluater.evaluate("wikir/testing/qrels","TwikirRML35.run")


# Write To Files

## Dataset

In [None]:
# Persist the most recent dataset version so the preprocessing cells do not
# have to be re-run.
FileManager.write_dataset_to_file("wikirRML.csv",datasets[-1])

## Run File

In [None]:
# Write the retrieval results for all queries to a run file that
# Evaluater.evaluate can read back.
# NOTE(review): the meaning of max_relevance=2 is defined in FileManager --
# confirm there before changing it.
FileManager.write_runfile_to_file("TwikirRMLN35.run",queries,queriesAnswers,max_relevance=2)

## Model

### Write

In [None]:
# Save the fitted vectorizer, the document ids and the TF-IDF matrix together
# under the name "wikir_RMLN".
FileManager.write_model_to_drive("wikir_RMLN",vectorizer, dataset_keys, tfidf_matrix)

### Read

In [None]:
# Restore the three objects saved under "wikir_RMLN"; this overwrites the
# in-memory vectorizer, dataset_keys and tfidf_matrix.
vectorizer, dataset_keys, tfidf_matrix = FileManager.load_model_from_drive("wikir_RMLN")

# Test: BERT Embeddings

In [None]:
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

# Load the BERT weights and the matching tokenizer from the local ./model
# directory. Note that `model` rebinds the name used for the K-Means model in
# the plotting cell.
model = BertModel.from_pretrained('./model',ignore_mismatched_sizes=True)
# NOTE(review): ignore_mismatched_sizes is a model-loading option; passing it
# to the tokenizer presumably has no effect -- confirm and drop if so.
tokenizer = BertTokenizer.from_pretrained('./model',ignore_mismatched_sizes=True)
# Documents to embed: the most recent dataset version.
dataset = datasets[-1]

In [None]:
# Reload the evaluation queries from disk for the BERT experiment; this
# replaces any cleaned `queries` from the TF-IDF section.
queries = FileManager.csv_to_dict("wikir/testing/queries.csv")

In [None]:
# BERT's position embeddings cover a fixed number of tokens; anything longer
# makes the forward pass fail with an index error. Cap every input at the
# limit recorded in the loaded model's config.
max_tokens = model.config.max_position_embeddings

# Re-join each word list into one string and encode it as PyTorch tensors,
# keyed by document id.
tokenized_documents = {
    doc_id: tokenizer(' '.join(words), return_tensors='pt', truncation=True, max_length=max_tokens)
    for doc_id, words in tqdm(dataset.items())
}

# Same encoding for the queries, keyed by query id.
tokenized_queries = {
    query_id: tokenizer(' '.join(words), return_tensors='pt', truncation=True, max_length=max_tokens)
    for query_id, words in queries.items()
}

In [None]:
# Embed every query as the hidden state of its first token: [0] selects the
# model's first output, the next [0] the only sequence in the batch, and the
# last [0] the first token position.
# Inference only, so autograd is disabled; otherwise every stored vector
# keeps its whole computation graph alive.
with torch.no_grad():
    query_vectors = {
        query_id: model(**words)[0][0][0]
        for query_id, words in tokenized_queries.items()
    }

In [None]:
# Embed every document as the hidden state of its first token (first model
# output, first sequence in the batch, first token position).
# Inference only, so autograd is disabled; with it enabled each stored vector
# retains its full computation graph, which exhausts memory on a large corpus.
with torch.no_grad():
    document_vectors = {
        doc_id: model(**words)[0][0][0]
        for doc_id, words in tqdm(tokenized_documents.items())
    }

In [None]:
# Spot check: embed the single document with id "1781133" and display the
# resulting vector as the cell output.
document_vector = {'1781133': model(**tokenized_documents["1781133"])[0][0][0]}
document_vector

In [None]:
import torch

# Prefer the GPU, but fall back to the CPU so the cell still runs on machines
# without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

document_vectors = {}
# Documents are embedded one at a time, so this only sets how often progress
# is reported and the CUDA cache is released; it does not affect peak VRAM.
batch_size = 512

doc_ids = list(tokenized_documents.keys())
doc_batches = [doc_ids[i:i + batch_size] for i in range(0, len(doc_ids), batch_size)]

for i, batch in enumerate(doc_batches):
    print(f"Processing batch {i} out of {len(doc_batches)}")
    with torch.no_grad():
        for doc_id in batch:
            # Copy the input tensors to the device instead of calling
            # BatchEncoding.to(), which rewrites the stored encoding in place
            # and would leave every tokenized document resident on the GPU.
            inputs = {
                name: tensor.to(device)
                for name, tensor in tokenized_documents[doc_id].items()
            }
            # Keep only the first-token vector and move it to host memory so
            # results do not accumulate in VRAM.
            document_vectors[doc_id] = model(**inputs)[0][0][0].cpu()
    # Releasing cached blocks once per batch is enough; doing it per document
    # only slows the loop down.
    if device.type == 'cuda':
        torch.cuda.empty_cache()

In [None]:
# Turn every embedding tensor into a (1, n) NumPy row, keyed as before.
document_vectors_np = {}
for doc_id, doc_vector in document_vectors.items():
    document_vectors_np[doc_id] = doc_vector.detach().cpu().numpy().reshape(1, -1)

query_vectors_np = {}
for query_id, query_vector in query_vectors.items():
    query_vectors_np[query_id] = query_vector.detach().cpu().numpy().reshape(1, -1)

# Stack the rows in dictionary order: one matrix row per document / query.
corpus_matrix = np.vstack(list(document_vectors_np.values()))
query_matrix = np.vstack(list(query_vectors_np.values()))

In [None]:
# Match each query embedding against the document embedding matrix; 0.85 is
# the threshold argument passed to Matcher.get_query_answers.
# NOTE(review): dataset_keys is assumed to list the document ids in the same
# order as the rows of corpus_matrix -- confirm, especially after reloading a
# saved model.
queries_answers = {
    query_id: Matcher.get_query_answers(corpus_matrix, query_vectors_np[query_id], dataset_keys, 0.85)
    for query_id in queries
}

In [None]:
# Write the BERT retrieval results for all queries to the run file
# 'bert85.run', which the evaluation cell below reads back.
FileManager.write_runfile_to_file('bert85.run', queries, queries_answers)

In [None]:
# Evaluate the BERT run file written by the cell above against the testing
# qrels.
Evaluater.evaluate("wikir/testing/qrels","bert85.run")

In [None]:
# Manual Query
query_words = ["yanni"]

# Encode the query the same way as the documents and evaluation queries:
# calling the tokenizer directly adds the special tokens, so position 0 is the
# same leading token whose vector is used for the documents. The previous
# tokenize() + convert_tokens_to_ids() path omitted the special tokens, which
# made position 0 the first word piece and the vectors not comparable.
encoded_query = tokenizer(' '.join(query_words), return_tensors='pt')
encoded_query = {name: tensor.to(device) for name, tensor in encoded_query.items()}

# Inference only: no autograd graph is needed for the query vector.
with torch.no_grad():
    query_vector = model(**encoded_query)[0][0][0]

# Move the vector to the CPU and reshape it into a (1, n) NumPy row
query_matrix = query_vector.detach().cpu().numpy().reshape(1, -1)

# Match the query against the document embeddings; 0.55 is the threshold
# argument passed to Matcher.get_query_answers_optimized.
similar_docs = Matcher.get_query_answers_optimized(corpus_matrix, query_matrix, dataset_keys, 0.55)

# Print the first 10 entries of the result in the order the matcher returned
# them (presumably ranked by similarity -- TODO confirm in Matcher).
for i, (doc_id, score) in enumerate(list(similar_docs.items())[:10]):
    print(f"Rank {i+1}, Document ID: {doc_id}, Similarity Score: {score}")
