# Import Libraries

In [None]:
import FileManager
import WordCleaner
import Indexer
import Matcher
import Evaluater
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
%load_ext autoreload
%autoreload 2

# Dataset Manipulation 

## Load Files

In [None]:
# Single-file load: create the shared TF-IDF vectorizer and read one CSV.
# `datasets` is a version history: later cells read datasets[-1] and append
# each newly processed version to it.
vectorizer = TfidfVectorizer()
datasets = [FileManager.csv_to_dict("wikir/stemmed4.csv")]
dataset = datasets[0]

### The Ultimate Loader

In [None]:
# Multi-file load: merge wikir/stemmed0.csv .. wikir/stemmed3.csv into one
# dict and start the `datasets` version history with it.
# If two files share a key, the entry from the later file wins.
vectorizer = TfidfVectorizer()

dataset = {}
for shard in range(4):
    dataset |= FileManager.csv_to_dict(f"wikir/stemmed{shard}.csv")
datasets = [dataset]

## Remove stop words

In [None]:
# Run stop-word removal over every entry of the newest dataset version and
# push the result as the next version.
dataset = datasets[-1]
filtered_dataset = {
    doc_id: WordCleaner.remove_stop_words(tokens)
    for doc_id, tokens in dataset.items()
}
datasets.append(filtered_dataset)

## Stem

In [None]:
# Stem every entry of the newest dataset version with the "Snowball" option
# and push the result as the next version.
dataset = datasets[-1]
stemmed_dataset = {
    doc_id: WordCleaner.stem(tokens, "Snowball")
    for doc_id, tokens in dataset.items()
}
datasets.append(stemmed_dataset)

## Lemmatize

In [None]:
# Lemmatize every entry of the newest dataset version and push the result
# as the next version.
dataset = datasets[-1]
lemmad_dataset = {
    doc_id: WordCleaner.lemmatize(tokens)
    for doc_id, tokens in dataset.items()
}
datasets.append(lemmad_dataset)

## Synonym Map

In [None]:
dataset = datasets[-1]

# Build the word -> unified-synonym dictionary for the dataset.
# Each distinct word is resolved only once: the lookup used to be repeated
# for every occurrence of every word, although the stored value for a word
# is simply overwritten each time.
# NOTE(review): this assumes get_unified_synonym returns the same result for
# the same word on every call - confirm.
synonym_dict = {}
for words in dataset.values():
    for word in words:
        if word not in synonym_dict:
            synonym_dict[word] = WordCleaner.get_unified_synonym(word)
print(synonym_dict)
# TODO save the dictionary locally because it is needed in query mapping, OR check if this may not be needed

# Update the dataset with alternative words: every word of every document is
# replaced by its entry in synonym_dict, and the result becomes the next
# dataset version.
mapped_dataset = {
    key: [synonym_dict[word] for word in words]
    for key, words in dataset.items()
}
datasets.append(mapped_dataset)
print(mapped_dataset)

## Calculating TF-IDF for the documents

### using Scikit Learn

In [None]:
# Compute the TF-IDF matrix for the newest dataset version with the shared
# vectorizer, and keep the dataset keys as a list in dict order for the
# query-answer cells.
tfidf_matrix = Indexer.calculate_tf_idf(datasets[-1], vectorizer)
dataset_key = list(datasets[-1])

# Optional dense DataFrame view of the matrix (disabled by default):
# import pandas as pd
# tfidf_matrix
# df = pd.DataFrame(
#         tfidf_matrix.toarray(),
#         columns=vectorizer.get_feature_names_out(),
#         index=datasets[-1].keys(),
#     )

# df

## Calculate Cosine Similarity

In [None]:
# TODO should we delete this cell?
# Exploratory check: compare the document at row position 8 against the
# whole dataset.
#
# `df` is built here because its only other definition in the notebook is
# commented out, so this cell used to fail with NameError.
# NOTE(review): toarray() materialises the full dense matrix - run this cell
# on a small dataset only.
import pandas as pd

df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=list(datasets[-1].keys()),
)
related_docs = Matcher.calculate_cos_similarity(df, df.iloc[8])

# Print the sorted related documents
# NOTE(review): the 0.5 threshold in the label is presumably applied inside
# Matcher.calculate_cos_similarity - confirm.
print("Related Docs (similarity > 0.5):")
for doc, sim in related_docs:
    print(f"Doc {doc}: Similarity = {sim:.4f}")

# Query Manipulation 

## Enter query

### Manual Query

In [None]:
# Manual query: tokenize the text, then apply stop-word removal, Snowball
# stemming and synonym unification, echoing the token list after each step.
query = word_tokenize("hello sister where is your hijab")
print(query)
query = WordCleaner.remove_stop_words(query)
print(query)
query = WordCleaner.stem(query, 'Snowball')
print(query)
# Lemmatization is currently disabled for the manual query.
# query = WordCleaner.lemmatize(query)
# print(query)
query = list(map(WordCleaner.get_unified_synonym, query))
print(query)

### From File

In [None]:
# Load the query set from wikir/queries.csv into a dict.
# Later cells stem each value and join it with spaces, so each value is
# expected to be a sequence of word strings.
queries = FileManager.csv_to_dict("wikir/queries.csv")

In [None]:
# Preprocess the file queries in place. Only Snowball stemming is active;
# synonym mapping and lemmatization are kept below as disabled snippets.
# TODO get unified synonym needs testing
# for key, words in queries.items():
#     queries[key] = [synonym_dict[word] for word in words]
for key, tokens in queries.items():
    # Replacing the value of an existing key is safe while iterating.
    queries[key] = WordCleaner.stem(tokens, "Snowball")
# for key in queries.keys():
#     queries[key] = WordCleaner.lemmatize(queries[key])

## Calculate TF-IDF

In [None]:
# Join the manual query tokens into one string and compute its TF-IDF
# representation with the shared vectorizer.
query_text = " ".join(query)
qdf = Indexer.calculate_doc_tf_idf([query_text], vectorizer)
qdf

### From File

In [None]:
# Compute one TF-IDF representation per file query, keyed like `queries`.
queries_matrices = {
    query_id: Indexer.calculate_doc_tf_idf([" ".join(tokens)], vectorizer)
    for query_id, tokens in queries.items()
}

## Calculate Cosine Similarity

In [None]:
# Match the manual query against the document matrix (last argument 0.25 is
# passed through to Matcher.get_query_answers) and print each
# (key, value) pair of the result.
similar_rows = Matcher.get_query_answers(tfidf_matrix, qdf, dataset_key, 0.25)

for answer in similar_rows.items():
    print(answer)

### From File

In [None]:
# Match every file query against the document matrix, keyed like `queries`.
# The result is consumed by the run-file writer cell.
queriesAnswers = {
    query_id: Matcher.get_query_answers(
        tfidf_matrix, queries_matrices[query_id], dataset_key, 0.25
    )
    for query_id in queries
}

# Evaluation

In [None]:
# Evaluate the run file "testrun" against "wikir/qrels" with the metric list
# given as a single string.
# NOTE: "testrun" is written by the "Run File" cell further down
# (FileManager.write_runfile_to_file), so run that cell before this one.
Evaluater.evaluate("wikir/qrels","testrun","[nDCG @ 10, P @ 5, P(rel=2) @ 5, Judged @ 10]")

# Write To Files

## Dataset

In [None]:
# Persist the newest dataset version to "stemmed255.csv".
# NOTE(review): the loader cells read from "wikir/stemmed*.csv"; confirm this
# file name and location are the intended ones.
FileManager.write_dataset_to_file("stemmed255.csv",datasets[-1])

## Run File

In [None]:
# Write the answers of the file queries to the run file "testrun", which is
# the file name the Evaluation cell passes to Evaluater.evaluate.
FileManager.write_runfile_to_file("testrun",queries,queriesAnswers)

## Model

### Write

In [None]:
# Save the document TF-IDF matrix to "model.npz".
FileManager.write_model_to_file("model.npz",tfidf_matrix)

### Read

In [None]:
# Load the matrix back from "model.npz" and display it as the cell output.
ddf = FileManager.load_model_from_file("model.npz")
ddf

## Word Embedding

In [None]:
# %pip install gensim

In [None]:
from gensim.models import Word2Vec

dataset = datasets[-1]
# Convert to a list of tokenized documents
tokenized_documents = list(dataset.values())

# Create the model WITHOUT a corpus. Passing `sentences=` to the constructor
# already builds the vocabulary and trains the model, so the explicit
# train() call below used to train a second time on top of that.
model = Word2Vec(
    vector_size=100,  # Dimensionality of the word vectors
    window=5,         # Maximum distance between the current and predicted word within a sentence
    sg=1,             # Skip-Gram model (1 for Skip-Gram, 0 for CBOW)
    min_count=1,      # Ignores all words with a total frequency lower than this
    workers=4,        # Number of worker threads used for training
)

# print(tokenized_documents[5:10])

# The vocabulary has to exist before train() can be called.
model.build_vocab(tokenized_documents)

# Train the model (single pass of 30 epochs over the corpus)
model.train(tokenized_documents, total_examples=len(tokenized_documents), epochs=30)

# Save the model and load it back as a round-trip check.
model.save("embedding.model")
loaded_model = Word2Vec.load("embedding.model")

In [None]:
# Inspect the trained embedding: vocabulary size and content, two pairwise
# similarities, and the nearest neighbours of 'war'.
words = model.wv.index_to_key
print(len(words))
print(words)


# print(model.wv['malaysia'])
# print(model.wv.similarity('1st', 'First'))

for left, right in (('good', 'best'), ('good', 'malaysia')):
    print(model.wv.similarity(left, right))
print(model.wv.most_similar('war'))

In [None]:
# numpy is imported here because no earlier cell imports it; the cell used to
# fail with NameError when the notebook was run top to bottom.
import numpy as np

# Represent every document by the average of its in-vocabulary word vectors.
# The list stays index-aligned with tokenized_documents.
document_vectors = []
for doc in tokenized_documents:
    word_vectors = [model.wv[word] for word in doc if word in model.wv]
    if word_vectors:
        doc_vector = np.mean(word_vectors, axis=0)
    else:
        # np.mean([]) is a NaN scalar, which would make the list ragged and
        # break cosine_similarity later; use a zero vector instead.
        doc_vector = np.zeros(model.wv.vector_size, dtype=np.float32)
    document_vectors.append(doc_vector)
# print(doc_vector)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Build the query vector as the average of the vectors of the query words
# that exist in the model vocabulary.
query_words = ["machine"]
known_vectors = [model.wv[word] for word in query_words if word in model.wv]

if known_vectors:
    query_vector = np.mean(known_vectors, axis=0)

    # Compute cosine similarity between query and document vectors
    similarity_scores = cosine_similarity([query_vector], document_vectors)

    # Rank documents based on similarity scores (highest first)
    sorted_docs = sorted(enumerate(similarity_scores[0]), key=lambda x: x[1], reverse=True)
else:
    # With no known word np.mean([]) would be NaN and cosine_similarity
    # would raise; report it and rank nothing instead.
    print(f"None of the query words {query_words} are in the model vocabulary.")
    sorted_docs = []

# Print the ranked documents
# NOTE(review): doc_id is the position in tokenized_documents; the "+ 2"
# offset presumably maps it to a CSV line number - confirm.
for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
    print(f"Rank {rank}: Document {doc_id + 2} (Similarity Score = {score:.4f})")


In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def display_pca_scatterplot(model, words=None, sample=0):
    """Scatter-plot word vectors projected onto their first two principal components.

    Args:
        model: Object exposing gensim-style keyed vectors as ``model.wv``
            (``wv.index_to_key``, ``wv[word]`` and ``word in wv`` are used).
        words: Words to plot. When ``None`` the model vocabulary is used:
            ``sample`` distinct random words if ``sample > 0``, otherwise
            the whole vocabulary.
        sample: Number of vocabulary words to draw when ``words`` is ``None``.

    Words that are not in the vocabulary are skipped and reported on stdout.
    When fewer than two words (or vector dimensions) are left, a message is
    printed instead of a plot.
    """
    if words is None:
        vocabulary = list(model.wv.index_to_key)
        if sample > 0:
            # Draw without replacement so no word is plotted twice; cap the
            # request at the vocabulary size.
            count = min(sample, len(vocabulary))
            words = np.random.choice(vocabulary, count, replace=False).tolist()
        else:
            words = vocabulary

    # Split the request into plottable and unknown words in one pass.
    known, missing = [], []
    for word in words:
        (known if word in model.wv else missing).append(word)
    if missing:
        print(f"Skipping words not in the vocabulary: {missing}")

    word_vectors = np.array([model.wv[word] for word in known])

    # PCA to two components needs at least two samples and two features.
    # An empty selection gives a 1-D array, hence the ndim check.
    if word_vectors.ndim != 2 or min(word_vectors.shape) < 2:
        print("Insufficient data for PCA visualization.")
        return

    # Only two components are plotted, so only two are fitted.
    twodim = PCA(n_components=2).fit_transform(word_vectors)
    plt.figure(figsize=(6, 6))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(known, twodim):
        # Small offset so the label does not sit on top of its marker.
        plt.text(x + 0.05, y + 0.05, word)
    plt.show()

# Example usage: plot four hand-picked words with the trained Word2Vec model.
# model.wv[word] is looked up for each of them, so they are expected to be in
# the model vocabulary.
display_pca_scatterplot(model, ['machine','yoga', 'war', 'good'])


## Tests