# Install Packages

In [None]:
# One-time environment setup. Every line is commented out on purpose:
# uncomment only what is missing from the current environment and run once.
# %pip install transformers
# NOTE(review): this line uses `!pip` while the rest use `%pip`; `%pip` targets the kernel's environment.
# !pip install torch==2.2.2

# %pip install country_converter

# %pip install spacy
# !python -m spacy download en_core_web_sm

# %pip install gensim

# %pip install roman
# NOTE(review): `re` is part of the Python standard library, so this line looks unnecessary.
# %pip install re
# %pip install tqdm
# NOTE(review): local CUDA wheel for the same torch version as the `torch==2.2.2` line above — pick one.
# %pip install "torch-2.2.2+cu121-cp311-cp311-win_amd64.whl"
# %pip install bert-extractive-summarizer
# %pip install geocoder
# %pip install -U sentence-transformers
# %pip install streamlit
# %pip install wordcloud


# Import Libraries

In [None]:
from python import FileManager
from python import WordCleaner
from python import Indexer
from python import Matcher
from python import Evaluater
from python import FileManager
from python import Personalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import numpy as np
import torch
from tqdm import tqdm
from multiprocessing import Pool
from gensim.models import Word2Vec

%load_ext autoreload
%autoreload 2

# Dataset Manipulation 

## Load Files

In [None]:
# Shared TF-IDF vectorizer; the Indexer calls further down receive this instance.
vectorizer = TfidfVectorizer()

# Read the corpus CSV through the project helper.
dataset = FileManager.csv_to_dict(
    "wikir/RRM2RL.csv",
    delimiter=",",
    skip_headers=True,
)

# `datasets` keeps every version of the corpus; the newest one is always last.
datasets = [dataset]
dataset_keys = list(dataset.keys())

## Remove stop words

In [None]:
# Take the newest corpus version and run WordCleaner.remove_stop_words ("spacy" mode)
# on every entry; the result becomes the next corpus version.
dataset = datasets[-1]
filtered_dataset = {
    doc_id: WordCleaner.remove_stop_words(doc_tokens, "spacy")
    for doc_id, doc_tokens in tqdm(dataset.items())
}
datasets.append(filtered_dataset)

## Remove single letters

In [None]:
# Apply WordCleaner.remove_single_letters to every entry of the newest corpus
# version and push the result as a new version.
dataset = datasets[-1]
no_singles_dataset = {
    doc_id: WordCleaner.remove_single_letters(doc_tokens)
    for doc_id, doc_tokens in tqdm(dataset.items())
}
datasets.append(no_singles_dataset)

## Process capital punctuation

In [None]:
# Apply WordCleaner.process_capital_punctuation to every entry of the newest
# corpus version and push the result as a new version.
dataset = datasets[-1]
processed_dataset = {
    doc_id: WordCleaner.process_capital_punctuation(doc_tokens)
    for doc_id, doc_tokens in tqdm(dataset.items())
}
datasets.append(processed_dataset)

## Stem

In [None]:
# Stem every entry of the newest corpus version with the "Snowball" option of
# WordCleaner.stem and push the result as a new version.
dataset = datasets[-1]
stemmed_dataset = {
    doc_id: WordCleaner.stem(doc_tokens, "Snowball")
    for doc_id, doc_tokens in tqdm(dataset.items())
}
datasets.append(stemmed_dataset)

## Lemmatize

In [None]:
# Lemmatize every entry of the newest corpus version via WordCleaner.lemmatize
# and push the result as a new version.
dataset = datasets[-1]
lemmad_dataset = {
    doc_id: WordCleaner.lemmatize(doc_tokens)
    for doc_id, doc_tokens in tqdm(dataset.items())
}
datasets.append(lemmad_dataset)

## Synonym Map

In [None]:
from multiprocessing import Pool

dataset = datasets[-1]
# A single worker pool is reused for all rows; for each row, its elements are
# handed to WordCleaner.get_unified_synonym_2 in parallel via Pool.map.
with Pool() as workers:
    mapped = {
        doc_id: workers.map(WordCleaner.get_unified_synonym_2, dataset[doc_id])
        for doc_id in tqdm(dataset)
    }
datasets.append(mapped)

## Calculating tf-idf for the dataset

In [None]:
# Build the TF-IDF matrix for the newest corpus version with the shared vectorizer.
# presumably this also fits `vectorizer`, since later cells reuse it for queries — TODO confirm
tfidf_matrix = Indexer.calculate_tf_idf(datasets[-1], vectorizer)

## Manual Query

In [None]:
# Free-text query to try against the TF-IDF index.
raw_query = "wikipedia"

# Same cleaning order as the evaluation queries: stop words, single letters,
# synonym unification, spaCy stop words, lemmatization.
query_tokens = word_tokenize(raw_query)
query_tokens = WordCleaner.remove_single_letters(
    WordCleaner.remove_stop_words(query_tokens)
)
# Stemming (WordCleaner.stem with 'Snowball') is deliberately left out here.
query_tokens = [WordCleaner.get_unified_synonym_2(token) for token in query_tokens]
query = WordCleaner.lemmatize(WordCleaner.remove_stop_words(query_tokens, "spacy"))
print(query)

### Calculate TF-IDF and Cosine Similarity

In [None]:
# Vectorize the query as ONE document. The cleaned tokens are joined back into a
# single string, the same way the evaluation queries are vectorized; passing the
# bare token list would treat every token as a separate document.
matrix = Indexer.calculate_doc_tf_idf([" ".join(query)], vectorizer)

# Threshold 0 is passed so the matcher applies no meaningful cut-off here
# (exact threshold semantics live in Matcher.get_query_answers — verify there).
similar_rows = Matcher.get_query_answers(tfidf_matrix, matrix, dataset_keys, 0)

# Sort the items in the dictionary by value (i.e., rating) in descending order
sorted_rows = sorted(similar_rows.items(), key=lambda item: item[1], reverse=True)

for row in sorted_rows:
    print(row)

## Evaluation Queries

In [None]:
# Load the evaluation queries with the project CSV helper.
# NOTE(review): unlike the corpus load, no `delimiter`/`skip_headers` is passed here,
# so the helper's defaults apply — confirm they match the layout of queries.csv.
queries = FileManager.csv_to_dict("wikir/queries.csv")

In [None]:
# Cleaning passes for the evaluation queries, listed in execution order.
# Every pass is applied to all queries before the next pass starts, and each
# pass overwrites the entries of `queries` in place.
query_passes = [
    lambda tokens: WordCleaner.remove_stop_words(tokens),
    # (disabled) WordCleaner.process_capital_punctuation
    lambda tokens: WordCleaner.remove_single_letters(tokens),
    lambda tokens: [WordCleaner.get_unified_synonym_2(word) for word in tokens],
    # (disabled) WordCleaner.stem with "Snowball"
    lambda tokens: WordCleaner.remove_stop_words(tokens, "spacy"),
    lambda tokens: WordCleaner.lemmatize(tokens),
]

for clean in query_passes:
    for key in queries.keys():
        queries[key] = clean(queries[key])

### Calculate TF-IDF

In [None]:
# One TF-IDF matrix per query: the cleaned tokens are joined into a single
# string and vectorized as a one-document batch.
queries_matrices = {
    key: Indexer.calculate_doc_tf_idf([" ".join(tokens)], vectorizer)
    for key, tokens in queries.items()
}

### Calculate Cosine Similarity

In [None]:
# Match every query matrix against the corpus TF-IDF matrix; 0.1 is the
# threshold argument handed to Matcher.get_query_answers.
queriesAnswers = {
    key: Matcher.get_query_answers(tfidf_matrix, queries_matrices[key], dataset_keys, 0.1)
    for key in tqdm(queries.keys())
}

# Evaluation

In [None]:
# Evaluate a previously saved run file against wikir/qrels.
# NOTE(review): no cell in this notebook writes this run file name — it must already exist on disk.
Evaluater.evaluate("wikir/qrels","test runs/testrun_embedding_8_epoch_20_055.run", max_rel=2)

In [None]:
# Evaluate the TF-IDF run file against wikir/qrels.
# NOTE(review): this run file is written by a cell further down ("Run File"), so on a
# fresh top-to-bottom run this evaluates whatever version is already on disk.
Evaluater.evaluate("wikir/qrels","test runs/testrun_wiki_RRM2RL_01.run", max_rel=2)

In [None]:
# Evaluate a saved run file against the qrels under wikir/test.
# NOTE(review): no cell in this notebook writes this run file name — it must already exist on disk.
Evaluater.evaluate("wikir/test/qrels","test runs/testrun_RRM2L_01_test.run", max_rel=2)

In [None]:
# Evaluate a saved embedding run file against the qrels under wikir/test.
# NOTE(review): no cell in this notebook writes this run file name — it must already exist on disk.
Evaluater.evaluate("wikir/test/qrels","test runs/testrun_embedding_6_epoch_20_055_test.run", max_rel=2)

# Write To Files

## Dataset

In [None]:
# Persist the newest corpus version.
# NOTE(review): this is the same path the corpus is loaded from at the top of the
# notebook, so running this cell overwrites the input file — confirm that is intended.
FileManager.write_dataset_to_file("wikir/RRM2RL.csv",datasets[-1])

## Run File

In [None]:
# Write the TF-IDF answers (`queriesAnswers`) for all evaluation queries to a run file.
FileManager.write_runfile_to_file("test runs/testrun_wiki_RRM2RL_01.run",queries,queriesAnswers)

## Model

### Write

In [None]:
# Persist the fitted vectorizer, the document keys and the TF-IDF matrix together.
# NOTE(review): the name ends in "RRM2LR" while the dataset and run files use "RRM2RL",
# and the "Read" cell loads "models/Model_RRM2L" — confirm which name is intended.
FileManager.write_model_to_drive("models/Model_RRM2LR",vectorizer, dataset_keys, tfidf_matrix)

### Read

In [None]:
# Restore vectorizer, document keys and TF-IDF matrix from disk (replaces the in-memory ones).
# NOTE(review): this reads "models/Model_RRM2L", not the "models/Model_RRM2LR" path used
# by the "Write" cell — presumably a different saved model; verify before relying on it.
vectorizer, dataset_keys, tfidf_matrix = FileManager.load_model_from_drive("models/Model_RRM2L")

## Word Embedding

In [None]:
dataset = datasets[-1]
# Materialize the documents as a real list (one entry per document, in key order).
# A bare `dataset.values()` is only a live view of the dict: it would follow any
# later change to `dataset` and cannot be indexed.
tokenized_documents = list(dataset.values())

In [None]:
# Configure the Word2Vec model on the tokenized corpus.
# NOTE(review): per the gensim docs, supplying `sentences` to the constructor already
# builds the vocabulary and trains the model; no seed is set, so runs are not reproducible.
model = Word2Vec(sentences=tokenized_documents,
                          vector_size=300,  # Dimensionality of the word vectors
                          window=10,         # Maximum distance between the current and predicted word within a sentence
                          sg=1,             # Skip-Gram model (1 for Skip-Gram, 0 for CBOW)
                          min_count=2,      # Ignores all words with a total frequency lower than this
                          workers = 10      # Number of worker threads used for training
                          )

In [None]:
# Run 5 more training epochs over the same corpus.
# NOTE(review): the constructor was already given `sentences`, so per the gensim docs the
# model is presumably trained once before this call — confirm the extra epochs are intended.
model.train(tokenized_documents, total_examples=len(tokenized_documents), epochs=5)

In [None]:
# Represent each document by the mean of the vectors of its in-vocabulary tokens.
embedded_rows = []
for tokens in tqdm(tokenized_documents):
    # Tokens unknown to the model are skipped.
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if known_vectors:
        embedded_rows.append(np.mean(known_vectors, axis=0))
    else:
        # No usable token at all (e.g. an empty document): fall back to a zero vector.
        embedded_rows.append(np.zeros(model.vector_size))

# Stack the per-document vectors into one 2D array.
documents_vectors = np.array(embedded_rows)

In [None]:
# Save the Word2Vec model and the document vectors.
# NOTE(review): both path arguments are empty strings — they look like placeholders that
# must be filled in before running this cell; verify against FileManager.save_word2vec_model.
FileManager.save_word2vec_model(model, "", documents_vectors, "")

In [None]:
# Load a previously saved Word2Vec model and its document vectors; this replaces
# the `model` and `documents_vectors` built in the cells above.
model, documents_vectors = FileManager.load_word2vec_model("embedding_8_epoch_20.model", "embedding_8_docs_vecs.npy")

In [None]:
# Embed every evaluation query as the mean of its in-vocabulary token vectors.
# Note: this reuses the name `queries_matrices` from the TF-IDF section.
queries_matrices = {}
for key in queries:
    # Tokens unknown to the model are skipped.
    known_vectors = [model.wv[token] for token in queries[key] if token in model.wv]
    if not known_vectors:
        # Nothing usable in this query: report it and fall back to a zero vector.
        print("Query with no valid tokens: " + key)
        queries_matrices[key] = np.zeros(model.vector_size)
        continue
    queries_matrices[key] = np.mean(known_vectors, axis=0)


In [None]:
# Match each query embedding against the document embeddings, one query at a time.
# The query vector is reshaped to a single-row 2D array before matching; 0.6 is
# the threshold argument handed to Matcher.get_query_answers.
queries_answers = {
    key: Matcher.get_query_answers(
        documents_vectors,
        queries_matrices[key].reshape(1, -1),
        dataset_keys,
        0.6,
    )
    for key in tqdm(queries.keys())
}


In [None]:
# Write the Word2Vec answers (`queries_answers`) for all evaluation queries to a run file.
FileManager.write_runfile_to_file('test runs/testrun_wiki_word2vec_8_epoch_20_06_max_2.run', queries, queries_answers)

## Manual Query (Word2Vec)

In [None]:
# Ad-hoc query against the Word2Vec document vectors.
# NOTE(review): "wikiped" looks like a stemmed form, while stemming is commented out in
# the query-cleaning cells of this notebook — confirm which form the loaded model expects.
Matcher.get_word2vec_ans("wikiped", model, documents_vectors, dataset_keys)

## Personalization

In [None]:
# Reset the Personalizer's stored history before trying personalized queries
# (what exactly is stored is defined in the Personalizer module).
Personalizer.clear_history()

In [None]:
# Personalized Word2Vec query for "wikipedia" over the document vectors.
# presumably the answer depends on the history kept by Personalizer — verify in that module
Personalizer.get_ans_persona_word_2_vec("wikipedia", model, documents_vectors, dataset_keys)

## Clustering

In [None]:
from python import Clusterer

# Build a Clusterer over the document embeddings.
# presumably the second argument (5) is the number of clusters — verify in Clusterer
c = Clusterer.Clusterer(documents_vectors, 5)

# Derive topics from the corpus, then draw the result on a 21x15 figure.
c.getTopics(dataset, keys=dataset_keys)
plot_size = (21, 15)
c.plot(size=plot_size)

## Tests

In [None]:
# Sanity check of the embedding: words closest to "king" minus "man".
# NOTE(review): this raises KeyError if either word is missing from the model vocabulary.
print(model.wv.most_similar(positive=['king'], negative=['man']))

## Plot similar words

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def display_pca_scatterplot(model, words=None, sample=0):
    """Plot word vectors projected onto their first two principal components.

    Args:
        model: Object exposing gensim-style keyed vectors as ``model.wv``; the
            function uses ``word in model.wv``, ``model.wv[word]`` and
            ``model.wv.index_to_key``.
        words: Words to plot. Words missing from the vocabulary are reported
            and skipped instead of raising ``KeyError``. When ``None``, the
            vocabulary is used.
        sample: Only used when ``words`` is ``None``. If positive and smaller
            than the vocabulary, that many *distinct* words are drawn at
            random; otherwise the whole vocabulary is plotted.

    Returns:
        None. Shows a matplotlib figure, or prints a message when fewer than
        two plottable words (or fewer than two vector dimensions) are available.
    """
    if words is None:
        vocabulary = list(model.wv.index_to_key)
        if 0 < sample < len(vocabulary):
            # replace=False: draw distinct words so no point is plotted twice.
            words = list(np.random.choice(vocabulary, sample, replace=False))
        else:
            words = vocabulary
    else:
        requested = list(words)
        skipped = [word for word in requested if word not in model.wv]
        if skipped:
            print(f"Skipping words missing from the model vocabulary: {skipped}")
        words = [word for word in requested if word in model.wv]

    # A 2D projection needs at least two points; this also covers an empty list.
    if len(words) < 2:
        print("Insufficient data for PCA visualization.")
        return

    word_vectors = np.array([model.wv[word] for word in words])
    if word_vectors.shape[1] < 2:
        print("Insufficient data for PCA visualization.")
        return

    # Only the first two components are plotted, so only two are computed.
    twodim = PCA(n_components=2).fit_transform(word_vectors)

    fig, ax = plt.subplots(figsize=(15, 9))
    ax.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        ax.text(x + 0.05, y + 0.05, word)
    ax.set(title="Word vectors (PCA projection)", xlabel="PC 1", ylabel="PC 2")
    plt.show()

# Example usage: plot a hand-picked set of words from a few loose themes.
example_words = [
    'kitchen', 'sink', 'bathroom', 'toilet', 'shower',
    'rome', 'italy', 'milan', 'barcelona', 'madrid',
    'king', 'queen', 'refrigerator', 'bathtub',
]
display_pca_scatterplot(model, example_words)
