# Import Libraries

In [None]:
# One-time environment setup. The commented lines are optional dependencies:
# uncomment the ones you need, run the cell once, then comment them out again.
# %pip install transformers
# !pip install torch==2.2.2

# %pip install country_converter

# %pip install spacy
# !python -m spacy download en_core_web_sm

# %pip install roman
# NOTE(review): `re` is part of the Python standard library; the line below is never needed.
# %pip install re
# NOTE(review): unpinned install; consider pinning a version for reproducibility.
%pip install tqdm

In [None]:
from python import FileManager
from python import WordCleaner
from python import Indexer
from python import Matcher
from python import Evaluater
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import numpy as np
import torch
%load_ext autoreload
%autoreload 2

# Dataset Manipulation 

## Load Files

In [None]:
# Load one pre-built CSV as the corpus. `datasets` is the list of pipeline
# stages; every later preprocessing cell reads datasets[-1] and appends its output.
# vectorizer = TfidfVectorizer()
datasets = [FileManager.csv_to_dict("wikir/RM_NEW_L.csv")]
dataset = datasets[-1]

### The Ultimate Loader

In [None]:
# Shared TF-IDF vectorizer used by the indexing and query cells further down.
vectorizer = TfidfVectorizer()

# Merge the corpus parts R0.csv .. R3.csv into one dict; on duplicate keys the
# later part wins.
dataset = {}
for part in range(4):
    dataset = {**dataset, **FileManager.csv_to_dict(f"wikir/R{part}.csv")}
datasets = [dataset]

## Remove stop words

In [None]:
# New pipeline stage: the latest stage with stop words removed from every entry.
dataset = datasets[-1]
filtered_dataset = {
    doc_id: WordCleaner.remove_stop_words(tokens)
    for doc_id, tokens in dataset.items()
}
datasets.append(filtered_dataset)

## Stem

In [None]:
# New pipeline stage: every entry of the latest stage run through the
# "Snowball" stemmer option of WordCleaner.stem.
dataset = datasets[-1]
stemmed_dataset = {
    doc_id: WordCleaner.stem(tokens, "Snowball")
    for doc_id, tokens in dataset.items()
}
datasets.append(stemmed_dataset)

## Lemmatize

In [None]:
# New pipeline stage: every entry of the latest stage lemmatized.
# NOTE(review): when the stemming cell was run first, this lemmatizes already
# stemmed tokens - confirm that both stages are meant to be applied together.
dataset = datasets[-1]
lemmad_dataset = {
    doc_id: WordCleaner.lemmatize(tokens)
    for doc_id, tokens in dataset.items()
}
datasets.append(lemmad_dataset)

## Synonym Map

In [None]:
from multiprocessing import Pool
from tqdm import tqdm

# New pipeline stage: each item of every entry is replaced by the result of
# WordCleaner.get_unified_synonym_2.
dataset = datasets[-1]
mapped_2 = {}

# One worker pool (default size) is created once and reused for all rows.
with Pool() as p:
    # tqdm wraps the dict, so the progress bar advances once per key (row).
    for row in tqdm(dataset):
        # Rows are processed one after another; within a row, the items are
        # distributed over the pool. p.map keeps the input order of its results.
        mapped_2[row] = p.map(WordCleaner.get_unified_synonym_2, dataset[row])
datasets.append(mapped_2)

## Calculating TF-IDF for the documents

### using Scikit Learn

In [None]:
# Build the corpus TF-IDF matrix with the shared vectorizer. `tfidf_matrix` is
# read by the cosine-similarity cells and by the "Model / Write" cell, so it
# must be defined for the notebook to run top to bottom.
# NOTE(review): this can be slow on the full corpus; guard it with a cached
# model file if re-running becomes painful.
tfidf_matrix = Indexer.calculate_tf_idf(datasets[-1], vectorizer)
# Keys of the indexed stage; presumably their order matches the matrix rows -
# verify against Indexer.calculate_tf_idf.
dataset_keys = list(datasets[-1].keys())

# Query Manipulation 

## Manual Query

In [None]:
# Run a hand-written query through tokenization, stop-word removal, Snowball
# stemming and synonym mapping, then show the resulting tokens.
# Lemmatization (WordCleaner.lemmatize) is intentionally left disabled here.
raw_query = "hello sister where is your hijab"
query_tokens = word_tokenize(raw_query)
query_without_stop_words = WordCleaner.remove_stop_words(query_tokens)
query_stemmed = WordCleaner.stem(query_without_stop_words, 'Snowball')
query = list(map(WordCleaner.get_unified_synonym, query_stemmed))
print(query)

### Calculate TF-IDF

In [None]:
# Turn the preprocessed manual query into a TF-IDF representation: the tokens
# are re-joined into one space-separated string and passed as a single-item list.
# NOTE(review): presumably this needs `vectorizer` to be already fitted on the corpus - confirm in Indexer.
matrix = Indexer.calculate_doc_tf_idf([" ".join(query)],vectorizer)

### Calculate Cosine Similarity

In [None]:
# Match the manual query against the corpus matrix and print every returned
# (key, value) pair. 0.25 is the fourth argument of Matcher.get_query_answers,
# presumably the minimum similarity - verify against Matcher.
# The key list is named `dataset_keys` (plural) where it is defined.
similar_rows = Matcher.get_query_answers(tfidf_matrix, matrix, dataset_keys, 0.25)
for row in similar_rows.items():
    print(row)

## Evaluation Queries

In [None]:
# Load the evaluation queries; the result is used as a dict (query key -> query
# content) by the cells below.
queries = FileManager.csv_to_dict("wikir/queries.csv")

In [None]:
# TODO get unified synonym needs testing
# Only stop-word removal is applied to the evaluation queries for now. The
# remaining stages stay disabled until the synonym mapping has been tested:
#     queries[key] = [WordCleaner.get_unified_synonym(word) for word in queries[key]]
#     queries[key] = WordCleaner.stem(queries[key], "Snowball")
#     queries[key] = WordCleaner.lemmatize(queries[key])
# The dict is updated in place, so `queries` keeps referring to the same object.
queries.update({
    query_id: WordCleaner.remove_stop_words(tokens)
    for query_id, tokens in queries.items()
})

### Calculate TF-IDF

In [None]:
# One TF-IDF representation per evaluation query, keyed like `queries`.
# Each query's tokens are re-joined into a single space-separated string.
queries_matrices = {
    query_id: Indexer.calculate_doc_tf_idf([" ".join(tokens)], vectorizer)
    for query_id, tokens in queries.items()
}

### Calculate Cosine Similarity

In [None]:
# Retrieve the corpus entries matching each evaluation query (TF-IDF variant).
# 0.25 is the fourth argument of Matcher.get_query_answers, presumably the
# minimum similarity - verify against Matcher.
# The key list is named `dataset_keys` (plural) where it is defined.
queriesAnswers = {}
for key in queries.keys():
    queriesAnswers[key] = Matcher.get_query_answers(
        tfidf_matrix, queries_matrices[key], dataset_keys, 0.25
    )

# Evaluation

In [None]:
# Each cell below calls Evaluater.evaluate with the same first argument
# ("wikir/qrels", presumably the relevance judgements - confirm in Evaluater)
# and a different run-file name, and displays that call's return value.
# The cells differ only in the run-file name.
Evaluater.evaluate("wikir/qrels","testrun_r_l")

In [None]:
Evaluater.evaluate("wikir/qrels","testrun_embedded_3")

In [None]:
Evaluater.evaluate("wikir/qrels","testrun_embedded_3_epoch_2")

In [None]:
Evaluater.evaluate("wikir/qrels","testrun_embedded_3_epoch_4")

In [None]:
Evaluater.evaluate("wikir/qrels","testrun_embedded_3_epoch_4_075")

In [None]:
Evaluater.evaluate("wikir/qrels","testrun_embedded_4_epoch_1_06")

In [None]:
Evaluater.evaluate("wikir/qrels","testrun_embedded_4_epoch_3_065")

In [None]:
Evaluater.evaluate("wikir/qrels","testrun_embedded_wiki_news")

In [None]:
Evaluater.evaluate("wikir/qrels","testrun_embedded_4_epoch_5_06")

In [None]:
Evaluater.evaluate("wikir/qrels","testrun_wiki_news_06")

In [None]:
Evaluater.evaluate("wikir/qrels","testrun_wiki_news_075")

In [None]:
Evaluater.evaluate("wikir/qrels","testrun_wiki_news_065")

# Write To Files

## Dataset

In [None]:
# Persist the latest preprocessing stage (datasets[-1]) to "map1.csv".
FileManager.write_dataset_to_file("map1.csv",datasets[-1])

## Run File

In [None]:
# Write a run file named "testrun_embedded" from the queries and `queriesAnswers`.
# NOTE(review): `queriesAnswers` is filled by the TF-IDF matching cell, while the
# file name says "embedded" - confirm this is the intended pairing.
FileManager.write_runfile_to_file("testrun_embedded",queries,queriesAnswers)

## Model

### Write

In [None]:
# Save the corpus TF-IDF matrix to "model.npz" so it can be reloaded later.
FileManager.write_model_to_file("model.npz",tfidf_matrix)

### Read

In [None]:
# Reload the object stored in "model.npz" and display it as the cell output.
ddf = FileManager.load_model_from_file("model.npz")
ddf

# Word Embedding

In [None]:
# %pip install gensim

In [None]:
from gensim.models import Word2Vec

dataset = datasets[-1]
# The dict's values are passed straight to Word2Vec as its sentences
# (a re-iterable view, not a copied list).
tokenized_documents = dataset.values()
# NOTE(review): per the gensim documentation, passing `sentences` to the
# constructor starts training immediately with the default number of epochs - confirm.
model = Word2Vec(sentences=tokenized_documents,
                          vector_size=150,  # Dimensionality of the word vectors
                          window=7,         # Maximum distance between the current and predicted word within a sentence
                          sg=1,             # Skip-Gram model (1 for Skip-Gram (can capture complex patterns), 0 for CBOW)
                          min_count=2,      # Ignores all words with a total frequency lower than this (2 is Low enough to not lose infrequent words)
                          workers=4         # Number of worker threads used for training
                          )

In [None]:
# Persist the current `model` object under a name recording the experiment.
# NOTE(review): the file name is hard-coded; keep it in sync with the training settings actually used.
model.save("embedding_4_epoch_5.model")

In [None]:
# Load a saved model
from gensim.models import Word2Vec

dataset = datasets[-1]
# The dict's values view is used as the corpus (no list copy is made)
tokenized_documents = dataset.values()
# NOTE(review): this loads the "epoch_3" file although the save cell writes
# "embedding_4_epoch_5.model" - confirm which checkpoint is intended.
model = Word2Vec.load("embedding_4_epoch_3.model")


In [None]:
# Train the model for one more epoch on the current corpus.
# Re-running this cell trains the same in-memory model again, so it is not idempotent.
model.train(tokenized_documents, total_examples=len(tokenized_documents), epochs=1)

In [None]:
import numpy as np

# Represent every document by the mean of the vectors of its in-vocabulary
# tokens; a document without any known token becomes an all-zero vector.
document_vectors = []
for tokens in tokenized_documents:
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if known_vectors:
        document_vectors.append(np.mean(known_vectors, axis=0))
    else:
        document_vectors.append(np.zeros(model.vector_size))

# Stack the per-document vectors into one 2D array (one row per document).
document_vectors = np.array(document_vectors)


In [None]:
# Represent every evaluation query by the mean vector of its in-vocabulary
# tokens. Queries without any known token are reported and mapped to zeros.
queries_matrices = {}
for key, query_tokens in queries.items():
    known_vectors = [model.wv[token] for token in query_tokens if token in model.wv]
    if not known_vectors:
        print("Query with no valid tokens: " + key)
        queries_matrices[key] = np.zeros(model.vector_size)
        continue
    queries_matrices[key] = np.mean(known_vectors, axis=0)


In [None]:
from scipy import sparse

# The document embeddings are handed to the matcher as a CSR matrix.
corpus_matrix_sparse = sparse.csr_matrix(document_vectors)
queries_answers_embedded = {}

# Queries are matched one at a time. 0.65 is the fourth argument of
# Matcher.get_query_answers_optimized, presumably the minimum similarity -
# verify against Matcher.
for key in queries.keys():
    # The matcher receives a 2D (1, n_features) query array.
    query_vector_2d = queries_matrices[key].reshape(1, -1)
    # The key list is named `dataset_keys` (plural) where it is defined.
    queries_answers_embedded[key] = Matcher.get_query_answers_optimized(
        corpus_matrix_sparse, query_vector_2d, dataset_keys, 0.65
    )


In [None]:
# Write to run file: stores the embedding-based answers under a name that
# records the experiment (the "065" suffix matches the 0.65 value used when matching).
FileManager.write_runfile_to_file('testrun_wiki_news_065', queries, queries_answers_embedded)

In [None]:
# Manual query

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# A trained Word2Vec model keeps its vectors on `.wv` and cannot be indexed
# directly; a bare KeyedVectors object is indexed as-is. Resolve the vector
# store once so the cell works with either kind of `model`.
word_vectors = getattr(model, "wv", model)

# Build the query vector as the mean of the in-vocabulary query word vectors
query_words = ["wikipedia"]
valid_vectors = [word_vectors[word] for word in query_words if word in word_vectors]

# Check if there are valid vectors to avoid nan issues
if valid_vectors:
    query_vector = np.mean(valid_vectors, axis=0).reshape(1, -1)
    # Compute cosine similarity between query and document vectors
    similarity_scores = cosine_similarity(query_vector, document_vectors)

    # Rank documents based on similarity scores (best match first)
    sorted_docs = sorted(enumerate(similarity_scores[0]), key=lambda x: x[1], reverse=True)

    # Print the ranked documents.
    # NOTE(review): `doc_id` is the row position in `document_vectors`; the +2
    # offset presumably maps it to a CSV line number - confirm.
    for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
        print(f"Rank {rank}: Document {doc_id + 2} (Similarity Score = {score:.4f})")
else:
    print("None of the query words were found in the model's vocabulary.")


## Personalization

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Main query terms.
query_1 = ["apple"]
vector_1 = [model.wv[word] for word in query_1 if word in model.wv]

# Secondary terms that bias the ranking (weighted 0.25 against 0.75 below).
query_2 = ["tech"]
vector_2 = [model.wv[word] for word in query_2 if word in model.wv]

# Check if there are valid vectors to avoid nan issues
if vector_1:
    query_vector_1 = np.mean(vector_1, axis=0).reshape(1, -1)

    if vector_2:
        query_vector_2 = np.mean(vector_2, axis=0).reshape(1, -1)
        weighted_vector = 0.75 * query_vector_1 + 0.25 * query_vector_2
    else:
        # None of the secondary terms are in the vocabulary: averaging an empty
        # list would give NaN, so fall back to the main query alone.
        weighted_vector = query_vector_1

    # Compute cosine similarity between query and document vectors
    similarity_scores = cosine_similarity(weighted_vector, document_vectors)
    # Rank documents based on similarity scores (best match first)
    sorted_docs = sorted(enumerate(similarity_scores[0]), key=lambda x: x[1], reverse=True)

    # Print the ranked documents.
    # NOTE(review): `doc_id` is the row position in `document_vectors`; the +2
    # offset presumably maps it to a CSV line number - confirm.
    for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
        print(f"Rank {rank}: Document {doc_id + 2} (Similarity Score = {score:.4f})")
else:
    print("None of the query words were found in the model's vocabulary.")

## Tests

In [None]:
# Get the list of words (vocabulary) from the Word2Vec model
words = model.wv.index_to_key
print(len(words))
# NOTE(review): this prints the entire vocabulary; consider printing a slice instead.
print(words)

# print(model.wv['malaysia'])
# print(model.wv.similarity('1st', 'First'))

# Spot checks on the learned vectors: two pairwise similarities and the
# nearest neighbours of 'war'.
# NOTE(review): presumably these raise KeyError if a word is not in the vocabulary - confirm.
print(model.wv.similarity('world', 'war'))
print(model.wv.similarity('good', 'malaysia'))
print(model.wv.most_similar('war'))

## Plot

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def display_pca_scatterplot(model, words=None, sample=0):
    """Plot word vectors projected onto their first two principal components.

    Args:
        model: A gensim ``Word2Vec`` model (vectors are read from ``model.wv``)
            or an object that is itself the keyed-vector store.
        words: Words to plot. When ``None`` the vocabulary is used: a random
            sample of ``sample`` distinct words, or all of it if ``sample`` is 0.
        sample: Number of vocabulary words to draw when ``words`` is ``None``.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Words that are not in the vocabulary are reported and skipped instead of
    raising ``KeyError``. Nothing is plotted when fewer than two vectors (or
    fewer than two vector dimensions) are available.
    """
    vectors = getattr(model, "wv", model)

    if words is None:
        vocabulary = list(vectors.index_to_key)
        if sample > 0:
            # replace=False so a word is never drawn (and labelled) twice;
            # the size is capped because sampling without replacement cannot
            # return more items than exist.
            words = np.random.choice(vocabulary, min(sample, len(vocabulary)), replace=False)
        else:
            words = vocabulary

    known_words = [word for word in words if word in vectors]
    missing_words = [word for word in words if word not in vectors]
    if missing_words:
        print(f"Skipping words not in the vocabulary: {missing_words}")

    # PCA to two components needs at least two samples ...
    if len(known_words) < 2:
        print("Insufficient data for PCA visualization.")
        return

    word_vectors = np.array([vectors[word] for word in known_words])

    # ... and at least two features.
    if word_vectors.shape[1] < 2:
        print("Insufficient data for PCA visualization.")
        return

    # Only the first two components are plotted, so only those are computed.
    twodim = PCA(n_components=2).fit_transform(word_vectors)
    plt.figure(figsize=(6, 6))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(known_words, twodim):
        # Small offset so the label does not sit on top of its marker.
        plt.text(x + 0.05, y + 0.05, word)
    plt.show()

# Example usage: plot a hand-picked set of words, grouped so that related terms
# (war/battalion, good/best, germany/german, 12/twelve, ...) can be compared visually.
display_pca_scatterplot(model, ['battalion','world', 'war', 'good', 'best', 'state', 'government', 'university', 'college', 'germany', 'german', '12', 'twelve'])


## Doc2Vec

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Corpus of the latest preprocessing stage. Keys and documents are taken from
# the same dict, so position i in one corresponds to position i in the other.
doc_keys = list(datasets[-1].keys())
documents = list(datasets[-1].values())

# One TaggedDocument per corpus document: its own tokens, tagged with its position.
tagged_data = [TaggedDocument(words=doc, tags=[i]) for i, doc in enumerate(documents)]

# Build and train the model.
# NOTE: this rebinds `model`, replacing whatever model the previous cells used.
model = Doc2Vec(vector_size=100, window=5, min_count=2, workers=4, epochs=1)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)


# Demo: infer vectors for a few unseen sentences and print them.
data = ["This is the first document",
        "This is the second document",
        "This is the third document",
        "This is the fourth document"]

# Kept separate from `document_vectors` so the printed vectors really belong
# to the sentences they are printed with.
sample_vectors = [model.infer_vector(word_tokenize(doc.lower())) for doc in data]

for i, doc in enumerate(data):
    print("Document", i+1, ":", doc)
    print("Vector:", sample_vectors[i])
    print()


# Learned vectors of the corpus documents, in the same order as `doc_keys`.
document_vectors = [model.dv[i] for i in range(len(documents))]

# Infer a vector for a new (query) document
query_vector = model.infer_vector(["tokyo", "disney", "resort"])

# Compute cosine similarity between query and document vectors
similarity_scores = cosine_similarity([query_vector], document_vectors)

# Retrieve the relevant documents; 0.6 is the fourth argument of
# Matcher.get_query_answers, presumably the minimum similarity - verify against Matcher.
relevant_docs = Matcher.get_query_answers(document_vectors, [query_vector], doc_keys, 0.6)


## Wiki News Model

In [None]:
# wiki model
from gensim.models import KeyedVectors

# print(list(gensim.downloader.info()['models'].keys()))
# Load the vectors stored in 'wiki_news.model' (presumably the pre-trained
# fastText wiki-news vectors saved in gensim format - confirm).
# NOTE: this rebinds `model` to a KeyedVectors object, which the following
# cells index directly (`model[token]`) rather than through `.wv`.
model = KeyedVectors.load('wiki_news.model')

In [None]:
# wiki model 1
import numpy as np
dataset = datasets[-1]

# The dict's values view is used as the corpus
tokenized_documents = dataset.values()

# Represent every document by the mean of the vectors of its in-vocabulary
# tokens; a document without any known token becomes an all-zero vector.
document_vectors = []
for tokens in tokenized_documents:
    known_vectors = [model[token] for token in tokens if token in model]
    if known_vectors:
        document_vectors.append(np.mean(known_vectors, axis=0))
    else:
        document_vectors.append(np.zeros(model.vector_size))

# Stack the per-document vectors into one 2D array (one row per document).
document_vectors = np.array(document_vectors)

In [None]:
# wiki model
# Represent every evaluation query by the mean vector of its in-vocabulary
# tokens. Queries without any known token are reported and mapped to zeros.
queries_matrices = {}
for key, query_tokens in queries.items():
    known_vectors = [model[token] for token in query_tokens if token in model]
    if not known_vectors:
        print("Query with no valid tokens: " + key)
        queries_matrices[key] = np.zeros(model.vector_size)
        continue
    queries_matrices[key] = np.mean(known_vectors, axis=0)


In [None]:
pip install transformers

In [None]:
pip uninstall torch

In [None]:
from transformers import BertModel

# Load pre-trained model (weights).
# NOTE: this rebinds `model`, replacing the embedding model used by earlier cells.
# NOTE(review): BertModel presumably requires PyTorch, yet the cell above
# uninstalls torch and it is reinstalled only in the cell below - confirm the
# intended execution order.
model = BertModel.from_pretrained('bert-base-uncased')


In [None]:
pip install torch --index-url https://download.pytorch.org/whl/cu118

In [None]:
# Sanity check: report whether the two deep-learning packages can be imported.
# Only ImportError is treated as "not installed"; any other error propagates.
try:
    import transformers
except ImportError:
    print("Transformers is not installed")
else:
    print("Transformers is installed")

try:
    import torch
except ImportError:
    print("PyTorch is not installed")
else:
    print("PyTorch is installed")
