In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

In [None]:
from datasets import get_dataset
from glove import load_glove_vectors, get_sentence_embedding
from training import train

In [None]:
# Size argument forwarded to get_dataset (its sampling strategy is defined in
# datasets.py — not visible from this notebook).
sample_size = 1000
df = get_dataset("../games_apurado.json", sample_size=sample_size)

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# !!!! Point this at the GloVe vectors file on your machine !!!!
# If you have not downloaded GloVe yet, see downloadglove.py.
# NOTE(review): the file name suggests 300-dimensional vectors, which is what
# the training cell expects as input_dim — confirm if you switch files.
glove_file = "../glove.6B/glove.6B.300d.txt"
glove_vectors = load_glove_vectors(glove_file)

In [None]:
# Embed every cleaned description with the GloVe vectors; tqdm shows progress.
embeddings_list = [
    get_sentence_embedding(text, glove_vectors)
    for text in tqdm(df['description_cleaned'])
]

# Stack the per-description tensors along a new leading dimension.
embeddings_matrix = torch.stack(embeddings_list)

In [None]:
num_epochs = 60  # !!!! Configure for how many epochs you want !!!!
hidden_dim = 100  # value passed to train() as hidden_dim

# Read the input width from the data instead of hard-coding 300, so the cell
# keeps working if glove_file is switched to vectors of another dimensionality.
input_dim = embeddings_matrix.shape[-1]

# train() returns the fitted model, the transformed embeddings and the loss
# history that is plotted in the next cell.
autoencoder, embeddings_transformed, losses = train(
    embeddings_matrix,
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    num_epochs=num_epochs,
)

In [None]:
# Plot the loss history returned by train() (presumably one value per epoch —
# verify against training.py).
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(losses)
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Training Loss")
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Move tensors to host memory before converting: .numpy() raises on CUDA
# tensors, and the search cell already uses the .detach().cpu().numpy() form.
embeddings_numpy = embeddings_matrix.detach().cpu().numpy()
embeddings_transformed_numpy = embeddings_transformed.detach().cpu().numpy()

# Independent 2-D t-SNE projections of the original and the transformed
# embeddings; the fixed random_state keeps each layout reproducible.
tsne_original = TSNE(n_components=2, random_state=42)
X_tsne_embeddings_original = tsne_original.fit_transform(embeddings_numpy)

tsne_new = TSNE(n_components=2, random_state=42)
X_tsne_embeddings_transformed = tsne_new.fit_transform(embeddings_transformed_numpy)

n_clusters = 5  # !!! Configure for the number of clusters you want / believe to exist !!!
# n_init is set explicitly because scikit-learn changed its default (10 ->
# 'auto' in 1.4); pinning it keeps the clustering identical across versions
# and avoids the FutureWarning emitted by 1.2/1.3.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
# Clusters are fitted on the ORIGINAL embeddings only; the same labels are
# reused to colour both t-SNE plots.
clusters_original = kmeans.fit_predict(embeddings_numpy)

In [None]:
def _tsne_frame(tsne_coords, cluster_labels, games, max_chars=100):
    """Assemble the plotting table for one t-SNE projection.

    Args:
        tsne_coords: array with the two t-SNE coordinates in columns 0 and 1,
            one row per game.
        cluster_labels: cluster id per game, in the same row order.
        games: DataFrame providing the 'name' and 'description' columns.
        max_chars: descriptions longer than this are cut to ``max_chars``
            characters and suffixed with '...'; shorter ones are kept as-is.

    Returns:
        DataFrame with columns TSNE_1, TSNE_2, Cluster, Name and Description,
        carrying the same index as ``games``.
    """
    previews = [
        desc if len(desc) <= max_chars else desc[:max_chars] + '...'
        for desc in games['description']
    ]
    return pd.DataFrame(
        {
            'TSNE_1': tsne_coords[:, 0],
            'TSNE_2': tsne_coords[:, 1],
            # Stored as strings so plotly treats clusters as categories
            # (discrete colours + legend) rather than a continuous colour bar.
            'Cluster': np.asarray(cluster_labels).astype(str),
            'Name': games['name'].to_numpy(),
            'Description': previews,
        },
        index=games.index,
    )


def _cluster_scatter(frame, title, cluster_order):
    """Return an interactive t-SNE scatter plot coloured by cluster."""
    return px.scatter(
        frame,
        x='TSNE_1', y='TSNE_2',
        color='Cluster',
        # Fix the legend/colour order so both figures use matching colours.
        category_orders={'Cluster': cluster_order},
        hover_data={'Name': True, 'Description': True, 'Cluster': True},
        title=title,
    )


# Numerically sorted cluster ids, rendered as the string labels used above.
cluster_order = [str(label) for label in np.unique(clusters_original)]

# Both tables reuse the clusters fitted on the original embeddings, so the
# colours show where each original cluster ends up after the transformation.
df_original = _tsne_frame(X_tsne_embeddings_original, clusters_original, df)
df_transformed = _tsne_frame(X_tsne_embeddings_transformed, clusters_original, df)

fig_original = _cluster_scatter(df_original, "TSNE of Initial Embeddings", cluster_order)
fig_original.show()

fig_transformed = _cluster_scatter(df_transformed, "TSNE of Transformed Embeddings", cluster_order)
fig_transformed.show()

In [None]:
# --- Similarity search over the transformed game embeddings ---
input_text = "not bloody"

# !!!! Configure for however many results you want !!!!
num_search = 10
# Results whose cosine similarity falls below this value are not shown.
min_similarity = 0.8

input_embedding = get_sentence_embedding(input_text, glove_vectors)

# Inference only: skip autograd bookkeeping while encoding and comparing.
with torch.no_grad():
    # unsqueeze(0) adds a leading batch dimension before the encoder call.
    input_transformed = autoencoder.encoder(input_embedding.unsqueeze(0))
    cosine_similarities = F.cosine_similarity(input_transformed, embeddings_transformed)

# reshape(-1) always yields a 1-D array; the previous .squeeze() collapsed to
# a 0-d array (and broke indexing) when there was only one embedding row.
cosine_similarities = cosine_similarities.detach().cpu().numpy().reshape(-1)

# Indices of the best matches, highest similarity first.
top_indices = np.argsort(-cosine_similarities)[:num_search]

# Print the results
print(f"Query: {input_text}\nResults:\n")
results_shown = 0
for i, idx in enumerate(top_indices, start=1):
    similarity = cosine_similarities[idx]
    if similarity < min_similarity:
        # Sorted in descending order, so every remaining hit is weaker too.
        break
    game = df.iloc[idx]
    name = game['name']
    description = game['description']
    print(f"Top {i} - similarity: {similarity:.5f}")
    print(f"Name: {name}")
    print(f"Description: {description}\n")
    results_shown += 1

if results_shown == 0:
    # Make an empty result explicit instead of silently printing nothing.
    print(f"No games reached the minimum similarity of {min_similarity}.")