In [None]:
from utils.glove import Glove
from utils.tokenizer import MyTokenizer
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import torch
from sklearn.manifold import TSNE
import plotly.express as pxb


In [None]:
# Load the review corpus and fit the tokenizer on its `content` column.
documents = pd.read_csv('data/reviews_content.csv')
tokenizer = MyTokenizer(case_sensitive=False, sentence_length=1000)
tokenizer.fit(documents["content"])

# Read the pre-trained GloVe vectors from disk, then build the embedding
# object for the fitted tokenizer (later cells read its `.weight`).
glove_vectors = Glove._load_glove_vectors("model/glove.6B/glove.6B.300d.txt")
embdd = Glove._get_data_embedding(tokenizer, glove_vectors)

In [None]:
# Take a gradient-free view of the embedding matrix. `.detach()` is the
# supported replacement for the legacy `.data` attribute: same values and
# storage, but in-place edits remain visible to autograd's safety checks.
embedding_weights = embdd.weight.detach()  # Shape: (num_embeddings, embedding_dim)

num_embeddings, embedding_dim = embedding_weights.shape
print("Original embedding shape:", embedding_weights.shape)

class EmbeddingAutoencoder(nn.Module):
    """Two-layer MLP autoencoder used to compress embedding vectors.

    The encoder maps ``input_dim -> hidden_dim -> reduced_dim`` and the
    decoder mirrors it back to ``input_dim``; both use a ReLU between their
    two linear layers.

    Args:
        input_dim: Size of each input vector.
        reduced_dim: Size of the compressed (bottleneck) representation.
        hidden_dim: Width of the intermediate layer in both encoder and
            decoder. Defaults to 250, the value previously hard-coded.
    """

    def __init__(self, input_dim, reduced_dim, hidden_dim=250):
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, reduced_dim),
        )

        self.decoder = nn.Sequential(
            nn.Linear(reduced_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        )

    def forward(self, x):
        """Return ``(encoded, decoded)`` for input ``x``.

        ``encoded`` is the bottleneck representation and ``decoded`` is the
        reconstruction of ``x`` produced from it.
        """
        encoded = self.encoder(x)
        return encoded, self.decoder(encoded)

# Seed PyTorch's global RNG before the model is built so that weight
# initialisation and DataLoader shuffling are reproducible across
# Restart & Run All (same seed value the t-SNE cells use).
SEED = 42
torch.manual_seed(SEED)

reduced_dim = 200
autoencoder = EmbeddingAutoencoder(input_dim=embedding_dim, reduced_dim=reduced_dim)

# Reconstruction objective: mean squared error between input and decoder output.
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=1e-4)

# Each dataset item is a 1-tuple holding one row of the embedding matrix.
dataset = TensorDataset(embedding_weights)
data_loader = DataLoader(dataset, batch_size=128, shuffle=True)

num_epochs = 200
autoencoder.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for (data,) in data_loader:
        _, decoded = autoencoder(data)

        loss = criterion(decoded, data)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-batch loss for the epoch.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(data_loader):.4f}")

# Encode the full embedding matrix with the trained model; no gradients needed.
autoencoder.eval()
with torch.no_grad():
    reduced_embeddings, _ = autoencoder(embedding_weights)

print("Reduced embedding shape:", reduced_embeddings.shape)

In [None]:
# Expose both embedding matrices as NumPy arrays for scikit-learn / NumPy use.
original_embeddings_np, reduced_embeddings_np = (
    tensor.numpy() for tensor in (embedding_weights, reduced_embeddings)
)

In [None]:
# Project the original embeddings to 2-D with t-SNE (fixed seed, all cores).
tsne_original = TSNE(
    n_components=2,
    perplexity=50,
    random_state=42,
    n_jobs=-1,
)
original_embeddings_2d = tsne_original.fit_transform(original_embeddings_np)

In [None]:
# Project the autoencoder-reduced embeddings to 2-D with the same t-SNE settings.
tsne_reduced = TSNE(
    n_components=2,
    perplexity=50,
    random_state=42,
    n_jobs=-1,
)
reduced_embeddings_2d = tsne_reduced.fit_transform(reduced_embeddings_np)

In [None]:
# Squared L2 norm of every reduced embedding row.
abs_embeddings = np.square(reduced_embeddings_np).sum(axis=1)
# Row indices of the 100 largest norms (ascending order of norm).
max_indices = abs_embeddings.argsort()[-100:]

In [None]:
# Build one frame per embedding space holding the 2-D t-SNE coordinates of the
# rows selected by `max_indices`, tagged with where they came from.
# To plot every embedding instead of this subset, drop the `[max_indices]` indexing.
df_original = pd.DataFrame(original_embeddings_2d[max_indices], columns=["x", "y"])
df_original["Type"] = "Original"

# Reduced embeddings in 2D
df_reduced = pd.DataFrame(reduced_embeddings_2d[max_indices], columns=["x", "y"])
df_reduced["Type"] = "Reduced"

# Combine for Plotly visualization
df = pd.concat([df_original, df_reduced])

# Plot with Plotly.
# The imports cell binds plotly.express to the alias `pxb`; the previous
# `px.scatter(...)` call raised NameError on a fresh kernel.
# NOTE(review): the two point clouds come from independent t-SNE fits, so
# their absolute positions on the shared axes are not directly comparable.
fig = pxb.scatter(
    df,
    x="x",
    y="y",
    color="Type",
    title="t-SNE Visualization of Original and Reduced Embeddings",
    hover_data={'x': False, 'y': False},
)
fig.show()

In [None]:
# query = "A racing game with mercedes cars"
# query = "A puzzle game featuring portal and teleportation mechanics"
# query = "A game that combines elements of horror and educational content"

# documents = pd.read_csv('data/reviews_content.csv')
# tokenizer = MyTokenizer(sentence_length=1000, case_sensitive=False)
# tokenizer.fit(documents.content)

# q_embdd = Glove._get_data_embedding(tokenizer,glove_vectors)

In [None]:
def mean_pooling(vectors):
    """Average a collection of vectors along its first axis.

    Args:
        vectors: Array-like; converted to a NumPy array before averaging.

    Returns:
        The mean over axis 0. For a 2-D input this is one vector; for a 1-D
        input it is a single scalar.
    """
    stacked = np.asarray(vectors)
    return stacked.mean(axis=0)

In [None]:
# Encode every document into one fixed-size vector: look up the embedding of
# each token, compress the token vectors with the trained autoencoder, then
# average over the token axis.
#
# Fixes relative to the previous version of this cell:
#   * it read `q_embdd`, which is only defined in commented-out code; the
#     embedding built in the loading cell is `embdd`;
#   * it called `mean_pooling` on each single encoded token, which averaged
#     over the embedding dimensions and collapsed every token to one scalar.
sentence_embeddings = []
autoencoder.eval()
with torch.no_grad():
    for phrase in documents.content:
        tokens = tokenizer(phrase)
        # assumes the tokenizer yields at least one index per phrase
        # (torch.stack rejects an empty list) — TODO confirm
        vectors = torch.stack([embdd.weight[token] for token in tokens])
        encoded_tokens, _ = autoencoder(vectors)
        # Pool across tokens -> one vector of the autoencoder's reduced size.
        sentence_embeddings.append(mean_pooling(encoded_tokens.numpy()))
documents["sentence_embeddings"] = sentence_embeddings

In [None]:
# Free-text search query. Other example queries are listed (commented out)
# in the cell above; `query` was previously never assigned, so this cell
# raised NameError on a fresh kernel.
query = "A racing game with mercedes cars"

# Embed the query exactly like a document: token lookup -> autoencoder -> mean
# over tokens.
#
# Fixes relative to the previous version of this cell:
#   * `q_embdd` is undefined; the embedding built in the loading cell is `embdd`;
#   * the loop bound used `len(vectors)` left over from the document cell
#     instead of the query's own token count;
#   * `mean_pooling` was applied per token, collapsing each token to a scalar.
tokens = tokenizer(query)
autoencoder.eval()
with torch.no_grad():
    # assumes the tokenizer yields at least one index for the query
    # (torch.stack rejects an empty list) — TODO confirm
    query_token_vectors = torch.stack([embdd.weight[token] for token in tokens])
    encoded_query_tokens, _ = autoencoder(query_token_vectors)
query_embedding = mean_pooling(encoded_query_tokens.numpy())

In [None]:
# Materialise the document and query embeddings as NumPy arrays.
sentence_embeddings = np.asarray(sentence_embeddings)
query_embedding = np.asarray(query_embedding)

In [None]:
import torch.nn.functional as F

# Give the query a leading axis of size 1, then L2-normalise it along dim 1.
query_batch_np = np.asarray(query_embedding)[np.newaxis]
reduced_query_embedding = F.normalize(torch.from_numpy(query_batch_np), dim=1)

# L2-normalise the document embeddings along dim 1 as well.
sentence_batch = torch.from_numpy(np.asarray(sentence_embeddings))
reduced_embeddings_normalized = F.normalize(sentence_batch, dim=1)

In [None]:
# Cosine similarity along the last axis between every document embedding and
# the query (the query gets one more leading axis and is broadcast).
similarities = F.cosine_similarity(
    reduced_embeddings_normalized,
    reduced_query_embedding[None],
    dim=-1,
)

In [None]:
top_k = 10
# torch.topk raises if k exceeds the size of the searched (last) dimension,
# so never ask for more results than there are similarity scores.
num_results = min(top_k, similarities.shape[-1])
# `similarities` carries a leading axis from the query; [0] selects that row.
top_k_indices = torch.topk(similarities, num_results).indices[0]

In [None]:
# Show title/link of the best-matching documents as a list of records.
# `top_k_indices` is a torch tensor; convert it to a NumPy integer array
# because `.iloc` is documented for ints, lists/arrays of ints, slices and
# boolean arrays, not for tensors.
(
    documents[["title", "link"]]
    .iloc[np.asarray(top_k_indices)]
    .fillna("")
    .to_dict(orient="records")
)