# In [None]:
# Importing necessary libraries
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.gridspec as gridspec

# Reading the spreadsheet and discarding rows with no value in the text column.
# NOTE(review): both the file path and the column name are empty strings here —
# presumably placeholders that must be filled in before this can run; confirm.
df = pd.read_excel('').dropna(subset=[''])

# Loading pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenizing and encoding the dataset with attention mask.
# Values are cast to str first: a spreadsheet cell holding only a number is read
# back as int/float, which the tokenizer does not accept as text input.
# The whole column is encoded in a single call, which directly yields
# (num_texts, 512) tensors instead of one (1, 512) tensor per text that then
# has to be concatenated.
tokenized_texts = tokenizer(
    df[''].astype(str).tolist(),
    truncation=True,
    max_length=512,
    padding='max_length',
    return_tensors='pt',
)
input_ids = tokenized_texts['input_ids']
attention_masks = tokenized_texts['attention_mask']

# Creating DataLoader
batch_size = 32
data = TensorDataset(input_ids, attention_masks)
data_loader = DataLoader(data, batch_size=batch_size)

# Extracting embeddings: one mean-pooled vector per text.
embeddings = []
model.eval()  # inference mode (disables dropout)
with torch.no_grad():  # no gradients are needed for feature extraction
    # Distinct loop names so the full `input_ids` tensor is not overwritten.
    for batch_ids, batch_mask in data_loader:
        outputs = model(batch_ids, attention_mask=batch_mask)
        hidden = outputs.last_hidden_state
        # Every sequence is padded to max_length, so a plain mean over dim=1
        # would average in the padding positions. Weight each position by the
        # attention mask so only real tokens contribute.
        mask = batch_mask.unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        batch_embeddings = (hidden * mask).sum(dim=1) / token_counts
        # No .squeeze(): the result is already (batch, hidden). Squeezing would
        # turn a final batch of size 1 into a 1-D tensor and break torch.cat.
        embeddings.append(batch_embeddings)
embeddings = torch.cat(embeddings, dim=0).numpy()

# Partitioning the embeddings into a fixed number of KMeans clusters
num_clusters = 10
kmeans = KMeans(random_state=42, n_clusters=num_clusters)
kmeans.fit(embeddings)
clusters = kmeans.labels_

# Scoring the partition with the silhouette score and reporting it
silhouette_avg = silhouette_score(embeddings, clusters)
print(f"Silhouette Score: {silhouette_avg}")

# Storing each row's cluster label alongside the original data
df['bert_cluster'] = clusters

# Projecting the embeddings down to two dimensions with t-SNE for plotting
tsne = TSNE(random_state=42, n_components=2)
reduced_embeddings = tsne.fit_transform(embeddings)

# Keeping both projected coordinates as DataFrame columns
df['tsne_x'] = reduced_embeddings[:, 0]
df['tsne_y'] = reduced_embeddings[:, 1]

# Scatter plot of the projection, coloured by cluster label
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(
    data=df,
    x='tsne_x',
    y='tsne_y',
    hue='bert_cluster',
    palette='viridis',
    legend='full',
)
scatter_ax.set_title('BERT Clustering with t-SNE')
scatter_ax.set_xlabel('t-SNE dimension 1')
scatter_ax.set_ylabel('t-SNE dimension 2')
plt.show()

# Defining custom stopwords.
# `additional_stopwords` is written out once and folded into `custom_stopwords`
# with a set union, so the shared words are not listed twice.
additional_stopwords = {
    'thousands', 'los', 'la', 'al', 'to', 'contains', 'of', 'the', 'ago', 'much',
    'really', 'ever', 'games', 'played', 'bosses', 'go', 'like', 'good', 'say', 'lot',
    'diego', 'que', 'give', 'review', 'reviews', 'people', 'everyone', 'never', 'per', 'boss',
    'also', 'many', 'new', 'may', 'back', 'try', 'vet', 'made', 'make', 'could',
    'spoilers', 'first', 'una', 'fps', 'not', 'find',
}
custom_stopwords = {
    'done', 'a', 'for', 'i', 'the', 'expand', 'click', 'contain', 'spoiler', 'it',
    'be', 'in', 'one', 'get', 'even', 'year', 'guess', 'see', 'got', 'feel',
    'want', 'tell', 'absolute', 'every', 'is', 'some', 'would', 'else', 'de', 'said',
    'us', 'by', 'little', 'decided', 'bethesda', 'let', 'must', 'gam',
} | additional_stopwords

# Initializing TF-IDF Vectorizer with custom stopwords.
# The stop-word set is passed as a list, as in the original call; the vocabulary
# is capped at the 10000 highest-frequency terms.
vectorizer = TfidfVectorizer(stop_words=list(custom_stopwords), max_features=10000)
# Cast to str first: a spreadsheet cell holding only a number is read back as
# int/float, and the vectorizer's lowercasing step fails on non-string values.
tfidf_matrix = vectorizer.fit_transform(df[''].astype(str))

# Converting to array and getting feature names.
# NOTE: toarray() materialises a dense (num_texts x up-to-10000) matrix; this
# can be memory-heavy for large datasets.
tfidf_array = tfidf_matrix.toarray()
# asarray avoids a redundant copy when the feature names are already an ndarray.
feature_names = np.asarray(vectorizer.get_feature_names_out())

# Function to get top tfidf words for each cluster
def get_top_tfidf_words_per_cluster(tfidf_array, feature_names, num_clusters, top_n=30, cluster_labels=None):
    """Return the highest mean-TF-IDF terms for each cluster.

    Args:
        tfidf_array: Dense 2-D array of TF-IDF weights, one row per document
            and one column per entry of ``feature_names``.
        feature_names: Array of terms, indexable by column position.
        num_clusters: Cluster ids ``0 .. num_clusters - 1`` are reported.
        top_n: Maximum number of terms to return per cluster.
        cluster_labels: Per-document cluster id, positionally aligned with the
            rows of ``tfidf_array``. When omitted, the module-level
            ``df['bert_cluster']`` column is used (the original behaviour).

    Returns:
        Dict mapping each cluster id to a list of ``(term, mean_tfidf)`` tuples
        sorted by descending mean weight. A cluster with no documents, or a
        non-positive ``top_n``, maps to an empty list.
    """
    if cluster_labels is None:
        # Backward-compatible fallback to the global DataFrame.
        cluster_labels = df['bert_cluster']
    labels = np.asarray(cluster_labels)

    top_words = {}
    for cluster_id in range(num_clusters):
        member_rows = np.flatnonzero(labels == cluster_id)
        # An empty cluster would give a NaN mean (plus a RuntimeWarning), and
        # top_n == 0 would make the slice [-0:] select every term.
        if member_rows.size == 0 or top_n <= 0:
            top_words[cluster_id] = []
            continue
        mean_tfidf = np.mean(tfidf_array[member_rows], axis=0)
        # Ascending argsort, reversed, then truncated: same order as taking the
        # last top_n entries and reversing them.
        top_indices = mean_tfidf.argsort()[::-1][:top_n]
        top_words[cluster_id] = [(feature_names[i], mean_tfidf[i]) for i in top_indices]
    return top_words

# Rank the highest mean-TF-IDF terms for every cluster (top_n left at its default)
top_tfidf_words = get_top_tfidf_words_per_cluster(tfidf_array, feature_names, num_clusters)

# Plotting Word Clouds for each BERT-based cluster
def plot_tfidf_word_clouds(top_words, num_clusters, n_cols=4):
    """Draw one word cloud per cluster on a shared grid of subplots.

    Args:
        top_words: Mapping from cluster id (``0 .. num_clusters - 1``) to an
            iterable of ``(word, score)`` pairs; scores set the word sizes.
        num_clusters: Number of clusters to draw.
        n_cols: Number of grid columns. The row count is derived from
            ``num_clusters`` but never drops below three, so up to twelve
            clusters keep the original 3x4 layout.

    A cluster whose words have no positive score is left as an empty titled
    panel, because WordCloud cannot render without at least one drawable word.
    """
    # Ceiling division; previously the grid was fixed at 3x4 and indexing it
    # raised IndexError for more than twelve clusters.
    n_rows = max(3, (num_clusters + n_cols - 1) // n_cols)
    # Height grows with the number of rows; equals the original 10 for 3 rows.
    fig = plt.figure(figsize=(15, 10 * n_rows / 3))
    gs = gridspec.GridSpec(n_rows, n_cols)
    gs.update(wspace=0.5, hspace=0.5)

    for cluster_id in range(num_clusters):
        words_scores = {word: score for word, score in top_words[cluster_id]}
        ax = fig.add_subplot(gs[cluster_id])
        # `score > 0` is False for NaN as well, so NaN-only clusters are skipped.
        if any(score > 0 for score in words_scores.values()):
            wordcloud = WordCloud(width=200, height=200, background_color='white').generate_from_frequencies(words_scores)
            ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        # NOTE(review): titles are 1-based while the cluster ids used as keys
        # here are 0-based — confirm this offset is intended.
        ax.set_title(f'Cluster {cluster_id + 1}')

    plt.show()

# Render one word cloud per cluster from the ranked terms
plot_tfidf_word_clouds(top_tfidf_words, num_clusters)
