# 1. Reading Dataset

In [None]:
import pandas as pd
import time
import os

# Locate the phase-3 topic-modelling CSV relative to the notebook and load
# it into the DataFrame that every later cell works on.
base_dir = '../dataset/phase 3/'
file_path = os.path.join(base_dir, 'topic_modelling_dataset.csv')
df = pd.read_csv(filepath_or_buffer=file_path)

# 2. Vectorization using DistilBERT

In [None]:
!pip install -q transformers

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.cluster import KMeans
import numpy as np
import torch
import time

# The tokenizer and the encoder are loaded from the same pretrained
# checkpoint so the vocabulary matches the model weights.
_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(_checkpoint)
model = DistilBertModel.from_pretrained(_checkpoint)

def get_bert_embeddings_batched(sentences, batch_size=32):
    """Embed sentences with DistilBERT, one mini-batch at a time.

    Each sentence is tokenized with the module-level ``tokenizer`` (padded
    to the longest item in its batch, truncated to 512 tokens) and passed
    through the module-level ``model``. The sentence vector is the final
    hidden state at sequence position 0 (the ``[CLS]`` token for BERT-style
    tokenizers).

    Args:
        sentences: Sequence of strings to embed. Must support ``len()`` and
            slicing.
        batch_size: Number of sentences per forward pass; must be positive.

    Returns:
        A 2-D NumPy array with one row per sentence, in input order. Empty
        input yields an array of shape ``(0, hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # np.vstack refuses an empty list, so an empty input is answered
    # directly with a correctly shaped (0, hidden_size) array.
    # len() is used rather than truthiness so array-like inputs also work.
    if len(sentences) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Run on whatever device the model currently lives on, so the function
    # keeps working if the caller moved the model to a GPU.
    device = model.device

    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i + batch_size]
        batch_inputs = tokenizer(
            batch_sentences,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        ).to(device)
        # Inference only: skip gradient bookkeeping to save memory and time.
        with torch.no_grad():
            batch_outputs = model(**batch_inputs)
        # .cpu() is a no-op on CPU and required before .numpy() otherwise.
        batch_embeddings = batch_outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

# Embed every document in df['content'] and report how long it took.
# perf_counter() is used instead of time() because it is monotonic and
# high-resolution: the measured interval cannot be distorted by system
# clock adjustments.
start_time = time.perf_counter()
embeddings = get_bert_embeddings_batched(df['content'].tolist(), batch_size=32)
end_time = time.perf_counter()

time_taken = end_time - start_time
print(f"Time taken for vectorization: {time_taken} seconds")


# 3. K-Means Clustering
## 3.1. Searching for the Best K with the Elbow Method

In [None]:
import matplotlib.pyplot as plt
import time
from sklearn.cluster import KMeans

# Candidate numbers of clusters to try (k = 1 .. 10)
cluster_range = range(1, 11)

# Per-k results, index-aligned with cluster_range
wcss = []             # within-cluster sum of squares (KMeans.inertia_)
execution_times = []  # seconds needed to build and fit each model

for n_clusters in cluster_range:
    # perf_counter() is monotonic and high-resolution, so the plotted
    # timings are not affected by system clock adjustments.
    start_time = time.perf_counter()
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300, random_state=42)
    kmeans.fit(embeddings)
    end_time = time.perf_counter()

    # Record WCSS and execution time
    wcss.append(kmeans.inertia_)
    execution_times.append(end_time - start_time)

# One figure, two panels: the elbow curve on the left and the per-k fit
# time on the right.
fig, (elbow_ax, timing_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: WCSS against k (elbow method)
elbow_ax.plot(cluster_range, wcss, marker='o')
elbow_ax.set_title('Elbow Method For Optimal Number of Clusters')
elbow_ax.set_xlabel('Number of Clusters')
elbow_ax.set_ylabel('WCSS')

# Right panel: how long each fit took
timing_ax.plot(cluster_range, execution_times, marker='o', color='red')
timing_ax.set_title('Execution Time for Each Number of Clusters')
timing_ax.set_xlabel('Number of Clusters')
timing_ax.set_ylabel('Execution Time (seconds)')

fig.tight_layout()
plt.show()


## 3.2. Training with Selected K

In [None]:
import warnings
# NOTE: this silences every warning category for the rest of the session,
# not only those raised by the clustering call below.
warnings.filterwarnings(action='ignore')

# Fit the final model with the chosen K and tag each row of df with the
# index of the cluster it was assigned to.
kmeans = KMeans(
    n_clusters=10,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=42,
).fit(embeddings)

df['cluster'] = kmeans.labels_

## 3.3. Cluster Review Summarization

In [None]:
from transformers import pipeline

# Build the Hugging Face summarization pipeline backed by the DistilBART
# CNN checkpoint; it is reused for every cluster.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)

# Function to summarize text for a cluster
def summarize_cluster(cluster_text, max_length=500, min_length=250):
    """Return an abstractive summary of one cluster's concatenated text.

    Args:
        cluster_text: All documents of a cluster joined into one string.
        max_length: Upper bound on the generated summary length, forwarded
            to the module-level ``summarizer`` pipeline.
        min_length: Lower bound on the generated summary length, forwarded
            to the pipeline.

    Returns:
        The summary string, or ``''`` when ``cluster_text`` is empty or
        contains only whitespace.
    """
    # There is nothing to summarize in blank input; calling the model would
    # only force it to generate at least `min_length` tokens from nothing.
    if not cluster_text or not cluster_text.strip():
        return ''

    # truncation=True lets the tokenizer cut over-long input to the model's
    # limit instead of failing, so for large clusters only the leading part
    # of the text contributes to the summary.
    results = summarizer(
        cluster_text,
        max_length=max_length,
        min_length=min_length,
        truncation=True,
    )
    return results[0]['summary_text']

# Summarize each cluster
# The cluster ids are read from the labels stored in df instead of repeating
# the K used for training, so this cell stays correct if K is changed.
# int() keeps the dictionary keys plain Python ints.
cluster_ids = sorted(int(label) for label in df['cluster'].unique())
num_clusters = len(cluster_ids)
cluster_summaries = {}
for i in cluster_ids:
    # perf_counter() is monotonic, so the reported duration is not affected
    # by system clock adjustments.
    start_time = time.perf_counter()
    cluster_text = ' '.join(df[df['cluster'] == i]['content'])
    cluster_summaries[i] = summarize_cluster(cluster_text)
    end_time = time.perf_counter()
    time_taken = end_time - start_time
    print(f"Time taken for summarization: {time_taken} seconds")

# Print summaries
for cluster, summary in cluster_summaries.items():
    print(f"Cluster {cluster} Summary:\n{summary}\n")
