In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch

# Machine Learning & Clustering
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.manifold import TSNE

# NLP & GenAI
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Configuration
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
pd.set_option('display.max_colwidth', 150)

# Select Device (GPU if available, else CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cpu


In [2]:
import os
# STRICTLY set this to 1 to prevent kernel crashes on clustering
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"

In [3]:
# Load Data
file_path = '../dataset/Tweets.csv'

df = pd.read_csv(file_path)

print(f"Dataset Successfully Loaded: {df.shape[0]} rows.")

# Cleaning & Preprocessing 
def clean_tweet(text):
    if not isinstance(text, str): return ""
    # 1. Change to lowercase
    text = text.lower()
    # 2. Remove mentions (for e.g, @United)
    text = " ".join([word for word in text.split() if not word.startswith('@')])
    # 3. Remove whitespace
    return text.strip()

# Apply cleaning
df['cleaned_text'] = df['text'].apply(clean_tweet)

# Filter out empty tweets after cleaning
df = df[df['cleaned_text'].str.len() > 5].reset_index(drop=True)

df_sample = df.copy()

print(f"Data Cleaned & Ready. Working with {len(df_sample)} tweets.")
display(df_sample[['airline_sentiment', 'cleaned_text']].head())

Dataset Successfully Loaded: 14640 rows.
Data Cleaned & Ready. Working with 14601 tweets.


Unnamed: 0,airline_sentiment,cleaned_text
0,neutral,what said.
1,positive,plus you've added commercials to the experience... tacky.
2,neutral,i didn't today... must mean i need to take another trip!
3,negative,"it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse"
4,negative,and it's a really big bad thing about it


In [4]:
print("Loading SBERT model...")
# 'all-MiniLM-L6-v2' is optimized for speed and performance
embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

print("Generating embeddings for all tweets...")
embeddings = embedder.encode(df_sample['cleaned_text'].tolist(), show_progress_bar=True)

print(f"Embedding Shape: {embeddings.shape}") 

Loading SBERT model...
Generating embeddings for all tweets...


Batches:   0%|          | 0/457 [00:00<?, ?it/s]

Embedding Shape: (14601, 384)


In [5]:
# Reduce dimensions to preserve 95% variance
pca = PCA(n_components=0.95, random_state=RANDOM_STATE)
embeddings_pca = pca.fit_transform(embeddings)

print(f"Original Dimensions: {embeddings.shape[1]}")
print(f"Reduced Dimensions: {embeddings_pca.shape[1]}")
print(f"Explained Variance Ratio: {np.sum(pca.explained_variance_ratio_):.2f}")

Original Dimensions: 384
Reduced Dimensions: 246
Explained Variance Ratio: 0.95


In [12]:
# This value was determined after a lot of analysis
num_clusters = 17

print(f"Running KMeans with {num_clusters} clusters...")
kmeans = KMeans(n_clusters=num_clusters, random_state=RANDOM_STATE, n_init=10)
kmeans_labels = kmeans.fit_predict(embeddings_pca)

# Add labels to DataFrame
df_sample['kmeans_cluster'] = kmeans_labels

# Evaluation Metrics
sil_score = silhouette_score(embeddings_pca, kmeans_labels)
db_score = davies_bouldin_score(embeddings_pca, kmeans_labels)

print("KMeans Results")
# The higher the silhoutte score, the better, while opposite is true for Davies-Boudin score.
print(f"Silhouette Score: {sil_score:.4f}")
print(f"Davies-Bouldin Score: {db_score:.4f}")

# Show distribution of clusters
print("\nCluster Distribution:")
print(df_sample['kmeans_cluster'].value_counts().sort_index())

Running KMeans with 17 clusters...
KMeans Results
Silhouette Score: 0.0228
Davies-Bouldin Score: 4.0632

Cluster Distribution:
kmeans_cluster
0      879
1     1155
2     1045
3      848
4     1158
5      218
6      704
7      935
8      817
9     1164
10     148
11     463
12     930
13     730
14     745
15    1092
16    1570
Name: count, dtype: int64
