In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch

# Machine Learning & Clustering
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.manifold import TSNE

# NLP & GenAI
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Configuration
# Shared seed: later cells pass RANDOM_STATE to PCA and KMeans, so it must be
# defined here for the notebook to survive Restart Kernel -> Run All.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
pd.set_option('display.max_colwidth', 150)

# Select Device (GPU if available, else CPU); the SentenceTransformer cell
# receives this value through its `device` argument.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import os

# Pin the OpenMP / MKL / OpenBLAS thread pools to a single thread each. The
# original author reports kernel crashes during clustering unless these are 1.
# NOTE(review): this cell sits after the cell that imports numpy/sklearn/torch;
# such variables are usually read when those libraries initialise, so on a fresh
# Restart & Run All this may take effect too late - consider running it first.
os.environ.update({
    "OMP_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "OPENBLAS_NUM_THREADS": "1",
})

In [None]:
# Read the raw tweets CSV (path is relative to the notebook's working directory)
# into `df`, then report how many rows were loaded.
file_path = '../dataset/Tweets.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

print(f"Dataset Successfully Loaded: {df.shape[0]} rows.")

# Cleaning & Preprocessing 
def clean_tweet(text):
    """Normalise a raw tweet before it is embedded.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (e.g. NaN/None)
            is treated as missing.

    Returns:
        The lowercased tweet with every whitespace-separated token that
        starts with '@' removed and the remaining tokens joined by single
        spaces; an empty string for non-string input.
    """
    # Guard clause: missing / non-text values become an empty tweet.
    if not isinstance(text, str):
        return ""

    tokens = text.lower().split()
    # Only tokens that *begin* with '@' count as mentions (e.g. "@united");
    # a token such as ".@united" is kept unchanged.
    kept_tokens = [token for token in tokens if not token.startswith('@')]

    # Re-joining split tokens collapses whitespace runs and leaves no
    # leading or trailing blanks, so no further stripping is needed.
    return " ".join(kept_tokens)

# Add the normalised text as a new column on the loaded frame.
df['cleaned_text'] = df['text'].apply(clean_tweet)

# Keep only rows whose cleaned text is longer than 5 characters (this drops
# tweets that are empty or nearly empty once mentions are removed) and
# renumber the index from 0.
has_content = df['cleaned_text'].str.len() > 5
df = df.loc[has_content].reset_index(drop=True)

# Downstream cells work on an independent copy of the cleaned frame.
df_sample = df.copy()

print(f"Data Cleaned & Ready. Working with {len(df_sample)} tweets.")
display(df_sample[['airline_sentiment', 'cleaned_text']].head())

In [None]:
print("Loading SBERT model...")
# 'all-MiniLM-L6-v2' was picked by the original author for its balance of
# speed and performance.
# NOTE(review): `device` is not assigned in this cell; it must already exist
# in the kernel namespace when this runs.
embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

print("Generating embeddings for all tweets...")
# Encode the cleaned tweets in one call, passed as a plain Python list.
tweet_texts = df_sample['cleaned_text'].tolist()
embeddings = embedder.encode(tweet_texts, show_progress_bar=True)

print(f"Embedding Shape: {embeddings.shape}")

In [None]:
# Reduce dimensions to preserve 95% variance
# A fractional n_components asks PCA to choose the number of components itself
# so that the retained components account for that share of the variance.
pca = PCA(n_components=0.95, random_state=RANDOM_STATE)
# Fit on the sentence embeddings and project them in a single call; the
# clustering cells below operate on `embeddings_pca`, not the raw embeddings.
embeddings_pca = pca.fit_transform(embeddings)

# Report the column count before/after and the total variance actually kept.
print(f"Original Dimensions: {embeddings.shape[1]}")
print(f"Reduced Dimensions: {embeddings_pca.shape[1]}")
print(f"Explained Variance Ratio: {np.sum(pca.explained_variance_ratio_):.2f}")

In [None]:
# Determining best K value for K Means
# For every candidate k in 2..30, fit KMeans on the PCA-reduced embeddings and
# record (k, silhouette, Davies-Bouldin, inertia) as one tuple in `results`.
results = []
for k in range(2, 31):
    # Use the notebook-wide RANDOM_STATE (as the PCA and final KMeans cells do)
    # instead of a separately hard-coded seed, so all stages stay in sync.
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
    labels = km.fit_predict(embeddings_pca)

    # Silhouette: higher is better. Davies-Bouldin: lower is better.
    sil = silhouette_score(embeddings_pca, labels)
    db = davies_bouldin_score(embeddings_pca, labels)
    # Inertia feeds the elbow plot drawn after the sweep.
    inertia = km.inertia_

    results.append((k, sil, db, inertia))

    print(f"k={k}: silhouette={sil:.4f}, db={db:.4f}, inertia={inertia:.2f}")

# Transpose the per-k result tuples into one list per metric.
ks, sil_scores, db_scores, inertias = (list(column) for column in zip(*results))

# One row of three panels, each plotting a metric against k.
fig, axes = plt.subplots(1, 3, figsize=(14, 4))
panels = [
    (sil_scores, "Silhouette Score"),
    (db_scores, "Davies-Bouldin Score"),
    (inertias, "Elbow (Inertia)"),
]
for ax, (series, title) in zip(axes, panels):
    ax.plot(ks, series)
    ax.set_title(title)
    ax.set_xlabel("k")

fig.tight_layout()
plt.show()

# Pick the k each internal metric prefers. np.argmax / np.argmin return the
# position of the first extreme value, which is then mapped back to its k.

# Max silhouette (higher is better)
best_sil_k = ks[np.argmax(sil_scores)]

# Min DB index (lower is better)
best_db_k = ks[np.argmin(db_scores)]

print(f"Best k by Silhouette Score: {best_sil_k}")
# The label previously contained a mis-encoded dash; use a plain hyphen so it
# matches the "Davies-Bouldin" spelling used in the other outputs.
print(f"Best k by Davies-Bouldin Score: {best_db_k}")


In [None]:
# Cluster count chosen by the author after reviewing the k-sweep diagnostics.
num_clusters = 17

print(f"Running KMeans with {num_clusters} clusters...")
# Fit once, then read the per-tweet assignments from the fitted estimator.
kmeans = KMeans(n_clusters=num_clusters, random_state=RANDOM_STATE, n_init=10)
kmeans.fit(embeddings_pca)
kmeans_labels = kmeans.labels_

# Store each tweet's cluster id alongside its text.
df_sample['kmeans_cluster'] = kmeans_labels

# Internal validation metrics for the chosen clustering.
sil_score = silhouette_score(embeddings_pca, kmeans_labels)
db_score = davies_bouldin_score(embeddings_pca, kmeans_labels)

print("KMeans Results")
# A higher silhouette score is better; a lower Davies-Bouldin score is better.
print(f"Silhouette Score: {sil_score:.4f}")
print(f"Davies-Bouldin Score: {db_score:.4f}")

# Number of tweets per cluster, ordered by cluster id.
print("\nCluster Distribution:")
cluster_sizes = df_sample['kmeans_cluster'].value_counts().sort_index()
print(cluster_sizes)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Two side-by-side panels: cluster sizes (left) and a 2-D PCA scatter (right).
fig, (ax_counts, ax_scatter) = plt.subplots(1, 2, figsize=(18, 7))

# --- Panel 1: tweets per cluster, ordered by cluster id ---
counts = df_sample['kmeans_cluster'].value_counts().sort_index()
sns.barplot(x=counts.index, y=counts.values, palette='viridis', ax=ax_counts)
ax_counts.set_title(f'Distribution of {num_clusters} Clusters')
ax_counts.set_xlabel('Cluster ID')
ax_counts.set_ylabel('Number of Tweets')
ax_counts.grid(axis='y', linestyle='--', alpha=0.7)

# --- Panel 2: tweets on the first two PCA components, coloured by cluster ---
# Only the first two columns of the PCA output are drawn; clustering itself
# used all retained components.
sns.scatterplot(
    x=embeddings_pca[:, 0],
    y=embeddings_pca[:, 1],
    hue=kmeans_labels,
    palette='tab20',  # 'tab20' has enough distinct colors for 17 clusters
    s=50,
    alpha=0.6,
    legend='full',
    ax=ax_scatter,
)
ax_scatter.set_title('Clusters Visualized on First 2 PCA Components')
ax_scatter.set_xlabel('PCA Component 1')
ax_scatter.set_ylabel('PCA Component 2')
# Anchor the legend outside the axes so it does not cover the points.
ax_scatter.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Cluster ID', ncol=1)

fig.tight_layout()
plt.show()

In [None]:
# Figuring out DBSCAN optimal eps value

from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import numpy as np


# Neighbourhood size used for the k-distance curve.
# NOTE(review): the DBSCAN cell below uses MIN_SAMPLES = 20 while this curve
# uses k = 50 - confirm the mismatch is intentional.
k = 50
nbrs = NearestNeighbors(n_neighbors=k)
nbrs.fit(embeddings_pca)
distances, indices = nbrs.kneighbors(embeddings_pca)

# Take the last of the k returned distances for every point and order them
# from largest to smallest; the bend in this curve suggests a value for eps.
kth_neighbor_dist = distances[:, k - 1]
distance_desc = sorted(kth_neighbor_dist, reverse=True)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(distance_desc)
ax.set_title('K-Distance Graph (Finding the Elbow for eps)')
ax.set_ylabel('Distance (eps)')
ax.set_xlabel('Points sorted by distance')
ax.grid(True, which='both', linestyle='--', linewidth=0.5)

plt.show()

In [None]:
# DBSCAN hyper-parameters: neighbourhood radius and minimum neighbourhood size.
EPS_VALUE = 0.8
MIN_SAMPLES = 20

print(f"Running DBSCAN (eps={EPS_VALUE}, min_samples={MIN_SAMPLES})...")
# Fit on the PCA-reduced embeddings and read the assignments from the estimator.
dbscan = DBSCAN(eps=EPS_VALUE, min_samples=MIN_SAMPLES)
dbscan.fit(embeddings_pca)
dbscan_labels = dbscan.labels_

# Store each tweet's DBSCAN label alongside its text.
df_sample['dbscan_cluster'] = dbscan_labels

# Label -1 marks noise points, so it is not counted as a cluster.
noise_mask = dbscan_labels == -1
n_noise = int(noise_mask.sum())
n_clusters_db = len(set(dbscan_labels)) - (1 if n_noise > 0 else 0)
noise_percent = (n_noise / len(df_sample)) * 100

print("DBSCAN Results")
print(f"Estimated number of clusters: {n_clusters_db}")
print(f"Noise points (Cluster -1): {n_noise} ({noise_percent:.1f}%)")