In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch

# Machine Learning & Clustering
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.manifold import TSNE

# NLP & GenAI
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Configuration
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
pd.set_option('display.max_colwidth', 150)

# Select Device (GPU if available, else CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cpu


In [3]:
# Load Data
file_path = '../dataset/Tweets.csv'

df = pd.read_csv(file_path)

print(f"Dataset Successfully Loaded: {df.shape[0]} rows.")

# Cleaning & Preprocessing 
def clean_tweet(text):
    if not isinstance(text, str): return ""
    # 1. Change to lowercase
    text = text.lower()
    # 2. Remove mentions (for e.g, @United)
    text = " ".join([word for word in text.split() if not word.startswith('@')])
    # 3. Remove whitespace
    return text.strip()

# Apply cleaning
df['cleaned_text'] = df['text'].apply(clean_tweet)

# Filter out empty tweets after cleaning
df = df[df['cleaned_text'].str.len() > 5].reset_index(drop=True)

df_sample = df.copy()

print(f"Data Cleaned & Ready. Working with {len(df_sample)} tweets.")
display(df_sample[['airline_sentiment', 'cleaned_text']].head())

Dataset Successfully Loaded: 14640 rows.
Data Cleaned & Ready. Working with 14601 tweets.


Unnamed: 0,airline_sentiment,cleaned_text
0,neutral,what said.
1,positive,plus you've added commercials to the experience... tacky.
2,neutral,i didn't today... must mean i need to take another trip!
3,negative,"it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse"
4,negative,and it's a really big bad thing about it
