# **Step 1. Data Sources**

In [1]:
# ----------------------------------------------
# Step 1: Data Sources
# ----------------------------------------------
# This script performs the following:
# 1. Loads a manually downloaded Twitter dataset from Kaggle
# 2. Loads the dataset into a Pandas DataFrame
# 3. Extracts a subset of 4,000 tweets for analysis
# ----------------------------------------------

# ---- 1. Environment Setup ----
# Install necessary libraries

# If 'pandas' is not found, it installs it using 'pip' and then imports it.
# Pandas is essential for handling and analyzing structured data in tabular form,
# making it a key tool for data preprocessing, filtering, and manipulation.

try:
    import pandas as pd
except ImportError:  # pandas is missing: install it, then retry the import
    import subprocess
    import sys

    # Run pip through the interpreter that is executing this code so the
    # package lands in the active environment; a bare `pip` found on PATH may
    # belong to a different Python. check_call raises CalledProcessError when
    # the installation fails instead of silently continuing.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pandas"])
    import pandas as pd

# ---- 2. Load the Dataset ----
import os

# Location of the manually downloaded CSV file
csv_path = "./Train.csv"  # Update this path if the file is located elsewhere

# Fail early with a message that names the file that is actually expected
if not os.path.exists(csv_path):
    raise FileNotFoundError(
        f"Dataset not found at {csv_path}. "
        f"Ensure you downloaded and extracted {os.path.basename(csv_path)}."
    )

# Read the dataset into a Pandas DataFrame (the file is decoded as ISO-8859-1)
df = pd.read_csv(csv_path, encoding="ISO-8859-1")

# Display basic information about the dataset
print("\n################################################################################################")
print("Step 1: Data Sources\n")

print("Dataset Loaded Successfully!")
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1], "\n")


print("Original Tweets dataset:")
print(df.head())  # Show the first few rows
print("\n")

# ---- 3. Extract a Subset of (up to) 4,000 Tweets ----
# Since the dataset is large, we take a random subset for efficient analysis computation.
# Only the tweet content ('text') plus its 'brand' and 'emotion' labels are kept.
subset_columns = ['text', 'brand', 'emotion']

# DataFrame.sample raises when n exceeds the number of rows, so cap the
# requested size at the size of the dataset.
subset_size = min(4000, len(df))

# random_state is fixed so the same rows are drawn on every run
df_subset = df[subset_columns].sample(n=subset_size, random_state=42)
print(f"New Tweets subset (columns selected: {', '.join(subset_columns)}):")
print(df_subset.head())  # Check the first few rows
print("\n################################################################################################")

FileNotFoundError: Dataset not found at ./Train.csv. Ensure you extracted Tweets.csv.

# **Step 2. Preprocessing**

In [None]:
# ----------------------------------------------
# Step 2: Preprocessing
# ----------------------------------------------
# This step prepares the text data for BERT-based topic modeling.
# Since BERT is a contextual model, we will:
# 1. Remove URLs and special characters (@mentions and #hashtags are kept)
# 2. Convert emojis and contractions to meaningful words
# 3. Remove extra spaces and format text properly
# 4. Retain capitalization (only cased BERT models make use of it)
# ----------------------------------------------

# ---- 1. Install Necessary Libraries ----
# Ensure required libraries are installed

try:
    import re  # Regular expressions for text cleaning (e.g., removing special characters, URLs)
    import emoji  # Used to convert emojis into text
    import contractions  # To expand shortened words (e.g., "can't" -> "cannot")
except ImportError:  # A third-party module is missing: install, then retry
    import subprocess
    import sys

    # Install through the running interpreter's own pip so the packages end up
    # in the active environment; check_call raises if the installation fails.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "emoji", "contractions"]
    )
    import re
    import emoji
    import contractions

# ---- 2. Define Cleaning Functions ----
# We are analysing the dataset for topic modeling, so @mentions and #hashtags
# are kept: they carry valuable context.

def clean_text(text):
    """Normalize a raw tweet for downstream analysis.

    Steps, in order: expand contractions, strip URLs, convert emojis to text,
    drop every character that is not a letter, digit, whitespace or one of
    ``@ # : _``, then collapse runs of whitespace. Capitalization is kept.

    Args:
        text: Raw tweet text. Non-string values (``None``, or the ``NaN`` that
            pandas uses for empty CSV cells) are treated as an empty tweet.

    Returns:
        The cleaned string, or ``""`` when *text* is not a string.
    """
    # Missing cells arrive as float NaN; the string functions below would
    # raise on them, so map anything that is not a str to an empty tweet.
    if not isinstance(text, str):
        return ""

    text = contractions.fix(text)  # Expand contractions
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = emoji.demojize(text)  # Convert emojis to text

    # Keep @mentions and #hashtags while removing other special characters.
    # ':' and '_' are kept too, presumably so demojized emoji names survive.
    text = re.sub(r'[^a-zA-Z0-9\s@#:_]', '', text)

    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# ---- 3. Apply Preprocessing to Dataset ----
# Run every tweet of the subset through clean_text and keep the result
# next to the original text.
cleaned_tweets = df_subset['text'].apply(clean_text)
df_subset['clean_text'] = cleaned_tweets

# ---- 4. Display Processed Data ----
# One print call with a newline separator produces the same output as the
# three consecutive prints it replaces.
print(
    "Step 2: Preprocessing\n",
    "Preprocessing Complete!\n",
    "Original Tweet vs Cleaned Tweet:",
    sep="\n",
)

# Side-by-side view of the raw and the cleaned text for the first rows
before_after = df_subset[['text', 'clean_text']]
print(before_after.head())
print("\n################################################################################################")

# **Step 3. Exploratory Data Analysis (EDA)**

In [None]:
# ----------------------------------------------
# Step 3: Exploratory Data Analysis (EDA)
# ----------------------------------------------
# This step explores the dataset to gain insights before topic modeling.
# We will:
# 1. Analyze the distribution of tweet lengths
# 2. Generate a word cloud of the most common words
# 3. Identify and analyze hashtags
# 4. Identify and analyze mentions (@username)
# 5. Check for missing values or anomalies
# ----------------------------------------------

# ---- 1. Install Necessary Libraries ----
try:
    import matplotlib.pyplot as plt  # For visualizations
    import seaborn as sns  # For aesthetic statistical visualizations
    from collections import Counter  # For word frequency analysis
    from wordcloud import WordCloud  # For generating word clouds
except ImportError:  # A plotting package is missing: install, then retry
    import subprocess
    import sys

    # Install through the running interpreter's own pip so the packages end up
    # in the active environment; check_call raises if the installation fails.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "matplotlib", "seaborn", "wordcloud"]
    )
    import matplotlib.pyplot as plt
    import seaborn as sns
    from collections import Counter
    from wordcloud import WordCloud

# ---- 2. Check for Missing Values ----
print("Step 3: Exploratory Data Analysis (EDA)\n")

# Number of null cells per column of the subset
missing_values = df_subset.isnull().sum()
print("Missing Values in Each Column:", missing_values, "\n", sep="\n")


def _flatten_lowercase(token_lists):
    """Flatten an iterable of token lists into one list of lowercased tokens."""
    return [token.lower() for tokens in token_lists for token in tokens]


# ---- 3. Analyze Tweet Length Distribution ----
# Length is measured in characters of the cleaned text
df_subset['text_length'] = df_subset['clean_text'].apply(len)

# ---- 4. Identify Common Words (Word Frequency) ----
# Whitespace-split every cleaned tweet and collect all tokens in order
all_words = [word for tweet in df_subset['clean_text'] for word in tweet.split()]
word_counts = Counter(all_words)

# The 20 most frequent tokens, as a DataFrame for display
common_words = word_counts.most_common(20)
df_common_words = pd.DataFrame(common_words, columns=["Word", "Count"])

# ---- 5. Hashtag Analysis ----
# Per-tweet list of '#tag' tokens, then the 10 most frequent (case-insensitive)
hashtag_pattern = re.compile(r'#\w+')
df_subset['hashtags'] = df_subset['clean_text'].apply(hashtag_pattern.findall)
all_hashtags = _flatten_lowercase(df_subset['hashtags'])
hashtag_counts = Counter(all_hashtags).most_common(10)
df_hashtags = pd.DataFrame(hashtag_counts, columns=["Hashtag", "Count"])

# ---- 6. Mentions Analysis (@username) ----
# Same procedure for '@user' tokens
mention_pattern = re.compile(r'@\w+')
df_subset['mentions'] = df_subset['clean_text'].apply(mention_pattern.findall)
all_mentions = _flatten_lowercase(df_subset['mentions'])
mention_counts = Counter(all_mentions).most_common(10)
df_mentions = pd.DataFrame(mention_counts, columns=["Mention", "Count"])

# ---- 7. Create Subplots for EDA ----
# One figure holding a 2x2 grid of panels under a shared title
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
fig.suptitle("Exploratory Data Analysis (EDA)", fontsize=16, fontweight='bold')

# Name the four panels so each plot below reads on its own
ax_lengths, ax_cloud = axes[0, 0], axes[0, 1]
ax_hashtags, ax_mentions = axes[1, 0], axes[1, 1]

# Plot 1: Distribution of Tweet Lengths (histogram with a KDE curve)
sns.histplot(df_subset['text_length'], bins=30, kde=True, ax=ax_lengths)
ax_lengths.set_title("Distribution of Tweet Lengths", fontsize=12)
ax_lengths.set_xlabel("Tweet Length (Characters)")
ax_lengths.set_ylabel("Frequency")

# Plot 2: Word Cloud of Most Common Words
# (the most common words are drawn largest in the cloud)
cloud_text = " ".join(all_words)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(cloud_text)
ax_cloud.imshow(wordcloud, interpolation='bilinear')
ax_cloud.set_title("Word Cloud of Common Words", fontsize=12)
ax_cloud.axis("off")

# Plot 3: Hashtag Analysis (horizontal bars, one per hashtag)
sns.barplot(x="Count", y="Hashtag", data=df_hashtags, ax=ax_hashtags)
ax_hashtags.set_title("Top 10 Most Used Hashtags", fontsize=12)
ax_hashtags.set_xlabel("Count")
ax_hashtags.set_ylabel("Hashtag")

# Plot 4: Mentions Analysis (@username)
sns.barplot(x="Count", y="Mention", data=df_mentions, ax=ax_mentions)
ax_mentions.set_title("Top 10 Most Mentioned Users", fontsize=12)
ax_mentions.set_xlabel("Count")
ax_mentions.set_ylabel("Mention")

# Adjust layout for better spacing: the top 5% of the figure is reserved for
# the big title and extra vertical padding is added between the rows.
plt.tight_layout(rect=[0, 0, 1, 0.95], h_pad=4)
plt.show()

# ---- 8. Display Insights ----
# Summary statistics of the character lengths computed above
print("Tweet Length Summary Statistics:")
print(df_subset['text_length'].describe())

# Each frequency table is printed under its own heading
for heading, table in (
    ("\nMost Common Words:", df_common_words),
    ("\nMost Used Hashtags:", df_hashtags),
    ("\nMost Mentioned Users:", df_mentions),
):
    print(heading)
    print(table)

print("\n################################################################################################\n")

# **Step 4. Feature Transformation & Topic Modeling**

In [None]:
# Step 4: Feature Transformation & Topic Modeling
# --------------------------------------------------
# In this step, we convert text data into numerical features and apply topic modeling.
# We will:
# 1. Transform text using TF-IDF (Term Frequency-Inverse Document Frequency)
# 2. Apply clustering algorithms: K-Means, DBSCAN
# 3. Use LDA (Latent Dirichlet Allocation) for topic extraction
# --------------------------------------------------

# ---- 1. Install Necessary Libraries ----
try:
    from sklearn.feature_extraction.text import TfidfVectorizer  # Convert text to numerical features
    from sklearn.cluster import KMeans, DBSCAN  # Clustering algorithms
    from sklearn.decomposition import LatentDirichletAllocation  # Topic modeling
    import numpy as np  # Numerical operations
    from textblob import TextBlob  # Sentiment analysis
except ImportError:  # A required package is missing: install, then retry
    import subprocess
    import sys

    # Install through the running interpreter's own pip so the packages end up
    # in the active environment; check_call raises if the installation fails.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "scikit-learn", "numpy", "textblob"]
    )
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.cluster import KMeans, DBSCAN
    from sklearn.decomposition import LatentDirichletAllocation
    import numpy as np
    from textblob import TextBlob
print("Step 4: Feature Transformation & Topic Modeling\n")

# ---- 3. Transform Text Using TF-IDF ----
# Vocabulary is capped at 5000 terms and English stop words are dropped
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = vectorizer.fit_transform(df_subset['clean_text'])

# Dense, column-labelled copy of the matrix for easier inspection
df_tfidf = pd.DataFrame(
    X_tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
print("TF-IDF Transformation Completed. Shape:", df_tfidf.shape, "\n")

# ---- 4. Apply K-Means Clustering ----
n_clusters = 5  # Number of clusters to form
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans.fit(X_tfidf)
df_subset['cluster_kmeans'] = kmeans.labels_  # Cluster label of every tweet
print("K-Means Clustering Applied. Number of Clusters:", n_clusters, "\n")

# ---- 5. Apply DBSCAN Clustering ----
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(X_tfidf)
df_subset['cluster_dbscan'] = dbscan.labels_  # Cluster label of every tweet
print("DBSCAN Clustering Applied.\n")

# ---- 6. Apply LDA for Topic Modeling ----
n_topics = 5  # Define number of topics
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
# NOTE(review): LDA is fitted on TF-IDF weights here, whereas scikit-learn's
# own LDA examples fit it on raw term counts (CountVectorizer). Left unchanged
# so results stay comparable; consider switching.
lda.fit(X_tfidf)

# Extract topics: print the 10 highest-weighted terms of each topic.
# The inner index has its own name (it used to reuse the loop variable `i`),
# and the terms are listed from most to least important.
terms = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top_term_ids = topic.argsort()[-10:][::-1]
    print(f"Topic {topic_idx+1}:", [terms[term_id] for term_id in top_term_ids])

# ---- 7. Assign the Dominant LDA Topic to Each Tweet ----
# lda.transform gives one row of topic weights per tweet; the column with the
# largest weight is stored as that tweet's dominant topic.
doc_topics = lda.transform(X_tfidf)
df_subset['dominant_topic'] = doc_topics.argmax(axis=1)

# ---- 8. Create a Single Window with 3 Subplots ----
fig, axes = plt.subplots(1, 3, figsize=(18, 6))  # 1 row, 3 columns

# Add a big title for the entire figure
fig.suptitle("Comparison of K-Means, DBSCAN, and LDA Distributions", fontsize=16)

# One count plot per labelling: (column in df_subset, panel title, x-axis label)
panels = (
    ('cluster_kmeans', "K-Means Cluster Distribution", "K-Means Clusters"),
    ('cluster_dbscan', "DBSCAN Cluster Distribution", "DBSCAN Clusters"),
    ('dominant_topic', "LDA Dominant Topic Distribution", "Dominant Topic"),
)
for panel_ax, (column, panel_title, x_label) in zip(axes, panels):
    sns.countplot(ax=panel_ax, x=column, data=df_subset)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("Tweet Count")

# Adjust layout to avoid overlap
plt.tight_layout()
plt.show()

print("\n################################################################################################\n")

# **Step 5. BERT-Based Feature Transformation & Topic Modeling**

In [None]:
# Step 5: BERT-Based Feature Transformation & Topic Modeling
# --------------------------------------------------
# In this step, we leverage BERT embeddings for text representation and apply clustering.
# We will:
# 1. Convert text into BERT embeddings (768-dimensional vector representation)
# 2. Apply K-Means clustering to group similar tweets
# 3. Visualize the clusters using PCA for dimensionality reduction
# --------------------------------------------------

# ---- 1. Install Necessary Libraries ----
try:
    from transformers import BertTokenizer, BertModel  # Pre-trained BERT model & tokenizer
    import torch  # Deep learning framework for BERT
    from sklearn.decomposition import PCA  # Dimensionality reduction for visualization
except ImportError:  # A required package is missing: install, then retry
    import subprocess
    import sys

    # Install through the running interpreter's own pip so the packages end up
    # in the active environment; check_call raises if the installation fails.
    subprocess.check_call(
        [
            sys.executable, "-m", "pip", "install",
            "transformers", "torch", "scikit-learn", "numpy", "pandas", "matplotlib",
        ]
    )
    from transformers import BertTokenizer, BertModel
    import torch
    from sklearn.decomposition import PCA
# ---- 2. Load Pre-trained BERT Model & Tokenizer ----
print("Step 5: BERT-Based Feature Transformation & Topic Modeling\n")

# Tokenizer and model are loaded from the same pre-trained checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# ---- 3. Convert Text into BERT Embeddings ----
def get_bert_embedding(text):
    """Embed *text* with the module-level BERT ``tokenizer`` and ``model``.

    The text is tokenized (truncated to at most 128 tokens) and passed through
    the model without gradient tracking. The hidden state at sequence position
    0 is returned as a flat NumPy array (768 values for this checkpoint).
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )

    # Inference only, so no gradients need to be recorded
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 of the sequence serves as the sentence-level embedding
    first_token_state = hidden_states[:, 0, :]
    return first_token_state.numpy().flatten()

# Embed every cleaned tweet; each cell of the new column holds one vector
tweet_vectors = df_subset["clean_text"].apply(get_bert_embedding)
df_subset["bert_embedding"] = tweet_vectors

# Stack the per-tweet vectors into a single 2-D NumPy array (one row per tweet)
X_bert = np.vstack(df_subset["bert_embedding"].to_numpy())
print("BERT Embeddings Generated. Shape:", X_bert.shape, "\n")  # Should be (num_samples, 768)

# ---- 4. Apply K-Means Clustering ----
n_clusters = 5  # Number of clusters to form
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans.fit(X_bert)
df_subset["bert_cluster"] = kmeans.labels_  # Cluster label of every tweet

print("K-Means Clustering Applied. Number of Clusters:", n_clusters, "\n")

# ---- 5. Visualize Clusters Using PCA ----
# Project the embeddings onto two principal components for plotting
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_bert)
first_component = X_pca[:, 0]
second_component = X_pca[:, 1]

# Scatter plot of the projected tweets, coloured by K-Means cluster
plt.figure(figsize=(8, 6))
plt.scatter(
    first_component,
    second_component,
    c=df_subset["bert_cluster"],
    cmap="viridis",
    alpha=0.6,
)
plt.title("BERT-Based Tweet Clusters")
plt.xlabel("PCA Dimension 1")
plt.ylabel("PCA Dimension 2")
plt.colorbar(label="Cluster")
plt.show()

print("\n################################################################################################\n")

# **Step 6. Focus on iPhone Brand and Apply Topic Modeling & Clustering**

In [None]:
# Step 6: Focus on iPhone Brand and Apply Topic Modeling & Clustering
# --------------------------------------------------
# In this step, we will:
# 1. Filter tweets mentioning the iPhone brand (or related hashtags).
# 2. Apply TF-IDF transformation to the filtered tweets.
# 3. Apply clustering algorithms (K-Means, DBSCAN).
# 4. Perform topic modeling with LDA to extract topics related to iPhone.
# 5. Optionally, apply sentiment analysis to the iPhone-related tweets.

# ---- 1. Filter Tweets Mentioning iPhone ----
iphone_keywords = ['#iPhone', 'iPhone', 'iphone', 'iPhone6', 'iPhoneX', 'iPhone13', 'iPhone12', 'iPhone14', '#Apple']

# The keywords are OR-ed into a single pattern and matched case-insensitively;
# na=False makes rows without text count as "no match".
iphone_pattern = '|'.join(iphone_keywords)
iphone_mask = df_subset['clean_text'].str.contains(iphone_pattern, case=False, na=False)

# Take an explicit copy: new columns are assigned to iphone_df further down,
# and assigning to a boolean-mask slice of df_subset triggers pandas'
# SettingWithCopyWarning and leaves it unclear which frame is modified.
iphone_df = df_subset[iphone_mask].copy()

# Display the number of iPhone-related tweets
print(f"Number of iPhone-related tweets: {iphone_df.shape[0]}\n")

# ---- 2. Apply TF-IDF Transformation ----
# A fresh vectorizer is fitted on the iPhone-related tweets only
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf_iphone = vectorizer.fit_transform(iphone_df['clean_text'])

# Dense, column-labelled copy of the matrix for easier inspection
df_tfidf_iphone = pd.DataFrame(
    X_tfidf_iphone.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
print("TF-IDF Transformation Completed for iPhone-related tweets. Shape:", df_tfidf_iphone.shape, "\n")

# ---- 3. Apply K-Means Clustering to iPhone Tweets ----
n_clusters_iphone = 5  # Number of clusters to form
kmeans_iphone = KMeans(n_clusters=n_clusters_iphone, random_state=42, n_init=10)
kmeans_iphone.fit(X_tfidf_iphone)
iphone_df['cluster_kmeans'] = kmeans_iphone.labels_  # Cluster label of every tweet
print("K-Means Clustering Applied on iPhone tweets. Number of Clusters:", n_clusters_iphone, "\n")

# ---- 4. Apply DBSCAN Clustering to iPhone Tweets ----
dbscan_iphone = DBSCAN(eps=0.5, min_samples=5)
dbscan_iphone.fit(X_tfidf_iphone)
iphone_df['cluster_dbscan'] = dbscan_iphone.labels_  # Cluster label of every tweet
print("DBSCAN Clustering Applied on iPhone tweets.\n")

# ---- 5. Apply LDA for Topic Modeling on iPhone Tweets ----
n_topics_iphone = 5  # Define number of topics for iPhone-related tweets
lda_iphone = LatentDirichletAllocation(n_components=n_topics_iphone, random_state=42)
# NOTE(review): LDA is fitted on TF-IDF weights here, whereas scikit-learn's
# own LDA examples fit it on raw term counts (CountVectorizer). Left unchanged
# so results stay comparable; consider switching.
lda_iphone.fit(X_tfidf_iphone)

# Extract and display the 10 highest-weighted terms of each topic.
# The inner index has its own name (it used to reuse the loop variable `i`),
# and the terms are listed from most to least important.
terms = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda_iphone.components_):
    top_term_ids = topic.argsort()[-10:][::-1]
    print(f"Topic {topic_idx+1} (iPhone-related):", [terms[term_id] for term_id in top_term_ids])

# ---- 6. Apply Sentiment Analysis on iPhone Tweets ----

# Function to analyze sentiment
def get_sentiment(text):
    """Return the TextBlob polarity score of *text*.

    Polarity ranges from -1 (negative) to 1 (positive).
    """
    return TextBlob(text).sentiment.polarity

# Score every iPhone-related tweet and store the polarity next to it
sentiment_scores = iphone_df['clean_text'].apply(get_sentiment)
iphone_df['sentiment'] = sentiment_scores

# Summary statistics of the polarity scores
print("\nSentiment distribution for iPhone-related tweets:")
sentiment_summary = iphone_df['sentiment'].describe()
print(sentiment_summary)

# ---- 7. Visualizing K-Means Clusters ----
import seaborn as sns
import matplotlib.pyplot as plt

# Number of iPhone-related tweets per K-Means cluster; seaborn returns the
# axes it drew on, so the labels are set directly on that object.
cluster_ax = sns.countplot(x='cluster_kmeans', data=iphone_df)
cluster_ax.set_title("Cluster Distribution of iPhone-related Tweets (K-Means)")
cluster_ax.set_xlabel("Cluster Number")
cluster_ax.set_ylabel("Tweet Count")
plt.show()

# ---- 8. Visualizing Sentiment Distribution for iPhone Tweets ----
# Histogram of polarity scores with a KDE curve on top
sentiment_ax = sns.histplot(iphone_df['sentiment'], kde=True)
sentiment_ax.set_title("Sentiment Distribution of iPhone-related Tweets")
sentiment_ax.set_xlabel("Sentiment Score")
sentiment_ax.set_ylabel("Tweet Count")
plt.show()

print("\n################################################################################################\n")