# Twitter Sentiment Analysis - Data Preprocessing

This notebook handles comprehensive data preprocessing for the Twitter sentiment analysis project.

## Objectives
1. Load and clean the dataset
2. Handle Twitter-specific elements (hashtags, mentions, URLs, emojis)
3. Implement text preprocessing (tokenization, lemmatization, stopword removal)
4. Address class imbalance using SMOTE and weighted sampling
5. Create feature extraction pipelines:
   - TF-IDF vectorization
   - Word2Vec embeddings
   - GloVe embeddings
6. Save processed data for model training

## Preprocessing Pipeline
1. **Text Cleaning**: Remove noise, normalize text
2. **Twitter Elements**: Handle hashtags, mentions, URLs, emojis
3. **Tokenization**: Split text into tokens
4. **Normalization**: Lemmatization, stemming
5. **Feature Extraction**: Create multiple feature representations
6. **Class Balancing**: Address imbalanced dataset


In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import warnings
from collections import Counter
import pickle
import os

# Text processing libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet
from textblob import TextBlob

# Feature extraction libraries
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

# Word embeddings
import gensim
from gensim.models import Word2Vec
import requests
import zipfile

# Set up
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")
print(f"Current working directory: {os.getcwd()}")

# Download required NLTK data
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
    print("NLTK data downloaded successfully!")
except Exception as e:
    print(f"NLTK download issue: {e}")

# Small-run flag for laptops with limited RAM (16GB RAM, i5 11th gen CPU)
# Using 20k tweets for fast training while maintaining good results
SMALL_RUN = True
SAMPLE_SIZE_TO_USE = 20000  # Optimized for laptop performance

# Ensure directories exist
os.makedirs('../models/saved_models', exist_ok=True)
os.makedirs('../data/processed', exist_ok=True)
os.makedirs('../reports/figures', exist_ok=True)


Libraries imported successfully!
Current working directory: c:\Users\acer\Desktop\ml_proj\notebooks
NLTK data downloaded successfully!


In [2]:
# Load the dataset
print("Loading Sentiment140 dataset...")

# Define column names
columns = ['sentiment', 'tweet_id', 'date', 'query', 'username', 'tweet_text']

# Load dataset with a small sample for fast processing
# IMPORTANT: The dataset is ordered (negatives first, then positives)
# We need to sample from both sections to get balanced classes
print(f"Loading {SAMPLE_SIZE_TO_USE:,} tweets for preprocessing...")
print("Note: Sampling from both negative and positive sections to ensure balanced classes...")

# Strategy: Load samples from both halves of the dataset
# Sentiment140 has ~800k negatives (sentiment=0) followed by ~800k positives (sentiment=4)
half_sample = SAMPLE_SIZE_TO_USE // 2

# Load from negative section (sample from first 400k rows to save memory)
print("Loading negative samples from first half...")
df_neg_all = pd.read_csv('../sentiment140.csv', header=None, names=columns, encoding='latin-1', 
                         nrows=400000)
df_neg = df_neg_all[df_neg_all['sentiment'] == 0].sample(n=half_sample, random_state=42).reset_index(drop=True)
del df_neg_all

# Load from positive section (sample from positive region, starting at row 800k)
print("Loading positive samples from second half...")
df_pos_all = pd.read_csv('../sentiment140.csv', header=None, names=columns, encoding='latin-1', 
                         skiprows=800000, nrows=400000)
df_pos = df_pos_all[df_pos_all['sentiment'] == 4].sample(n=half_sample, random_state=42).reset_index(drop=True)
del df_pos_all

# Combine both and shuffle
df = pd.concat([df_neg, df_pos], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
del df_neg, df_pos  # Free memory

print(f"Dataset loaded: {df.shape}")

# Convert sentiment labels
sentiment_mapping = {0: 'Negative', 4: 'Positive'}
df['sentiment_label'] = df['sentiment'].map(sentiment_mapping)

# Create binary labels for modeling
df['sentiment_binary'] = (df['sentiment'] == 4).astype(int)

print(f"\nDataset shape: {df.shape}")
print(f"Sentiment distribution:")
print(df['sentiment_label'].value_counts())
print(f"Binary labels distribution: {df['sentiment_binary'].value_counts().to_dict()}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


Loading Sentiment140 dataset...
Loading 20,000 tweets for preprocessing...
Note: Sampling from both negative and positive sections to ensure balanced classes...
Loading negative samples from first half...
Loading positive samples from second half...
Loading positive samples from second half...
Dataset loaded: (20000, 6)

Dataset shape: (20000, 8)
Sentiment distribution:
sentiment_label
Positive    10000
Negative    10000
Name: count, dtype: int64
Binary labels distribution: {1: 10000, 0: 10000}
Memory usage: 7.57 MB
Dataset loaded: (20000, 6)

Dataset shape: (20000, 8)
Sentiment distribution:
sentiment_label
Positive    10000
Negative    10000
Name: count, dtype: int64
Binary labels distribution: {1: 10000, 0: 10000}
Memory usage: 7.57 MB


In [3]:
# Text preprocessing functions
class TwitterTextPreprocessor:
    """
    Comprehensive text preprocessing for Twitter data
    """
    
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.tweet_tokenizer = TweetTokenizer()
        
        # Twitter-specific patterns
        self.url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        self.mention_pattern = re.compile(r'@\\w+')
        self.hashtag_pattern = re.compile(r'#\\w+')
        self.emoji_pattern = re.compile(r'[\\U0001F600-\\U0001F64F\\U0001F300-\\U0001F5FF\\U0001F680-\\U0001F6FF\\U0001F1E0-\\U0001F1FF]')
        
    def clean_text(self, text):
        """Basic text cleaning"""
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Remove URLs
        text = self.url_pattern.sub('URL', text)
        
        # Remove mentions
        text = self.mention_pattern.sub('MENTION', text)
        
        # Handle hashtags (keep the word, remove #)
        text = self.hashtag_pattern.sub(lambda m: m.group(0)[1:], text)
        
        # Remove emojis
        text = self.emoji_pattern.sub('', text)
        
        # Remove extra whitespace
        text = re.sub(r'\\s+', ' ', text)
        
        # Remove punctuation except for some important ones
        text = re.sub(r'[^a-zA-Z0-9\\s]', ' ', text)
        
        return text.strip()
    
    def tokenize_text(self, text):
        """Tokenize text using TweetTokenizer"""
        return self.tweet_tokenizer.tokenize(text)
    
    def remove_stopwords(self, tokens):
        """Remove stopwords from tokens"""
        return [token for token in tokens if token not in self.stop_words]
    
    def lemmatize_tokens(self, tokens):
        """Lemmatize tokens"""
        return [self.lemmatizer.lemmatize(token) for token in tokens]
    
    def stem_tokens(self, tokens):
        """Stem tokens"""
        return [self.stemmer.stem(token) for token in tokens]
    
    def preprocess_pipeline(self, text, use_lemmatization=True, use_stemming=False):
        """Complete preprocessing pipeline"""
        # Clean text
        cleaned_text = self.clean_text(text)
        
        # Tokenize
        tokens = self.tokenize_text(cleaned_text)
        
        # Remove stopwords
        tokens = self.remove_stopwords(tokens)
        
        # Apply lemmatization or stemming
        if use_lemmatization:
            tokens = self.lemmatize_tokens(tokens)
        elif use_stemming:
            tokens = self.stem_tokens(tokens)
        
        # Filter out empty tokens
        tokens = [token for token in tokens if len(token) > 1]
        
        return ' '.join(tokens)

# Initialize preprocessor
preprocessor = TwitterTextPreprocessor()
print("Text preprocessor initialized successfully!")


Text preprocessor initialized successfully!


In [4]:
# Apply text preprocessing
print("Applying text preprocessing...")
print(f"Processing {len(df):,} tweets...")

# Use the full dataset (already limited to SAMPLE_SIZE_TO_USE)
df_sample = df.copy()

# Apply preprocessing
print("Preprocessing tweets...")
df_sample['cleaned_text'] = df_sample['tweet_text'].apply(
    lambda x: preprocessor.preprocess_pipeline(x, use_lemmatization=True)
)

# Remove empty tweets after preprocessing
initial_count = len(df_sample)
df_sample = df_sample[df_sample['cleaned_text'].str.strip() != '']
final_count = len(df_sample)

print(f"Preprocessing completed!")
print(f"Tweets before: {initial_count:,}")
print(f"Tweets after: {final_count:,}")
print(f"Removed empty tweets: {initial_count - final_count:,}")

# Show sample of preprocessed text
print("\nSample of preprocessed tweets:")
for i in range(3):
    print(f"Original: {df_sample.iloc[i]['tweet_text']}")
    print(f"Cleaned:  {df_sample.iloc[i]['cleaned_text']}")
    print("-" * 50)


Applying text preprocessing...
Processing 20,000 tweets...
Preprocessing tweets...
Preprocessing completed!
Tweets before: 20,000
Tweets after: 19,990
Removed empty tweets: 10

Sample of preprocessed tweets:
Original: @MitchBenn Take a photo, upload to Twitpic, ask Twitter to identify him. 
Cleaned:  mitchbenn take photo upload twitpic ask twitter identify
--------------------------------------------------
Original: Jacked up morning already. I'm in the emergency room. 
Cleaned:  jacked morning already emergency room
--------------------------------------------------
Original: I can't wait for summer to come! I  had snow on my car this morning 
Cleaned:  wait summer come snow car morning
--------------------------------------------------
Preprocessing completed!
Tweets before: 20,000
Tweets after: 19,990
Removed empty tweets: 10

Sample of preprocessed tweets:
Original: @MitchBenn Take a photo, upload to Twitpic, ask Twitter to identify him. 
Cleaned:  mitchbenn take photo upload twitp

In [5]:
# Feature extraction - TF-IDF
print("=== TF-IDF FEATURE EXTRACTION ===")

# Initialize TF-IDF vectorizer (optimized for smaller dataset)
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,   # Reduced vocabulary size for faster processing
    ngram_range=(1, 2),  # Use unigrams and bigrams
    min_df=3,            # Ignore terms that appear in less than 3 documents
    max_df=0.95,         # Ignore terms that appear in more than 95% of documents
    stop_words='english',
    lowercase=True,
    strip_accents='unicode'
)

# Fit and transform the text data
print("Fitting TF-IDF vectorizer...")
X_tfidf = tfidf_vectorizer.fit_transform(df_sample['cleaned_text'])

print(f"TF-IDF matrix shape: {X_tfidf.shape}")
print(f"Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")

# Get feature names
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"Sample feature names: {feature_names[:10]}")

# Save TF-IDF vectorizer (ensure directory exists)
os.makedirs(os.path.dirname('../models/saved_models/tfidf_vectorizer.pkl'), exist_ok=True)
with open('../models/saved_models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)
print("TF-IDF vectorizer saved to models/saved_models/tfidf_vectorizer.pkl")


=== TF-IDF FEATURE EXTRACTION ===
Fitting TF-IDF vectorizer...
TF-IDF matrix shape: (19990, 5000)
Vocabulary size: 5000
Sample feature names: ['aaah' 'aaron' 'ab' 'abandoned' 'abc' 'ability' 'able' 'able make'
 'able sleep' 'able talk']
TF-IDF vectorizer saved to models/saved_models/tfidf_vectorizer.pkl


In [6]:
# Word2Vec embeddings
print("=== WORD2VEC EMBEDDINGS ===")

# Prepare text data for Word2Vec (list of token lists)
print("Preparing text data for Word2Vec...")
texts_for_w2v = [text.split() for text in df_sample['cleaned_text']]

# Train Word2Vec model (optimized for faster training)
print("Training Word2Vec model...")
w2v_model = Word2Vec(
    sentences=texts_for_w2v,
    vector_size=100,      # Embedding dimension
    window=5,             # Context window size
    min_count=3,          # Reduced minimum word frequency for smaller dataset
    workers=2,            # Reduced workers for CPU efficiency
    sg=0,                 # Use CBOW (0) or Skip-gram (1)
    epochs=5              # Reduced epochs for faster training
)

print(f"Word2Vec model trained!")
print(f"Vocabulary size: {len(w2v_model.wv)}")
print(f"Vector dimension: {w2v_model.vector_size}")

# Save Word2Vec model
w2v_model.save('../models/saved_models/word2vec_model.model')
print("Word2Vec model saved to models/saved_models/word2vec_model.model")

# Create sentence embeddings by averaging word vectors
def get_sentence_embedding(text, model):
    """Get sentence embedding by averaging word vectors"""
    words = text.split()
    vectors = []
    for word in words:
        if word in model.wv:
            vectors.append(model.wv[word])
    
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

print("Creating sentence embeddings...")
X_w2v = np.array([get_sentence_embedding(text, w2v_model) for text in df_sample['cleaned_text']])
print(f"Word2Vec embeddings shape: {X_w2v.shape}")


=== WORD2VEC EMBEDDINGS ===
Preparing text data for Word2Vec...
Training Word2Vec model...
Word2Vec model trained!
Vocabulary size: 5314
Vector dimension: 100
Word2Vec model saved to models/saved_models/word2vec_model.model
Creating sentence embeddings...
Word2Vec embeddings shape: (19990, 100)


In [7]:
# Class imbalance handling (do NOT apply SMOTE to sparse TF-IDF matrices — memory issue)
print("=== CLASS IMBALANCE HANDLING ===")

y = df_sample['sentiment_binary']
class_counts = y.value_counts()
imbalance_ratio = class_counts.max() / class_counts.min()
print(f"Imbalance ratio: {imbalance_ratio:.2f}")

if imbalance_ratio > 1.2:
    print("\nApplying SMOTE only to dense features (Word2Vec). TF-IDF kept as-is to avoid memory blowup.")
    # Ensure X_w2v is dense (it is built as dense above)
    smote = SMOTE(random_state=42)
    try:
        X_w2v_balanced, y_w2v_balanced = smote.fit_resample(X_w2v, y)
        X_w2v = X_w2v_balanced
        y = y_w2v_balanced
        print(f"Word2Vec shape after SMOTE: {X_w2v.shape}")
        print(f"Balanced class distribution: {np.bincount(y)}")
    except Exception as e:
        print(f"SMOTE on Word2Vec failed: {e}. Proceeding without SMOTE.")
else:
    print("Dataset is well-balanced or SMOTE not applied. TF-IDF left unchanged.")

# ...existing code...

=== CLASS IMBALANCE HANDLING ===
Imbalance ratio: 1.00
Dataset is well-balanced or SMOTE not applied. TF-IDF left unchanged.


In [8]:
# Train-test split
print("=== TRAIN-TEST SPLIT ===")

# Split the data
X_tfidf_train, X_tfidf_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

X_w2v_train, X_w2v_test, y_w2v_train, y_w2v_test = train_test_split(
    X_w2v, y, test_size=0.2, random_state=42, stratify=y
)

print(f"TF-IDF Train shape: {X_tfidf_train.shape}")
print(f"TF-IDF Test shape: {X_tfidf_test.shape}")
print(f"Word2Vec Train shape: {X_w2v_train.shape}")
print(f"Word2Vec Test shape: {X_w2v_test.shape}")

print(f"\\nTrain labels distribution: {np.bincount(y_train)}")
print(f"Test labels distribution: {np.bincount(y_test)}")

# Save processed data (ensure directories exist)
os.makedirs('../data/processed', exist_ok=True)

# Convert sparse matrices to dense arrays for saving (memory-efficient conversion)
print("Converting sparse matrices to dense arrays...")
if hasattr(X_tfidf_train, 'toarray'):
    X_tfidf_train_dense = X_tfidf_train.toarray()
    X_tfidf_test_dense = X_tfidf_test.toarray()
else:
    X_tfidf_train_dense = X_tfidf_train
    X_tfidf_test_dense = X_tfidf_test

np.save('../data/processed/X_tfidf_train.npy', X_tfidf_train_dense)
np.save('../data/processed/X_tfidf_test.npy', X_tfidf_test_dense)
np.save('../data/processed/y_train.npy', y_train)
np.save('../data/processed/y_test.npy', y_test)

np.save('../data/processed/X_w2v_train.npy', X_w2v_train)
np.save('../data/processed/X_w2v_test.npy', X_w2v_test)

# Save text data for deep learning models
text_train, text_test, _, _ = train_test_split(
    df_sample['cleaned_text'], y, test_size=0.2, random_state=42, stratify=y
)

with open('../data/processed/text_train.pkl', 'wb') as f:
    pickle.dump(text_train.tolist(), f)

with open('../data/processed/text_test.pkl', 'wb') as f:
    pickle.dump(text_test.tolist(), f)

print("All processed data saved successfully!")
print("\\n✅ Preprocessing completed!")
print("Ready for model training in the next notebooks.")


=== TRAIN-TEST SPLIT ===
TF-IDF Train shape: (15992, 5000)
TF-IDF Test shape: (3998, 5000)
Word2Vec Train shape: (15992, 100)
Word2Vec Test shape: (3998, 100)
\nTrain labels distribution: [7996 7996]
Test labels distribution: [1999 1999]
Converting sparse matrices to dense arrays...
All processed data saved successfully!
\n✅ Preprocessing completed!
Ready for model training in the next notebooks.
