# 🎯 Social Media Sentiment Analysis - 45K Training
## Working with Real Sentiment140 Dataset (1.6M → 45K)

This notebook will:
1. Load a 45,000-tweet sample from the full 1.6M-tweet Sentiment140 dataset
2. Clean and preprocess the tweet text
3. Explore, visualize, and save the cleaned data to prepare for machine learning training

In [None]:
# Cell 1: Setup and imports
import sys
from pathlib import Path

# The project package lives one directory above the notebook, so the parent
# directory must be on sys.path before the `src` imports below.
sys.path.append('..')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src.data.data_collector import DataCollector
from src.data.data_cleaner import DataCleaner

# Startup banner: confirms that every import above resolved before any data
# work begins.
startup_messages = (
    "🚀 Social Media Sentiment Analysis - 45K Training",
    "✅ All imports successful",
    "📊 Ready to process real Twitter data!",
)
print("\n".join(startup_messages))

In [None]:
# Cell 2: Verify that the raw dataset is available
# `collector` is reused by the loading cell further down.
collector = DataCollector()

print("📁 Checking for manually downloaded dataset...")
dataset_info = collector.get_dataset_info()

# A falsy result from get_dataset_info() is treated as "dataset not found",
# in which case the download instructions are printed.
if not dataset_info:
    print("\n❌ Please download and place the dataset first")
    print("📥 Download from: https://www.kaggle.com/datasets/kazanova/sentiment140")
    print("📁 Place at: data/raw/sentiment140.csv")
else:
    print("\n✅ Dataset ready for processing!")

In [None]:
# Cell 3: Load a sample of tweets from the full dataset
# Number of tweets requested from the collector. Change this single value
# (e.g. to 1_600_000 for the whole file) instead of editing the call and the
# progress message separately.
SAMPLE_SIZE = 45000

print(f"📊 Loading {SAMPLE_SIZE:,} tweets from 1.6M dataset...")
print("⏳ This may take a moment to read the large file...")

data = collector.load_data(sample_size=SAMPLE_SIZE)

# load_data() signals failure by returning None, so the overview is only
# printed when a frame actually came back.
if data is not None:
    print(f"\n🎉 Success! Loaded {len(data):,} tweets")

    # Show basic info
    print("\n📊 Dataset Overview:")
    print(f"📏 Shape: {data.shape}")
    print(f"📋 Columns: {list(data.columns)}")
    print(f"💾 Memory usage: {data.memory_usage(deep=True).sum() / 1024**2:.1f}MB")

    # Check for missing values, one status line per column
    print("\n🔍 Data Quality Check:")
    missing = data.isnull().sum()
    for col, miss_count in missing.items():
        if miss_count > 0:
            print(f"   ⚠️ {col}: {miss_count} missing values")
        else:
            print(f"   ✅ {col}: No missing values")
else:
    print("❌ Failed to load data. Check your dataset file.")

In [None]:
# Cell 4: Explore sample tweets
def _show_tweets(heading, tweets):
    """Print `heading`, then each tweet in `tweets` numbered from 1.

    Tweets longer than 120 characters are cut to 120 and suffixed with "...".
    A blank line follows every tweet.
    """
    print(heading)
    for position, (_, row) in enumerate(tweets.iterrows(), 1):
        body = row['text']
        if len(body) > 120:
            print(f"{position}. {body[:120]}...")
        else:
            print(f"{position}. {body}")
        print()


print("Sample tweets from our 45K dataset:\n")

# The filters below treat sentiment 0 as negative and 4 as positive; take the
# first three rows of each.
negative_tweets = data[data['sentiment'] == 0].head(3)
positive_tweets = data[data['sentiment'] == 4].head(3)

_show_tweets("😢 NEGATIVE TWEETS:", negative_tweets)
_show_tweets("😊 POSITIVE TWEETS:", positive_tweets)

# Character-length statistics for the raw tweet text
text_lengths = data['text'].str.len()
average_length = text_lengths.mean()
shortest_length = text_lengths.min()
longest_length = text_lengths.max()
print("📏 Text Length Statistics:")
print(f"   Average: {average_length:.1f} characters")
print(f"   Shortest: {shortest_length} characters")
print(f"   Longest: {longest_length} characters")

In [None]:
# Cell 5: Run the cleaning pipeline on the loaded tweets
print("🧹 Starting cleaning process for 45,000 tweets...")
print("⏳ This will take about 2-3 minutes with progress tracking")

cleaner = DataCleaner()
cleaned_data = cleaner.clean_dataset(data)

# An empty result is reported as a failure; otherwise print how the row count
# and the average text length changed.
if len(cleaned_data) == 0:
    print("❌ Cleaning failed")
else:
    print("\n🎉 Cleaning Success!")

    # Row counts before and after
    rows_before = len(data)
    rows_after = len(cleaned_data)
    print("\n📊 Cleaning Results:")
    print(f"   Original tweets: {rows_before:,}")
    print(f"   After cleaning: {rows_after:,}")
    removed_pct = (rows_before - rows_after) / rows_before * 100
    print(f"   Removal rate: {removed_pct:.1f}%")

    # Average characters per tweet: raw `text` vs. `cleaned_text`
    original_avg = data['text'].str.len().mean()
    cleaned_avg = cleaned_data['cleaned_text'].str.len().mean()
    print("\n📏 Text Length Reduction:")
    print(f"   Original average: {original_avg:.1f} characters")
    print(f"   Cleaned average: {cleaned_avg:.1f} characters")
    shrink_pct = (original_avg - cleaned_avg) / original_avg * 100
    print(f"   Reduction: {shrink_pct:.1f}%")

In [None]:
# Cell 6: Before/After examples
print("🔍 Before vs After Cleaning Examples:\n")


def _original_text_for(position):
    """Return the raw tweet text belonging to the cleaned row at `position`.

    `data` and `cleaned_data` can have different lengths (the cleaning cell
    reports a removal rate), so pairing them purely by position can show the
    BEFORE text of one tweet next to the AFTER text of another. The lookup
    therefore tries, in order:

    1. a `text` column carried along in `cleaned_data`
       (NOTE(review): assumes the cleaner leaves that column unmodified);
    2. the cleaned row's index label in `data`
       (NOTE(review): assumes the cleaner preserves the original index);
    3. the original positional lookup, as a last resort.
    """
    cleaned_row = cleaned_data.iloc[position]
    if 'text' in cleaned_data.columns:
        return cleaned_row['text']
    if data.index.is_unique and cleaned_row.name in data.index:
        return data.loc[cleaned_row.name, 'text']
    return data.iloc[position]['text']


# Show up to 5 examples; fewer when the cleaned frame is smaller than that
for i in range(min(5, len(cleaned_data))):
    original = _original_text_for(i)
    cleaned = cleaned_data.iloc[i]['cleaned_text']
    sentiment = cleaned_data.iloc[i]['sentiment_label']

    print(f"--- Example {i+1} [{sentiment.upper()}] ---")
    print(f"BEFORE: {original}")
    print(f"AFTER:  {cleaned}")
    print(f"LENGTH: {len(original)} → {len(cleaned)} characters\n")

# Frequency table of the 20 most common words (also used by the plotting cell).
# Missing values are dropped first because str.join raises TypeError on NaN.
print("📊 Most Common Words After Cleaning:")
all_words = ' '.join(cleaned_data['cleaned_text'].dropna().astype(str)).split()
word_freq = pd.Series(all_words).value_counts().head(20)
print(word_freq)

In [None]:
# Cell 7: Data visualization
# Draws a 2x2 summary figure. Relies on `data`, `cleaned_data` and `word_freq`
# created by the earlier cells.
print("📊 Creating visualizations for our 45K dataset...\n")

# Create comprehensive plots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# 1. Sentiment distribution pie chart
# value_counts() orders classes by frequency, not by name, so the wedge labels
# and colours are looked up from each count's own index entry instead of being
# assumed to arrive as [negative, positive].
sentiment_counts = cleaned_data['sentiment_label'].value_counts()
sentiment_display = {
    'negative': ('😢 Negative', '#ff6b6b'),  # red
    'positive': ('😊 Positive', '#4ecdc4'),  # teal
}
fallback_color = '#c7c7c7'  # grey for any label not listed above
pie_labels = [
    sentiment_display.get(label, (str(label).title(), fallback_color))[0]
    for label in sentiment_counts.index
]
colors = [
    sentiment_display.get(label, (str(label).title(), fallback_color))[1]
    for label in sentiment_counts.index
]
ax1.pie(sentiment_counts.values, labels=pie_labels,
        autopct='%1.1f%%', colors=colors, startangle=90)
ax1.set_title(f'Sentiment Distribution\n({len(cleaned_data):,} tweets)', fontsize=12, fontweight='bold')

# 2. Text length distribution, with the mean marked by a dashed line
text_lengths = cleaned_data['cleaned_text'].str.len()
mean_length = text_lengths.mean()
ax2.hist(text_lengths, bins=30, color='skyblue', alpha=0.7, edgecolor='black')
ax2.set_title('Distribution of Tweet Lengths (After Cleaning)', fontweight='bold')
ax2.set_xlabel('Number of Characters')
ax2.set_ylabel('Frequency')
ax2.axvline(mean_length, color='red', linestyle='--',
            label=f'Average: {mean_length:.1f}')
ax2.legend()

# 3. Top 15 words bar chart
top_words = word_freq.head(15)
ax3.barh(range(len(top_words)), top_words.values, color='lightcoral')
ax3.set_yticks(range(len(top_words)))
ax3.set_yticklabels(top_words.index)
ax3.set_title('Top 15 Most Common Words', fontweight='bold')
ax3.set_xlabel('Frequency')

# 4. Processing statistics
stats_data = {
    'Original Dataset': len(data),
    'After Cleaning': len(cleaned_data),
    'Negative Tweets': len(cleaned_data[cleaned_data['sentiment_label'] == 'negative']),
    'Positive Tweets': len(cleaned_data[cleaned_data['sentiment_label'] == 'positive'])
}
bars = ax4.bar(stats_data.keys(), stats_data.values(),
               color=['lightblue', 'lightgreen', 'lightcoral', 'lightpink'])
ax4.set_title('Dataset Processing Statistics', fontweight='bold')
ax4.set_ylabel('Number of Tweets')
ax4.tick_params(axis='x', rotation=45)

# Add value labels on bars. The gap above each bar scales with the tallest bar
# (1%), so labels stay readable for sample sizes other than 45K.
label_offset = max(stats_data.values()) * 0.01
for bar in bars:
    height = bar.get_height()
    ax4.text(bar.get_x() + bar.get_width()/2., height + label_offset,
             f'{int(height):,}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

print("✅ Visualizations complete!")

In [None]:
# Cell 8: Save processed data
# Persists the cleaned frame through the cleaner, writes a one-row CSV summary
# of the run, and prints a recap.
print("💾 Saving processed 45K dataset...")

# Save the cleaned data
cleaner.save_data(cleaned_data)

# Create a summary report
original_count = len(data)
cleaned_count = len(cleaned_data)
summary = {
    'processing_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'original_tweets': original_count,
    'cleaned_tweets': cleaned_count,
    'removal_rate': f"{((original_count - cleaned_count) / original_count * 100):.1f}%",
    'negative_tweets': len(cleaned_data[cleaned_data['sentiment_label'] == 'negative']),
    'positive_tweets': len(cleaned_data[cleaned_data['sentiment_label'] == 'positive']),
    'avg_tweet_length': f"{cleaned_data['cleaned_text'].str.len().mean():.1f}",
    'memory_usage_mb': f"{cleaned_data.memory_usage(deep=True).sum() / 1024**2:.1f}"
}

# Save summary. The target directory is created first because to_csv does not
# create missing parent directories.
summary_path = Path('../data/processed/processing_summary.csv')
summary_path.parent.mkdir(parents=True, exist_ok=True)
summary_df = pd.DataFrame([summary])
summary_df.to_csv(summary_path, index=False)

print("\n🎉 Step 1 Complete!")
print("\n📊 Final Summary:")
for key, value in summary.items():
    print(f"   {key.replace('_', ' ').title()}: {value}")

print("\n🚀 Ready for Step 2: Feature Engineering!")
print("\n✅ What we accomplished:")
# Report the number of tweets actually loaded rather than a fixed figure
print(f"   📥 Loaded {original_count:,} tweets from 1.6M dataset")
print("   🧹 Cleaned and processed all text")
print("   ⚖️ Maintained balanced positive/negative ratio")
print("   💾 Saved ready-to-use training data")
print("   📊 Generated comprehensive analysis")

print("\n🎯 Next Step: Convert text to numerical features for ML training!")