# 🎯 Social Media Sentiment Analysis - 100K Training
## Working with Real Sentiment140 Dataset (1.6M → 100K)

This notebook will:
1. Load 100,000 tweets from the full 1.6M dataset
2. Clean and process the data
3. Prepare for machine learning training

In [2]:
#1: Setup and imports
import sys
sys.path.append('..')

from src.data.data_collector import DataCollector
from src.data.data_cleaner import DataCleaner
import pandas as pd
import matplotlib.pyplot as plt

# Startup banner: reaching this point means every import above resolved.
startup_lines = (
    " Social Media Sentiment Analysis - 100K Training",
    " All imports successful",
)
print(*startup_lines, sep="\n")

 Social Media Sentiment Analysis - 100K Training
 All imports successful


In [None]:
#2: Load 100K tweets
# Builds the project DataCollector and asks it for the dataset. The result is
# bound to `data`, which the cleaning and reporting cells below read.
collector = DataCollector()
data = collector.load_data()

PREVIEW_ROWS = 3     # number of sample tweets to print
PREVIEW_CHARS = 100  # max characters shown per sample tweet

if data is not None:
    print("\n Dataset loaded successfully!")
    print(f"Shape: {data.shape}")
    print(f"Memory: {data.memory_usage(deep=True).sum() / 1024**2:.1f}MB")

    # Show sample tweets.
    # head() caps the preview at the rows that actually exist, so a frame with
    # fewer than PREVIEW_ROWS rows no longer raises IndexError.
    print("\n Sample tweets:")
    preview = data.head(PREVIEW_ROWS)
    for position, (label, text) in enumerate(
        zip(preview['sentiment'], preview['text']), start=1
    ):
        # 0 is treated as negative; any other value is shown as positive.
        sentiment = "😢 NEGATIVE" if label == 0 else "😊 POSITIVE"
        text = str(text)
        # Only mark the tweet as truncated when characters were actually cut.
        ellipsis = "..." if len(text) > PREVIEW_CHARS else ""
        print(f"{position}. {sentiment}: {text[:PREVIEW_CHARS]}{ellipsis}")
else:
    # Make the failure visible here rather than in a later cell.
    print("\n Dataset could not be loaded (load_data() returned None); "
          "later cells need `data`.")

In [None]:
#3: Complete cleaning pipeline
# Runs the project DataCleaner over `data` (loaded in the previous cell) and
# binds the result to `cleaned_data`. Later cells read the 'cleaned_text' and
# 'sentiment_label' columns from it and reuse `cleaner` to save the output.
# NOTE(review): what clean_dataset does internally (and whether it drops rows)
# is not visible in this notebook — see src/data/data_cleaner.py.
print(" Starting complete data cleaning pipeline...")

cleaner = DataCleaner()
cleaned_data = cleaner.clean_dataset(data)

In [None]:
#4: Show results
# Prints a few before/after examples and the average text-length change.
# Skipped entirely when the cleaned frame is empty.
if len(cleaned_data) > 0:
    print("\n Pipeline complete!")

    # Before/after examples.
    # Bound the loop by the rows available in BOTH frames: the guard above only
    # guarantees one cleaned row, so a fixed range(3) could raise IndexError.
    # NOTE(review): rows are paired by position; if clean_dataset drops or
    # reorders rows, ORIGINAL and CLEANED may not describe the same tweet —
    # confirm against DataCleaner.
    example_count = min(3, len(data), len(cleaned_data))
    print("\n Before vs After cleaning:")
    for i in range(example_count):
        cleaned_row = cleaned_data.iloc[i]
        print(f"\n{i+1}. ORIGINAL: {data.iloc[i]['text']}")
        print(f"   CLEANED:  {cleaned_row['cleaned_text']}")
        print(f"   LABEL:    {cleaned_row['sentiment_label']}")

    # Statistics: mean character length before and after cleaning.
    original_avg = data['text'].str.len().mean()
    cleaned_avg = cleaned_data['cleaned_text'].str.len().mean()
    print(f"\n Text length: {original_avg:.1f} → {cleaned_avg:.1f} chars")
    print(f" Size reduction: {((original_avg-cleaned_avg)/original_avg*100):.1f}%")

In [None]:
#5: Visualize results
# Two-panel figure: sentiment class balance (pie) and cleaned-text length
# distribution (histogram).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Sentiment distribution
sentiment_counts = cleaned_data['sentiment_label'].value_counts()

# value_counts() orders by descending frequency, so the slice order depends on
# the data. Derive each label from the index entry that belongs to its count
# instead of hard-coding an order, which mislabels slices whenever positive
# tweets outnumber negative ones (and fails if the class count is not 2).
# NOTE(review): assumes labels read 'negative'/'positive' (case-insensitive);
# any other value is shown as-is.
emoji_labels = {'negative': '😢 Negative', 'positive': '😊 Positive'}
pie_labels = [
    emoji_labels.get(str(label).strip().lower(), str(label))
    for label in sentiment_counts.index
]
ax1.pie(sentiment_counts.values, labels=pie_labels,
        autopct='%1.1f%%', startangle=90)
ax1.set_title(f'Sentiment Distribution\n({len(cleaned_data):,} tweets)')

# Text length distribution
lengths = cleaned_data['cleaned_text'].str.len()
ax2.hist(lengths, bins=30, alpha=0.7, color='skyblue')
ax2.set_title('Tweet Length Distribution')
ax2.set_xlabel('Characters')
ax2.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Cell 6: Save processed data
# Hands the cleaned frame to the same DataCleaner instance used for cleaning.
# NOTE(review): the output path and file format are decided inside
# DataCleaner.save_data and are not visible from this notebook.
cleaner.save_data(cleaned_data)
