In [None]:
# ===================================
# PRODUCTION DATASET OPTIONS
# ===================================
#
# Every option below is a disabled snippet: it is wrapped in a triple-quoted
# string, so Python evaluates it as a bare string expression and executes
# none of the code inside. To enable an option, delete the triple-quote line
# directly above and directly below its code.
#
# All snippets assign a DataFrame named `df` and reference `pd`, which this
# section does not import (presumably pandas is imported as `pd` elsewhere
# in the notebook - confirm before enabling).
# Label encoding targeted by the snippets: 0 = negative, 1 = neutral,
# 2 = positive.

# OPTION 1: Twitter Sentiment140 Dataset (1.6M tweets)
# Download from: https://www.kaggle.com/datasets/kazanova/sentiment140
# To use: remove the surrounding triple quotes.
# The mapping produces only labels 0 and 2, so this option is binary; no
# neutral label (1) is generated.
"""
df = pd.read_csv('training.1600000.processed.noemoticon.csv',
                 encoding='latin-1',
                 names=['sentiment', 'id', 'date', 'query', 'user', 'text'])
# Map: 0=negative, 4=positive -> 0=negative, 2=positive
df['sentiment'] = df['sentiment'].map({0: 0, 4: 2})
# For 3-class: sample neutral tweets or use a different dataset
"""

# OPTION 2: IMDB Movie Reviews (50K reviews - binary sentiment)
# Install: pip install datasets
# Only the 'train' split is read. Label 1 is re-encoded as 2 (positive) and
# every other label as 0 (negative), so no neutral label (1) is generated.
"""
from datasets import load_dataset
dataset = load_dataset('imdb')
df = pd.DataFrame({
    'text': dataset['train']['text'],
    'sentiment': [2 if label == 1 else 0 for label in dataset['train']['label']]
})
"""

# OPTION 3: Amazon Reviews (Multi-domain)
# Download from: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
# The snippet expects a local file 'amazon_reviews.csv' with a numeric
# 'rating' column and derives a three-class 'sentiment' column from it.
# NOTE(review): the linked Kaggle dataset appears to ship fastText-format
# text files with binary labels rather than a CSV with a 'rating' column -
# confirm the file name and column names before enabling this option.
"""
df = pd.read_csv('amazon_reviews.csv')
# Map star ratings: 1-2 stars=0 (negative), 3=1 (neutral), 4-5=2 (positive)
df['sentiment'] = df['rating'].apply(lambda x: 0 if x <= 2 else (1 if x == 3 else 2))
"""

# OPTION 4: Yelp Reviews (for restaurant/business sentiment)
# Requires the `datasets` package (same loader as Option 2).
# Only the 'train' split is read; the five star labels (0-4) are collapsed
# into the three-class encoding.
"""
from datasets import load_dataset
dataset = load_dataset('yelp_review_full')
df = pd.DataFrame({
    'text': dataset['train']['text'],
    'sentiment': dataset['train']['label']  # 0-4 stars
})
# Map to 3 classes: 0-1=negative(0), 2=neutral(1), 3-4=positive(2)
df['sentiment'] = df['sentiment'].apply(lambda x: 0 if x <= 1 else (1 if x == 2 else 2))
"""

# OPTION 5: Financial News Sentiment
# Requires the `datasets` package (same loader as Option 2).
# Labels are used as-is, without re-mapping.
# NOTE(review): recent `datasets` releases may need extra arguments (or may
# not support) datasets that are loaded through a loader script - verify
# this call against the installed version.
"""
from datasets import load_dataset
dataset = load_dataset('financial_phrasebank', 'sentences_allagree')
df = pd.DataFrame({
    'text': dataset['train']['sentence'],
    'sentiment': dataset['train']['label']  # 0=negative, 1=neutral, 2=positive
})
"""

# ===================================
# TEMPORARY: Larger Synthetic Dataset for Demo
# (Replace with real data above for production)
# ===================================
import numpy as np

# Synthetic three-class sentiment corpus used as a stand-in for real data.
#
# Label encoding: 0 = negative, 1 = neutral, 2 = positive.
# Each class lists 52 hand-written phrases; the trailing `* 2` repeats the
# whole list once, giving 104 samples per class and 312 samples in total.
# NOTE(review): because every phrase occurs at least twice, a later random
# train/test split will presumably put copies of the same text on both
# sides - confirm whether downstream evaluation should de-duplicate first.
positive_texts = [
    "I love this product, it's amazing!", "This is the best experience ever",
    "Absolutely wonderful and fantastic", "Great quality, highly recommend",
    "Super happy with my purchase", "Excellent service and product",
    "Fantastic! Exceeded all my expectations", "Best decision I ever made",
    "Love it! Will buy again", "Amazing quality, very satisfied",
    "Outstanding product, worth every penny", "Brilliant! Highly recommend",
    "Perfect! Exactly what I needed", "Incredible value for money",
    "Absolutely love it, five stars", "Superb quality and fast shipping",
    "Wonderful experience from start to finish", "Great product, works perfectly",
    "I'm so happy with this purchase", "Fantastic item, love everything about it",
    "Best product in its category", "Highly satisfied, great buy",
    "Excellent quality, very impressed", "Love the design and functionality",
    "Perfect gift, everyone loved it", "Amazing service, will recommend to friends",
    "Great value, excellent product", "So happy I bought this",
    "Wonderful quality, exceeds expectations", "Brilliant product, works great",
    "Love it, best purchase ever", "Outstanding, couldn't be happier",
    "Excellent, worth the price", "Fantastic quality and service",
    "I like this very much", "Really good product",
    "Pretty awesome experience", "Fabulous, totally recommend",
    "Good stuff, happy customer", "Nice product, works well",
    "Decent quality, satisfied", "Fine product, does the job",
    "I enjoy using this", "Pleasant experience overall",
    "Good value for money", "Satisfactory product",
    "Works as expected, happy", "Like it a lot",
    "Pretty good, recommend it", "Solid product, no complaints",
    "Good purchase decision", "Happy with the quality"
] * 2  # 52 phrases x 2 = 104 samples

negative_texts = [
    "This is terrible and disappointing", "Worst product I've ever bought",
    "Complete waste of money", "I hate this, very poor quality",
    "Absolutely horrible experience", "Terrible quality, broke immediately",
    "Worst purchase ever, don't buy", "Awful product, total garbage",
    "Horrible, waste of time and money", "Very disappointed, poor quality",
    "Terrible experience, would not recommend", "Poor quality, not worth it",
    "Bad product, doesn't work", "Worst service ever received",
    "Horrible quality, fell apart quickly", "Terrible, nothing like description",
    "Awful experience, very upset", "Poor design, completely useless",
    "Waste of money, regret buying", "Bad quality, broke after one use",
    "Terrible product, stay away", "Horrible purchase, very disappointed",
    "Poor quality control, defective", "Bad experience from start to finish",
    "Awful service, terrible product", "Worst decision, total waste",
    "Horrible quality, don't waste money", "Terrible, not as advertised",
    "Bad buy, regret this purchase", "Poor workmanship, cheap quality",
    "Awful product, completely useless", "Horrible, broke immediately",
    "Terrible quality, very unhappy", "Bad product, doesn't work properly",
    "Worst experience, avoid this", "Poor value, not worth the price",
    "Not good at all", "Disappointing product",
    "Below average quality", "Not satisfied with this",
    "Poor performance", "Didn't meet expectations",
    "Not happy with purchase", "Subpar quality",
    "Mediocre at best", "Not impressed",
    "Could be better", "Not what I expected",
    "Underwhelming product", "Not worth the hype",
    "Disappointing experience", "Below expectations"
] * 2  # 52 phrases x 2 = 104 samples

neutral_texts = [
    "It's okay, nothing special", "Average product, does the job",
    "Neither good nor bad", "It works as expected",
    "Acceptable quality", "It's fine, just okay",
    "Normal product, nothing extraordinary", "Meets basic expectations",
    "Standard quality, average", "It's alright, not great not bad",
    "Okay product, serves its purpose", "Average, nothing to complain about",
    "It's fine, does what it says", "Standard quality, no surprises",
    "Acceptable, meets requirements", "Okay experience, nothing special",
    "Average quality, fair price", "It's okay, what you'd expect",
    "Normal product, adequate", "Standard, nothing remarkable",
    "Acceptable performance", "Okay value for money",
    "Average experience", "It's fine, works adequately",
    "Standard product, typical quality", "Acceptable, no major issues",
    "Okay, meets basic needs", "Average, fair enough",
    "It's alright, decent", "Normal quality, expected",
    "Acceptable, does the job", "Okay product, reasonable",
    "Average, nothing outstanding", "It's fine, acceptable quality",
    "Standard, meets expectations", "Okay, adequate performance",
    "Moderate quality", "It's decent",
    "Fair product", "Reasonable quality",
    "Just okay", "Passable",
    "Middle of the road", "So-so",
    "Adequate", "Acceptable performance",
    "Nothing special", "Standard fare",
    "Meets expectations", "Fair enough",
    "It'll do", "Serviceable"
] * 2  # 52 phrases x 2 = 104 samples

# Concatenate the classes. The label list is built in the same order, so
# all_sentiments[i] is always the label of all_texts[i].
all_texts = positive_texts + negative_texts + neutral_texts
all_sentiments = [2] * len(positive_texts) + [0] * len(negative_texts) + [1] * len(neutral_texts)

# Shuffle texts and labels with one shared permutation so each pair stays
# aligned. A dedicated, seeded generator makes the order identical on every
# run (reproducible splits and metrics) and leaves NumPy's global random
# state untouched.
SHUFFLE_SEED = 42
indices = np.random.default_rng(SHUFFLE_SEED).permutation(len(all_texts))
data = {
    'text': [all_texts[i] for i in indices],
    'sentiment': [all_sentiments[i] for i in indices]
}

# Report the overall size and the per-class counts as a quick sanity check.
print(f"Dataset size: {len(data['text'])} samples")
print(f"Positive: {data['sentiment'].count(2)}")
print(f"Negative: {data['sentiment'].count(0)}")
print(f"Neutral: {data['sentiment'].count(1)}")