# Tweet Sentiment Classifier (Binary - PyTorch)
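This notebook builds a simple binary sentiment classifier using PyTorch. The model is a small feed-forward network over bag-of-words counts, trained on a 10,000-tweet random sample of the Sentiment140 data using only the `Negative (0)` and `Positive (1)` classes (the raw file encodes positive as `4`; it is remapped to `1` during preprocessing).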


In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import re


## Load and Preprocess Data (Binary Sentiment Only)

In [None]:

# Load dataset. The CSV has no header row, so column names are assigned
# explicitly right after reading; the file is decoded as latin-1.
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Keep only negative (0) and positive (4) rows. The explicit .copy() detaches
# the filtered frame from the raw one, so the column assignment below writes
# to an independent DataFrame instead of a possible view (the pattern that
# triggers pandas' SettingWithCopyWarning).
df = df[df['target'].isin([0, 4])].copy()

# Remap the raw labels to contiguous class ids: 0 -> 0 (negative), 4 -> 1 (positive).
df['target'] = df['target'].map({0: 0, 4: 1})

# Simple cleaning
def clean_text(text):
    """Normalise a raw tweet for bag-of-words vectorisation.

    URLs (``http...`` / ``www...``), ``@``-prefixed tokens and every character
    that is not an ASCII letter or an apostrophe are each replaced by a single
    space. The result is lower-cased and stripped of leading/trailing
    whitespace. Runs of interior spaces are left as-is.

    Args:
        text: The tweet as a string.

    Returns:
        The cleaned, lower-case string.
    """
    noise_pattern = r"http\S+|www\S+|@\S+|[^a-zA-Z']"
    scrubbed = re.sub(noise_pattern, ' ', text)
    # Only letters, apostrophes and spaces remain at this point, so the
    # order of strip/lower does not matter.
    return scrubbed.strip().lower()

# Subset for fast training. Sampling is done *before* cleaning so the regex
# pass only runs over the 10,000 sampled tweets rather than every loaded row.
# Row selection depends only on the frame length and the seed, not on the
# text, so the sampled rows are the same either way.
df = df.sample(10000, random_state=42).reset_index(drop=True)

# Apply the simple text cleaning to the sampled tweets.
df['text'] = df['text'].apply(clean_text)


## Vectorization and DataLoader

In [None]:

# Class labels for every sampled tweet.
y = df['target'].values

# Split row positions first (80% train / 20% test, fixed seed) so the
# vocabulary can be learned from the training rows alone.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Fit the bag-of-words vocabulary (at most 1000 terms) on training tweets
# only; fitting on the full sample would let held-out tweets influence which
# words become features.
vectorizer = CountVectorizer(max_features=1000)
vectorizer.fit(df['text'].iloc[train_idx])

# Encode the whole sample once with the train-derived vocabulary, then slice
# the dense count matrix and the labels by split.
X = vectorizer.transform(df['text']).toarray()
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

class TweetDataset(Dataset):
    """In-memory dataset pairing feature rows with integer class labels.

    Args:
        X: Array-like of per-sample feature rows; stored as a float32 tensor.
        y: Array-like of integer class labels, one per row of ``X``; stored
            as an int64 tensor.

    Raises:
        ValueError: If ``X`` and ``y`` do not hold the same number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
        # Fail fast on misaligned inputs; otherwise the mismatch would only
        # surface as an IndexError (or silently ignored rows) during iteration.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap each split in a TweetDataset.
train_dataset, test_dataset = (
    TweetDataset(X_train, y_train),
    TweetDataset(X_test, y_test),
)

# Batch both splits in groups of 64; only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64)


## Build Model and Train

In [None]:

# Seed the global torch RNG so the weight initialisation below (and the
# default DataLoader shuffling, which draws from the same global RNG) is
# repeatable when this cell and the training loop are re-run.
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Take the input width from the feature matrix instead of hard-coding 1000:
# the vectorizer's max_features is only an upper bound on the vocabulary size,
# so this keeps the first layer in sync with the actual number of columns.
n_features = X_train.shape[1]

# Two-layer feed-forward classifier: features -> 128 hidden units -> 2 logits.
model = nn.Sequential(
    nn.Linear(n_features, 128),
    nn.ReLU(),
    nn.Linear(128, 2)
).to(device)

# Cross-entropy over the two class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train for 5 epochs and report the mean per-sample loss after each one.
for epoch in range(5):
    model.train()
    total_loss = 0.0  # running sum of per-sample losses for this epoch
    n_seen = 0        # number of samples processed this epoch
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # The criterion (default reduction) yields the batch mean, so weight
        # it by the batch size; a shorter final batch then counts
        # proportionally and the epoch figure does not scale with the number
        # of batches.
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        n_seen += batch_size
    # max(..., 1) guards against an empty loader.
    print(f"Epoch {epoch+1}, Avg Loss: {total_loss / max(n_seen, 1):.4f}")


## Evaluation

In [None]:

# Switch the network to evaluation mode and collect predictions and true
# labels for every test batch.
model.eval()
all_preds = []
all_labels = []

# Gradients are not needed for scoring.
with torch.no_grad():
    for batch_features, batch_labels in test_loader:
        logits = model(batch_features.to(device))
        # The predicted class is the index of the largest logit in each row.
        predicted = logits.argmax(dim=1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

# Per-class precision / recall / F1 on the held-out split.
report = classification_report(all_labels, all_preds, target_names=["Negative", "Positive"])
print(report)
