### 1. Install Required Libararies

In [10]:
# !pip install torch torchtext pandas scikit-learn matplotlib --quiet

### 2. Import Libraries

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import re
import string


### 3. Load and Clean Data

In [12]:
# Load data
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding="latin-1", header=None)
df.columns = ["target", "ids", "date", "flag", "user", "text"]

# Map target values to classes
df = df[df["target"].isin([0, 2, 4])]
df["target"] = df["target"].map({0: 0, 2: 1, 4: 2})  # 0 = neg, 1 = neutral, 2 = pos

# Keep only necessary columns
df = df[["text", "target"]]
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle
df.head()

Unnamed: 0,text,target
0,@AriCostello Asia Trip: Please send detailed w...,2
1,@sherwinsteel and I still didn't get anything,0
2,@merchantofmagic I do remember my first magic ...,2
3,Now this just getting sad.. got my car back.. ...,0
4,@heykim Awwww... I'm so sorry to hear about th...,0


### 4. Text Preprocessing

In [13]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # Remove links
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra whitespace
    return text

df["cleaned_text"] = df["text"].apply(clean_text)

### 5. Tokenization and Vocabulary

In [14]:
from collections import Counter

def tokenize(text):
    return text.split()

# Build vocab
token_counts = Counter()
for sentence in df["cleaned_text"]:
    token_counts.update(tokenize(sentence))

vocab = {word: i+2 for i, (word, _) in enumerate(token_counts.most_common(10000))}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

def encode(text):
    return [vocab.get(word, vocab["<UNK>"]) for word in tokenize(text)]


### 6. Dataset and DataLoader

In [15]:
MAX_LEN = 30

class TweetDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = [self.pad(encode(text)) for text in texts]
        self.labels = labels

    def pad(self, encoded):
        return encoded[:MAX_LEN] + [0] * (MAX_LEN - len(encoded))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return torch.tensor(self.texts[idx]), torch.tensor(self.labels[idx])

X_train, X_test, y_train, y_test = train_test_split(df["cleaned_text"], df["target"], test_size=0.2, stratify=df["target"])

train_dataset = TweetDataset(X_train.tolist(), y_train.tolist())
test_dataset = TweetDataset(X_test.tolist(), y_test.tolist())

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)


### 7. Define the Model

In [16]:
class SentimentClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim * MAX_LEN, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        return self.fc2(x)

model = SentimentClassifier(len(vocab), 50, 64, 3)


### 8. Train the Model

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(3):
    model.train()
    total_loss = 0
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 9365.2202
Epoch 2, Loss: 8456.1155
Epoch 3, Loss: 8137.9499


### 9. Evaluate the Model

In [9]:
from sklearn.metrics import classification_report
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print(classification_report(
    all_labels, 
    all_preds, 
    labels=[0, 1, 2], 
    target_names=["Negative", "Neutral", "Positive"],
    zero_division=0
    ))


              precision    recall  f1-score   support

    Negative       0.81      0.79      0.80    160000
     Neutral       0.00      0.00      0.00         0
    Positive       0.80      0.81      0.81    160000

   micro avg       0.80      0.80      0.80    320000
   macro avg       0.54      0.54      0.54    320000
weighted avg       0.80      0.80      0.80    320000

