In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Twitter_Data.csv
/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Reddit_Data.csv


### Install Dependencies

In [2]:
!pip install torch torchvision torchaudio torchtext

Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Co

## Load and Show the dataset

In [3]:
import pandas as pd

# File paths
twitter_path = "/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Twitter_Data.csv"
reddit_path = "/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Reddit_Data.csv"

# Load datasets
df_twitter = pd.read_csv(twitter_path)
# .drop(columns=['clean_comment'])
df_reddit = pd.read_csv(reddit_path).rename(columns={'clean_comment' : 'clean_text'})
# .drop(columns=['clean_comment'])

# Display shapes
print(f"Twitter Data shape: {df_twitter.shape}")
print(f"Reddit Data shape: {df_reddit.shape}")

# Concatenate datasets
df = pd.concat([df_twitter, df_reddit], axis=0).reset_index(drop=True)

# Display final shape and preview
print(f"Combined Data shape: {df.shape}")
print(df.head())


Twitter Data shape: (162980, 2)
Reddit Data shape: (37249, 2)
Combined Data shape: (200229, 2)
                                          clean_text  category
0  when modi promised “minimum government maximum...      -1.0
1  talk all the nonsense and continue all the dra...       0.0
2  what did just say vote for modi  welcome bjp t...       1.0
3  asking his supporters prefix chowkidar their n...       1.0
4  answer who among these the most powerful world...       1.0


In [4]:
df_twitter.columns

Index(['clean_text', 'category'], dtype='object')

In [5]:
df_reddit.columns

Index(['clean_text', 'category'], dtype='object')

### Perform Preprocessing Task

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
import pandas as pd
import nltk
nltk.download('punkt')

# Load dataset
# df = pd.read_csv("Twitter_Data.csv")  # Replace with your dataset path
df = df.dropna()
df['label'] = df['category'].astype('category').cat.codes

# Custom tokenizer using nltk
def tokenizer(text):
    return nltk.word_tokenize(text.lower())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size=0.2, random_state=42)

# Build vocabulary manually
def build_vocab(texts, min_freq=1):
    counter = Counter()
    for text in texts:
        counter.update(tokenizer(text))
    vocab = {"<unk>": 0, "<pad>": 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

print("Building vocabulary...")
vocab = build_vocab(X_train)
print(f"Vocabulary size: {len(vocab)}")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Building vocabulary...
Vocabulary size: 116684


### Getting the data ready & Perform the training

In [7]:
# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, vocab, tokenizer, max_len=100):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        ids = [self.vocab.get(token, self.vocab["<unk>"]) for token in tokens][:self.max_len]
        ids += [self.vocab["<pad>"]] * (self.max_len - len(ids))
        return torch.tensor(ids), torch.tensor(self.labels[idx])

# DataLoader
print("Preparing DataLoaders...")
train_dataset = TextDataset(X_train.tolist(), y_train.tolist(), vocab, tokenizer)
test_dataset = TextDataset(X_test.tolist(), y_test.tolist(), vocab, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=vocab["<pad>"])
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        return self.fc(hidden[-1])

# Model setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = len(vocab)
embed_dim = 100
hidden_dim = 128
output_dim = y_train.nunique()

model = LSTMModel(vocab_size, embed_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print("Starting training...\n")
# Training loop
for epoch in range(1, 6):
    model.train()
    total_loss = 0
    batch_count = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        batch_count += 1
        if batch_count % 1000 == 0:
            print(f"Epoch {epoch} | Batch {batch_count} | Batch Loss: {loss.item():.4f}")
    print(f"Epoch {epoch} Completed | Avg Loss: {total_loss / batch_count:.4f}\n")

Preparing DataLoaders...
Starting training...

Epoch 1 | Batch 1000 | Batch Loss: 1.0729
Epoch 1 | Batch 2000 | Batch Loss: 0.6141
Epoch 1 | Batch 3000 | Batch Loss: 0.5082
Epoch 1 | Batch 4000 | Batch Loss: 0.3578
Epoch 1 | Batch 5000 | Batch Loss: 0.1966
Epoch 1 Completed | Avg Loss: 0.6370

Epoch 2 | Batch 1000 | Batch Loss: 0.5426
Epoch 2 | Batch 2000 | Batch Loss: 0.4038
Epoch 2 | Batch 3000 | Batch Loss: 0.1507
Epoch 2 | Batch 4000 | Batch Loss: 0.4252
Epoch 2 | Batch 5000 | Batch Loss: 0.1272
Epoch 2 Completed | Avg Loss: 0.2367

Epoch 3 | Batch 1000 | Batch Loss: 0.0245
Epoch 3 | Batch 2000 | Batch Loss: 0.0443
Epoch 3 | Batch 3000 | Batch Loss: 0.2106
Epoch 3 | Batch 4000 | Batch Loss: 0.0350
Epoch 3 | Batch 5000 | Batch Loss: 0.2007
Epoch 3 Completed | Avg Loss: 0.1492

Epoch 4 | Batch 1000 | Batch Loss: 0.0431
Epoch 4 | Batch 2000 | Batch Loss: 0.0726
Epoch 4 | Batch 3000 | Batch Loss: 0.0529
Epoch 4 | Batch 4000 | Batch Loss: 0.1123
Epoch 4 | Batch 5000 | Batch Loss: 0.1178

### Perform Evaluation on Trained Model

In [8]:
# Evaluation
print("Starting evaluation...\n")
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for i, (X_batch, y_batch) in enumerate(test_loader):
        X_batch = X_batch.to(device)
        output = model(X_batch)
        preds = torch.argmax(output, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(y_batch.numpy())
        if i % 120 == 0:
            print(f"Processed batch {i+1} in test set")

accuracy = accuracy_score(all_labels, all_preds)
print(f"\nTest Accuracy: {accuracy:.4f}")


Starting evaluation...

Processed batch 1 in test set
Processed batch 121 in test set
Processed batch 241 in test set
Processed batch 361 in test set
Processed batch 481 in test set
Processed batch 601 in test set
Processed batch 721 in test set
Processed batch 841 in test set
Processed batch 961 in test set
Processed batch 1081 in test set
Processed batch 1201 in test set

Test Accuracy: 0.9574


### Save the model

In [9]:
# Save model state
model_path = "/kaggle/working/lstm_sentiment_model.pth"
torch.save({
    'model_state_dict': model.state_dict(),
    'vocab': vocab,  # optional: save vocab for inference
    'embed_dim': embed_dim,
    'hidden_dim': hidden_dim,
    'output_dim': output_dim
}, model_path)

print(f"Model saved to {model_path}")


Model saved to /kaggle/working/lstm_sentiment_model.pth


### Load the saved model and evaluate

In [11]:
# Load model state
checkpoint = torch.load("/kaggle/working/lstm_sentiment_model.pth", map_location=device)

# Recreate model architecture
model1 = LSTMModel(len(vocab), checkpoint['embed_dim'], checkpoint['hidden_dim'], checkpoint['output_dim'])
model1.load_state_dict(checkpoint['model_state_dict'])
model1.to(device)
model1.eval()

print("Model loaded and ready for inference.")


  checkpoint = torch.load("/kaggle/working/lstm_sentiment_model.pth", map_location=device)


Model loaded and ready for inference.


In [13]:
from sklearn.metrics import classification_report, accuracy_score

all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model1(X_batch)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

# Classification report
print("Evaluation Report:\n")
print(classification_report(all_labels, all_preds))

# Accuracy
acc = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {acc:.4f}")


Evaluation Report:

              precision    recall  f1-score   support

           0       0.90      0.93      0.91      8615
           1       0.98      0.97      0.98     13630
           2       0.97      0.96      0.96     17779

    accuracy                           0.96     40024
   macro avg       0.95      0.95      0.95     40024
weighted avg       0.96      0.96      0.96     40024

Accuracy: 0.9574
