In [23]:
# mathematical operations-linear Algebra
import numpy as np 

# dataframe
import pandas as pd 

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.metrics import f1_score, accuracy_score

In [24]:
# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

In [25]:
def setup_cuda():
    """Setup CUDA device with proper error handling"""
    if torch.cuda.is_available():
        # Set CUDA device
        try:
            torch.cuda.init()
            # Clear GPU cache
            torch.cuda.empty_cache()
            device = torch.device('cuda')
            # Set device index to 0 if multiple GPUs
            torch.cuda.set_device(0)
            # Enable debugging for CUDA
            torch.cuda.set_sync_debug_mode(True)
            print(f"Using CUDA device: {torch.cuda.get_device_name(0)}")
        except RuntimeError as e:
            print(f"CUDA initialization failed: {str(e)}")
            print("Falling back to CPU")
            device = torch.device('cpu')
    else:
        device = torch.device('cpu')
        print("CUDA not available, using CPU")
    return device

In [26]:
# # Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [27]:
# ignore warnings
import warnings
warnings.filterwarnings(action='ignore')


In [28]:
# Matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
import seaborn as sns

In [29]:
# NLTK
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [30]:
# Word2vec
import gensim
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [31]:
# Utility
import string
import re
import os
from collections import Counter
import logging
import time
import pickle
import itertools
import random
import datetime

In [32]:
# WordCloud
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import Counter, defaultdict

In [33]:
class TweetDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Convert text to tensor with safety checks
        tokens = []
        for word in text:
            token_id = self.vocab.get(word, self.vocab['<UNK>'])
            # Ensure token_id is within valid range
            if token_id >= len(self.vocab):
                token_id = self.vocab['<UNK>']
            tokens.append(token_id)
        
        # Truncate or pad sequence
        if len(tokens) > self.max_len:
            tokens = tokens[:self.max_len]
        else:
            tokens = tokens + [self.vocab['<PAD>']] * (self.max_len - len(tokens))
        
        return (
            torch.tensor(tokens, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

In [34]:
# LSTM Model
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, (hidden, cell) = self.lstm(embedded)
        hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

In [35]:
# Data loading and preprocessing
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
		
df = pd.read_csv("Twitter_Data.csv")

In [36]:
# Fill missing values and drop duplicates
df['clean_text'].fillna('', inplace=True)
df.dropna(subset=['category'], inplace=True)
df.drop_duplicates(inplace=True)

In [37]:
# Tokenization and vocabulary creation
tokenized_texts = [word_tokenize(text.lower()) for text in df['clean_text']]

In [38]:
# Create vocabulary
word_freq = Counter([word for text in tokenized_texts for word in text])
vocab = {'<PAD>': 0, '<UNK>': 1}
vocab.update({word: idx + 2 for idx, (word, freq) in enumerate(word_freq.items()) if freq > 1})

In [39]:
# Encode categories
encoder = LabelEncoder()
labels = encoder.fit_transform(df['category'])

In [40]:
# Calculate max sequence length
max_seq_length = min(max(len(text) for text in tokenized_texts), 100)  # Cap at 100 tokens

In [41]:
# Create train/test split
X_train, X_test, y_train, y_test = train_test_split(tokenized_texts, labels, test_size=0.2, random_state=42)

In [42]:
# Create datasets
train_dataset = TweetDataset(X_train, y_train, vocab, max_seq_length)
test_dataset = TweetDataset(X_test, y_test, vocab, max_seq_length)

In [43]:
# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256)

In [44]:
# Initialize model
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = len(encoder.classes_)
model = LSTMClassifier(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, vocab['<PAD>'])
model = model.to(device)

In [45]:
# Initialize optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [46]:
# # Training setup with error handling
# def setup_training(df, test_size=0.2, batch_size=32, embedding_dim=100, hidden_dim=128):
#     # Data preprocessing
#     df['clean_text'].fillna('', inplace=True)
#     df.dropna(subset=['category'], inplace=True)
#     df.drop_duplicates(inplace=True)
    
#     # Tokenization
#     tokenized_texts = [word_tokenize(str(text).lower()) for text in df['clean_text']]
    
#     # Create vocabulary with frequency threshold
#     word_freq = Counter([word for text in tokenized_texts for word in text])
#     min_freq = 2  # Minimum frequency threshold
#     vocab = {'<PAD>': 0, '<UNK>': 1}
#     vocab.update({word: idx + 2 for idx, (word, freq) in enumerate(word_freq.items()) 
#                  if freq >= min_freq})
    
#     # Encode categories
#     encoder = LabelEncoder()
#     labels = encoder.fit_transform(df['category'])
    
#     # Calculate max sequence length
#     max_seq_length = min(max(len(text) for text in tokenized_texts), 100)
    
#     # Create train/test split
#     X_train, X_test, y_train, y_test = train_test_split(
#         tokenized_texts, labels, test_size=test_size, random_state=42, stratify=labels
#     )
    
#     # Create datasets
#     train_dataset = TweetDataset(X_train, y_train, vocab, max_seq_length)
#     test_dataset = TweetDataset(X_test, y_test, vocab, max_seq_length)
    
#     # Create dataloaders with error handling
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
#                             num_workers=0, pin_memory=True)
#     test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=0, 
#                            pin_memory=True)
    
#     # Initialize model
#     model = LSTMClassifier(
#         vocab_size=len(vocab),
#         embedding_dim=embedding_dim,
#         hidden_dim=hidden_dim,
#         output_dim=len(encoder.classes_),
#         pad_idx=vocab['<PAD>']
#     )
    
#     return model, train_loader, test_loader, encoder

In [47]:
# Training function
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    for batch in iterator:
        optimizer.zero_grad()
        
        text, labels = batch
        text, labels = text.to(device), labels.to(device)
        
        predictions = model(text)
        loss = criterion(predictions, labels)
        
        acc = accuracy_score(labels.cpu().numpy(), predictions.argmax(dim=1).cpu().numpy())
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [48]:
# Evaluation function
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    
    with torch.no_grad():
        for batch in iterator:
            text, labels = batch
            text, labels = text.to(device), labels.to(device)
            
            predictions = model(text)
            loss = criterion(predictions, labels)
            
            acc = accuracy_score(labels.cpu().numpy(), predictions.argmax(dim=1).cpu().numpy())
            
            epoch_loss += loss.item()
            epoch_acc += acc
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [49]:
# Training loop
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, test_loader, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-model.pt')
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tValid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}%')

Epoch: 01
	Train Loss: 0.903 | Train Acc: 56.36%
	Valid Loss: 0.599 | Valid Acc: 76.07%
Epoch: 02
	Train Loss: 0.482 | Train Acc: 82.00%
	Valid Loss: 0.338 | Valid Acc: 88.51%
Epoch: 03
	Train Loss: 0.321 | Train Acc: 89.04%
	Valid Loss: 0.250 | Valid Acc: 92.05%
Epoch: 04
	Train Loss: 0.243 | Train Acc: 92.00%
	Valid Loss: 0.197 | Valid Acc: 93.81%
Epoch: 05
	Train Loss: 0.195 | Train Acc: 93.68%
	Valid Loss: 0.166 | Valid Acc: 95.02%
Epoch: 06
	Train Loss: 0.162 | Train Acc: 94.83%
	Valid Loss: 0.145 | Valid Acc: 95.67%
Epoch: 07
	Train Loss: 0.141 | Train Acc: 95.54%
	Valid Loss: 0.134 | Valid Acc: 96.05%
Epoch: 08
	Train Loss: 0.124 | Train Acc: 96.05%
	Valid Loss: 0.126 | Valid Acc: 96.34%
Epoch: 09
	Train Loss: 0.111 | Train Acc: 96.47%
	Valid Loss: 0.118 | Valid Acc: 96.59%
Epoch: 10
	Train Loss: 0.100 | Train Acc: 96.86%
	Valid Loss: 0.113 | Valid Acc: 96.78%
