In [2]:
!pip install pytorch-lightning
!pip install torchmetrics



# Import

In [3]:
import pytorch_lightning as pl
import torch
import os
import kagglehub
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from torchmetrics.functional import accuracy, precision, recall
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.feature_extraction.text import HashingVectorizer

# Dataset

## About Dataset
## Context
This is the sentiment140 dataset. It contains 1,600,000 tweets extracted using the twitter api . The tweets have been annotated (0 = negative, 4 = positive) and they can be used to detect sentiment .

## Content
It contains the following 6 fields:

- target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

- ids: The id of the tweet ( 2087)

- date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)

- flag: The query (lyx). If there is no query, then this value is NO_QUERY.

- user: the user that tweeted (robotickilldozr)

- text: the text of the tweet (Lyx is cool)

## Acknowledgements
The official link regarding the dataset with resources about how it was generated is here
The official paper detailing the approach is here

Citation: Go, A., Bhayani, R. and Huang, L., 2009. Twitter sentiment classification using distant supervision. CS224N Project Report, Stanford, 1(2009), p.12.

In [4]:
# Download latest version
path = kagglehub.dataset_download("kazanova/sentiment140")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'sentiment140' dataset.
Path to dataset files: /kaggle/input/sentiment140


In [5]:
dataset = pd.read_csv(path + f"/{os.listdir(path)[0]}", names = ['target','ids','date','flag','user','text'], encoding = 'latin-1')
dataset.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
vectorizer = HashingVectorizer(n_features = 2**16)
X_dataset = dataset['text']
target = dataset['target']

# Dataset class and Wrap

In [16]:
class Mydataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
    
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return torch.tensor(self.x[idx], dtype = torch.float32) , torch.tensor(self.y[idx], dtype = torch.long)

In [8]:
#made the split into the [X_train, y_train], [X_test, y_test]
X_train, X_test, y_train, y_test = train_test_split(X_dataset, target, random_state = 42, test_size = 0.2)

#we also want the [X_val, y_val] with dimension 50% of the train sets
X_test, X_val, y_test, y_val = train_test_split(X_train, y_train, random_state =42, test_size = 0.5)

X_train = X_train[:50000]
X_test = X_test[:5000]
X_val = X_val[:5000]
#Text Hashing vectorizer
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()
X_val = vectorizer.transform(X_val).toarray()

In [9]:
#Instanciate the class dataset with our actual datasets, it'important that type(data) = np.ndarray
data_train = Mydataset(X_train, y_train)
data_test = Mydataset(X_test, y_test)
data_val = Mydataset(X_val, y_val)

#wrap the data with the Dataloader
wrapped_train = DataLoader(data_train, shuffle = True, batch_size = 32)
wrapped_test = DataLoader(data_test, shuffle = False, batch_size = 32)
wrapped_val = DataLoader(data_val, shuffle = False, batch_size = 32)

# Model Design

In [10]:
class MLP(pl.LightningModule):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(in_features = input_dim, out_features = 256),
            nn.Dropout(p = 0.3),
            nn.ReLU(),
            nn.Linear(in_features = 256, out_features = 128),
            nn.Dropout(p = 0.3),
            nn.ReLU(),
            nn.Linear(in_features = 128, out_features = 64),
            nn.Dropout(p = 0.3),
            nn.ReLU(),
            nn.Linear(in_features = 64, out_features = 1),
            nn.Sigmoid()
        )
    def forward(self,x):
        return self.model(x).view(-1)
    
    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=0.01)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        BCE = nn.BCELoss()
        y_hat = self(x)
        loss = BCE(y_hat, y.float())
        self.log('train_loss', loss, prog_bar = True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        BCE = nn.BCELoss()
        y_hat = self(x)
        pred = torch.round(y_hat)
        loss = BCE(y_hat, y.float())
        acc = accuracy(pred, y, task = 'binary')
        prec = precision(pred, y, task = 'binary')
        rec = recall(pred, y, task = 'binary')

        self.log('val_loss', loss, prog_bar = True)
        self.log('val_acc', acc, prog_bar = True)
        self.log('val_prec', prec)
        self.log('val_rec', rec)

        return loss

In [17]:
input_dim = X_train.shape[1] #where X_train is the raw dataset
model = MLP(input_dim)

checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints/",
    save_top_k=2,
    monitor="val_acc",
    mode="max"
)
early_stopping = EarlyStopping(
    monitor="val_acc",
    min_delta=0.1,
    patience=2,
    verbose=False,
    mode="max"
)

trainer = pl.Trainer(
    max_epochs=1000,
    callbacks=[early_stopping, checkpoint_callback],
    log_every_n_steps=1,
    deterministic=True
)

print(model)
trainer.fit(model, data_train, data_val)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


MLP(
  (model): Sequential(
    (0): Linear(in_features=65536, out_features=256, bias=True)
    (1): Dropout(p=0.3, inplace=False)
    (2): ReLU()
    (3): Linear(in_features=256, out_features=128, bias=True)
    (4): Dropout(p=0.3, inplace=False)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=64, bias=True)
    (7): Dropout(p=0.3, inplace=False)
    (8): ReLU()
    (9): Linear(in_features=64, out_features=1, bias=True)
    (10): Sigmoid()
  )
)


Output()

AttributeError: module 'torch' has no attribute 'tesor'