# Unzip the data

In [29]:
!unzip '/content/archive.zip'

Archive:  /content/archive.zip
  inflating: test.csv                
  inflating: testdata.manual.2009.06.14.csv  
  inflating: train.csv               
  inflating: training.1600000.processed.noemoticon.csv  


# Import necessary libraries

In [30]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Load Train and Test data

In [31]:
df_train = pd.read_csv("/content/train.csv", encoding="latin1")
df_test = pd.read_csv("/content/test.csv", encoding="latin1")

In [32]:
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [33]:
df_test.head()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [34]:
df_train.shape, df_test.shape

((27481, 10), (4815, 9))

## Consider only the necessary columns

In [35]:
train_data = pd.DataFrame(df_train[['text', 'sentiment']])

test_data = pd.DataFrame(df_test[['text', 'sentiment']])

In [36]:
train_data.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [37]:
# check the distribution of sentiments in train_data
train_data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
neutral,11118
positive,8582
negative,7781


In [38]:
test_data.head()

Unnamed: 0,text,sentiment
0,Last session of the day http://twitpic.com/67ezh,neutral
1,Shanghai is also really exciting (precisely -...,positive
2,"Recession hit Veronique Branquinho, she has to...",negative
3,happy bday!,positive
4,http://twitpic.com/4w75p - I like it!!,positive


In [39]:
# check the distribution of sentiments in test_data
test_data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
neutral,1430
positive,1103
negative,1001


## Check for missing values and remove rows with missing values

### Train data

In [40]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       27480 non-null  object
 1   sentiment  27481 non-null  object
dtypes: object(2)
memory usage: 429.5+ KB


In [41]:
train_data[train_data.isna().any(axis=1)]

Unnamed: 0,text,sentiment
314,,neutral


In [42]:
train_data = train_data.dropna()

In [43]:
train_data[train_data.isna().any(axis=1)]

Unnamed: 0,text,sentiment


### Test data

In [44]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4815 entries, 0 to 4814
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       3534 non-null   object
 1   sentiment  3534 non-null   object
dtypes: object(2)
memory usage: 75.4+ KB


In [45]:
test_data[test_data.isna().any(axis=1)].shape

(1281, 2)

In [46]:
test_data = test_data.dropna()

In [47]:
test_data[test_data.isna().any(axis=1)]

Unnamed: 0,text,sentiment


## Map the labels

In [48]:
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
train_data['label'] = train_data['sentiment'].map(label_map)
test_data['label'] = test_data['sentiment'].map(label_map)

In [49]:
train_data.head()

Unnamed: 0,text,sentiment,label
0,"I`d have responded, if I were going",neutral,1
1,Sooo SAD I will miss you here in San Diego!!!,negative,0
2,my boss is bullying me...,negative,0
3,what interview! leave me alone,negative,0
4,"Sons of ****, why couldn`t they put them on t...",negative,0


## Custom Dataset for the model

In [50]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Check if GPU is available

In [51]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


# Load the pretrained model

In [52]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## Preprocess the data for the model

In [53]:
train_dataset = CustomDataset(train_data['text'].tolist(), train_data['label'].tolist(), tokenizer)
test_dataset = CustomDataset(test_data['text'].tolist(), test_data['label'].tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## Set hyperparameters

In [54]:
epochs = 5
learning_rate = 2e-5

# Train the pretrained model

In [55]:
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

In [56]:
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    total_batches = 0
    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        total_batches += 1

        # Print loss for batch (only the multiples of 100)
        if batch_idx %100 == 99:
          print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.6f}")

    avg_loss = total_loss / total_batches
    print()
    # Print average loss for each epoch
    print(f"Epoch {epoch+1}/{epochs}, Average Training Loss: {avg_loss:.6f}", end = "\n")
    print(50*'-')

Epoch 1/5, Batch 100/859, Loss: 0.828244
Epoch 1/5, Batch 200/859, Loss: 0.568924
Epoch 1/5, Batch 300/859, Loss: 0.406304
Epoch 1/5, Batch 400/859, Loss: 0.572726
Epoch 1/5, Batch 500/859, Loss: 0.545660
Epoch 1/5, Batch 600/859, Loss: 0.710510
Epoch 1/5, Batch 700/859, Loss: 0.442361
Epoch 1/5, Batch 800/859, Loss: 0.252980

Epoch 1/5, Average Training Loss: 0.590406
--------------------------------------------------
Epoch 2/5, Batch 100/859, Loss: 0.589079
Epoch 2/5, Batch 200/859, Loss: 0.442250
Epoch 2/5, Batch 300/859, Loss: 0.448295
Epoch 2/5, Batch 400/859, Loss: 0.481354
Epoch 2/5, Batch 500/859, Loss: 0.487590
Epoch 2/5, Batch 600/859, Loss: 0.201251
Epoch 2/5, Batch 700/859, Loss: 0.577067
Epoch 2/5, Batch 800/859, Loss: 0.422895

Epoch 2/5, Average Training Loss: 0.418476
--------------------------------------------------
Epoch 3/5, Batch 100/859, Loss: 0.257575
Epoch 3/5, Batch 200/859, Loss: 0.524207
Epoch 3/5, Batch 300/859, Loss: 0.206781
Epoch 3/5, Batch 400/859, Loss:

# Evaluate model on the test data

In [57]:
model.eval()
true_labels = []
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"]
        true_labels.extend(labels.tolist())
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        predictions.extend(predicted.tolist())

## Print the classification report

In [58]:
print(classification_report(true_labels, predictions, target_names=['negative', 'neutral', 'positive'], digits = 6))

              precision    recall  f1-score   support

    negative   0.776031  0.789211  0.782566      1001
     neutral   0.757353  0.720280  0.738351      1430
    positive   0.808824  0.847688  0.827800      1103

    accuracy                       0.779570      3534
   macro avg   0.780736  0.785726  0.782906      3534
weighted avg   0.778708  0.779570  0.778793      3534

