In [1]:
import pandas as pd

train_df = pd.read_csv("tweet_eval_train.csv")
val_df   = pd.read_csv("tweet_eval_validation.csv")
test_df  = pd.read_csv("tweet_eval_test.csv")


In [2]:
def minimal_clean(df):
    df = df.dropna(subset=["text"])
    df["clean_text"] = df["text"].str.strip().str.lower()
    return df


In [3]:
train_df = minimal_clean(train_df)
val_df   = minimal_clean(val_df)
test_df  = minimal_clean(test_df)

In [4]:
print(train_df[["text", "clean_text"]].head())
print(val_df[["text", "clean_text"]].head())
print(test_df[["text", "clean_text"]].head())

                                                text  \
0  "QT @user In the original draft of the 7th boo...   
1  "Ben Smith / Smith (concussion) remains out of...   
2  Sorry bout the stream last night I crashed out...   
3  Chase Headley's RBI double in the 8th inning o...   
4  @user Alciato: Bee will invest 150 million in ...   

                                          clean_text  
0  "qt @user in the original draft of the 7th boo...  
1  "ben smith / smith (concussion) remains out of...  
2  sorry bout the stream last night i crashed out...  
3  chase headley's rbi double in the 8th inning o...  
4  @user alciato: bee will invest 150 million in ...  
                                                text  \
0  Dark Souls 3 April Launch Date Confirmed With ...   
1  "National hot dog day, national tequila day, t...   
2  When girls become bandwagon fans of the Packer...   
3  @user I may or may not have searched it up on ...   
4  Here's your starting TUESDAY MORNING Line up a... 

In [5]:
print(train_df.columns)

Index(['text', 'label', 'clean_text'], dtype='object')


In [6]:
train_df["label"].value_counts(normalize=True)*100

label
1    45.320618
2    39.129672
0    15.549710
Name: proportion, dtype: float64

In [7]:
val_df["label"].value_counts(normalize=True)*100

label
1    43.45
2    40.95
0    15.60
Name: proportion, dtype: float64

In [8]:
test_df["label"].value_counts(normalize=True)*100

label
1    48.331162
0    32.334744
2    19.334093
Name: proportion, dtype: float64

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
sample_text = train_df["clean_text"].iloc[0]
print("Text:",sample_text)
print("Tokens:",tokenizer.tokenize(sample_text))

Text: "qt @user in the original draft of the 7th book, remus lupin survived the battle of hogwarts. #happybirthdayremuslupin"
Tokens: ['"', 'q', '##t', '@', 'user', 'in', 'the', 'original', 'draft', 'of', 'the', '7th', 'book', ',', 're', '##mus', 'lu', '##pin', 'survived', 'the', 'battle', 'of', 'hog', '##wart', '##s', '.', '#', 'happy', '##bir', '##th', '##day', '##rem', '##us', '##lu', '##pin', '"']


In [12]:
encoded = tokenizer(
    sample_text,
    padding = "max_length",
    truncation = True,
    max_length = 20,
    return_tensors = "pt"
)
encoded

{'input_ids': tensor([[  101,  1000,  1053,  2102,  1030,  5310,  1999,  1996,  2434,  4433,
          1997,  1996,  5504,  2338,  1010,  2128,  7606, 11320,  8091,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
encoded["input_ids"]
encoded["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [14]:
MAX_LEN=128

train_encodings = tokenizer(
    train_df["clean_text"].tolist(),
    truncation=True,
    padding=True,
    max_length=MAX_LEN
)
val_encodings = tokenizer(
    val_df["clean_text"].tolist(),
    truncation=True,
    padding=True,
    max_length=MAX_LEN
)
test_encodings= tokenizer(
    test_df["clean_text"].tolist(),
    truncation=True,
    padding=True,
    max_length=MAX_LEN
)


In [15]:
train_labels = train_df["label"].tolist()
val_labels = val_df["label"].tolist()
test_labels = test_df["label"].tolist()

In [16]:
import torch
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [17]:
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

In [18]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
sample = train_dataset[0]

with torch.no_grad():
    output = model(
        input_ids = sample["input_ids"].unsqueeze(0),
        attention_mask = sample["attention_mask"].unsqueeze(0)
    )
output.logits

tensor([[-0.0787, -0.0919,  0.1474]])

In [20]:
tokenizer.save_pretrained("save_tokenizer")

('save_tokenizer\\tokenizer_config.json',
 'save_tokenizer\\special_tokens_map.json',
 'save_tokenizer\\vocab.txt',
 'save_tokenizer\\added_tokens.json',
 'save_tokenizer\\tokenizer.json')

In [21]:
import pickle

with open("train_encodings.pkl", "wb") as f:
    pickle.dump(train_encodings, f)

with open("val_encodings.pkl", "wb") as f:
    pickle.dump(val_encodings, f)

with open("test_encodings.pkl", "wb") as f:
    pickle.dump(test_encodings, f)

with open("labels.pkl", "wb") as f:
    pickle.dump((train_labels, val_labels, test_labels), f)


In [22]:
import torch 
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("usage device:",device)
model.to(device)

usage device: cuda


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [23]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())


True
1


In [24]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


2.5.1+cu121
True
NVIDIA GeForce RTX 2050


In [25]:
import sys
print(sys.executable)


C:\Users\Vijay\AppData\Local\Programs\Python\Python310\python.exe


In [26]:
from transformers import AutoTokenizer
tokenizer=AutoTokenizer.from_pretrained("save_tokenizer")


In [27]:
import pickle

with open("train_encodings.pkl", "rb") as f:
    train_encodings = pickle.load(f)

with open("val_encodings.pkl", "rb") as f:
    val_encodings = pickle.load(f)

with open("test_encodings.pkl", "rb") as f:
    test_encodings = pickle.load(f)

with open("labels.pkl", "rb") as f:
    train_labels, val_labels, test_labels = pickle.load(f)

In [28]:
from torch.utils.data import DataLoader
BATCH_SIZE=16
train_loader=DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)
val_loader=DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False
)

In [29]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights

array([2.14366277, 0.73550041, 0.85186845])

In [30]:
import numpy as np

unique, counts = np.unique(train_labels, return_counts=True)
print(dict(zip(unique, counts)))


{np.int64(0): np.int64(7093), np.int64(1): np.int64(20673), np.int64(2): np.int64(17849)}


In [31]:
class_weights=torch.tensor(class_weights,dtype=torch.float).to(device)
class_weights

tensor([2.1437, 0.7355, 0.8519], device='cuda:0')

In [32]:
import torch.nn as nn
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [33]:
from torch.optim import AdamW
optimizer=AdamW(
    model.parameters(),
    lr = 2e-5
)

In [34]:
model.train()
batch = next(iter(train_loader))

input_ids=batch["input_ids"].to(device)
attention_mask=batch["attention_mask"].to(device)
labels=batch["labels"].to(device)

optimizer.zero_grad()
outputs=model(
    input_ids=input_ids,
    attention_mask=attention_mask
    )
loss=criterion(outputs.logits,labels)
loss.backward()
optimizer.step()
loss

tensor(1.0878, device='cuda:0', grad_fn=<NllLossBackward0>)

In [40]:
EPOCHS = 1

In [41]:
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")


    
    model.train()
    total_train_loss = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        loss = criterion(outputs.logits, labels)
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_loader)
    print("Training loss:", avg_train_loss)

    
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            loss = criterion(outputs.logits, labels)
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print("Validation loss:", avg_val_loss)



Epoch 1/1
Training loss: 0.507747250177575
Validation loss: 0.6567128905653954


In [38]:
import torch

torch.save(model.state_dict(), "best_model_epoch1.pt")
print("Best model saved successfully")


Best model saved successfully


### Day-3 Summary
- Best validation loss: 0.6217
- Model saved as: best_model_epoch1.pt
- Training stopped due to overfitting
- Epoch 1/1
-  Training loss: 0.6862482709535086
-  Validation loss: 0.6217153009176254
