In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, AutoModelForSequenceClassification

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
data = pd.read_csv("/kaggle/input/chat-sentiment-dataset/chat_dataset.csv", encoding='unicode_escape')
data.head()

Unnamed: 0,message,sentiment
0,I really enjoyed the movie,positive
1,The food was terrible,negative
2,I'm not sure how I feel about this,neutral
3,The service was excellent,positive
4,I had a bad experience,negative


In [4]:
data['positive'] = data['sentiment'] == 'positive'
data['negative'] = data['sentiment'] == 'negative'
data['neutral'] = data['sentiment'] == 'neutral'

In [5]:
data = data.drop('sentiment', axis=1)
data.head()

Unnamed: 0,message,positive,negative,neutral
0,I really enjoyed the movie,True,False,False
1,The food was terrible,False,True,False
2,I'm not sure how I feel about this,False,False,True
3,The service was excellent,True,False,False
4,I had a bad experience,False,True,False


In [6]:
labels = ['positive', 'negative','neutral']
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
id2label

{0: 'positive', 1: 'negative', 2: 'neutral'}

In [7]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8
EPOCHS = 3
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.model_max_length = MAX_LEN

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
def tokenize(sentence, tokenizer):
    tokenized_sentence = []

    sentence = sentence.strip()

    for word in sentence.split():  # Fixed the syntax error here

        # Tokenize the word and count the number of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

    return tokenized_sentence

In [9]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, label2id):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label2id = label2id
        self.id2label = id2label
        
        
    def __getitem__(self, index):
        sentence = self.data['message'].iloc[index]
        labels = [
            int(self.data['positive'].iloc[index]),
            int(self.data['negative'].iloc[index]),
            int(self.data['neutral'].iloc[index])
        ]

        # Tokenize and align labels
        tokenized_sentence = tokenize(sentence, self.tokenizer)

        # Add special tokens
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]

        # Truncate if exceeding max length
        if len(tokenized_sentence) > self.max_len:
            tokenized_sentence = tokenized_sentence[:self.max_len - 1] + ["[SEP]"]

        while len(tokenized_sentence) < self.max_len:
            tokenized_sentence.append("[PAD]")

        # Attention mask (1 for real tokens, 0 for padding)
        attn_mask = [1 if token != "[PAD]" else 0 for token in tokenized_sentence]

        # Convert tokens and labels to IDs
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

    def __len__(self):
        return len(self.data)

In [10]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(data, test_size=0.4, random_state=42)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [11]:
training_set = CustomDataset(train_df, tokenizer, MAX_LEN, label2id)
testing_set = CustomDataset(test_df, tokenizer, MAX_LEN, label2id)

In [12]:
train_df.head()

Unnamed: 0,message,positive,negative,neutral
0,This weather is crazy âï¸ð§ï¸,False,False,True
1,I'm feeling so blessed to have such amazing pe...,True,False,False
2,The book was not interesting,False,True,False
3,The scenery here is beautiful,True,False,False
4,This is a terrible company,False,True,False


In [13]:
training_set[1]

{'ids': tensor([  101,  1045,  1005,  1049,  3110,  2061, 10190,  2000,  2031,  2107,
          6429,  2111,  1999,  2026,  2166,  1098, 29648,  7737,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [14]:
training_loader = DataLoader(training_set, batch_size = TRAIN_BATCH_SIZE, shuffle = True, num_workers = 0)
testing_loader = DataLoader(testing_set, batch_size = TEST_BATCH_SIZE, shuffle = False, num_workers = 0)

In [15]:
for batch in training_loader:
    print(batch['ids'].shape)
    print(batch['labels'].shape)
    print(batch['mask'].shape)
    print(batch['labels'])
    break

torch.Size([8, 128])
torch.Size([8, 3])
torch.Size([8, 128])
tensor([[1, 0, 0],
        [0, 0, 1],
        [0, 0, 1],
        [0, 0, 1],
        [0, 0, 1],
        [0, 1, 0],
        [0, 0, 1],
        [1, 0, 0]])


In [16]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
with torch.no_grad():  # Prevents computation graph buildup
    for x in testing_loader:
        ids = x["ids"].to(device)
        mask = x["mask"].to(device)
        labels = x["labels"].to(device).float()  # Convert labels to Float type
        model = model.to(device)

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        print(outputs)
        
        del ids, mask, labels
        torch.cuda.empty_cache()
        break

SequenceClassifierOutput(loss=tensor(0.6780, device='cuda:0'), logits=tensor([[-0.2510, -0.1886, -0.2394],
        [-0.1720, -0.1742, -0.1425],
        [-0.2324, -0.1588, -0.2045],
        [-0.0756, -0.2277, -0.0721],
        [ 0.0632, -0.2868, -0.0052],
        [-0.1216, -0.1800, -0.1718],
        [-0.2984,  0.0125, -0.1158],
        [-0.2984,  0.0125, -0.1158]], device='cuda:0'), hidden_states=None, attentions=None)


In [18]:
model.config.label2id

{'positive': 0, 'negative': 1, 'neutral': 2}

In [19]:
from transformers import AdamW
import torch.nn as nn
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)



In [20]:
from sklearn.metrics import accuracy_score, classification_report

# Function to calculate metrics
def compute_metrics(preds, labels, id2label):
    preds = (preds >= 0.5).astype(int)  # Apply threshold
    target_names = [id2label[i] for i in range(len(id2label))]  # Extract class names in order
    
    accuracy = accuracy_score(labels, preds)  # Compute accuracy
    report = classification_report(labels, preds, target_names=target_names, zero_division=1)

    print(f"Accuracy: {accuracy:.4f}")
    print(report)

In [21]:
# Training loop
def train_model(model, train_loader):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        # Move batch to GPU if available
        input_ids = batch["ids"].to(device)
        attention_mask = batch["mask"].to(device)
        labels = batch["labels"].to(device).float()  # Labels in their original type

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss  # Model already computes the loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Loss = {avg_loss:.4f}")

In [22]:
# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["ids"].to(device)
            attention_mask = batch["mask"].to(device)
            labels = batch["labels"].to(device).float()  # Labels in their original type

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss  # Model already computes the loss
            total_loss += loss.item()

            # Convert logits to probabilities
            logits = outputs.logits
            preds = torch.sigmoid(logits).cpu().numpy()
            labels = labels.cpu().numpy()

            all_preds.append(preds)
            all_labels.append(labels)

    avg_loss = total_loss / len(val_loader)

    # Concatenate all predictions and labels

    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)

    
    print(f"Validation Loss = {avg_loss:.4f}")
    # Compute metrics
    compute_metrics(all_preds, all_labels, model.config.id2label)

In [23]:
evaluate_model(model, testing_loader)

Validation Loss = 0.6763
Accuracy: 0.0684
              precision    recall  f1-score   support

    positive       0.35      0.12      0.18        68
    negative       0.24      0.15      0.18        60
     neutral       0.00      0.00      0.00       106

   micro avg       0.25      0.07      0.11       234
   macro avg       0.19      0.09      0.12       234
weighted avg       0.16      0.07      0.10       234
 samples avg       0.81      0.07      0.07       234



In [24]:
for epochs in range(0, EPOCHS):
    train_model(model, training_loader)
    evaluate_model(model, testing_loader)

Loss = 0.6064
Validation Loss = 0.5046
Accuracy: 0.4658
              precision    recall  f1-score   support

    positive       0.90      0.91      0.91        68
    negative       1.00      0.00      0.00        60
     neutral       0.96      0.44      0.61       106

   micro avg       0.92      0.47      0.62       234
   macro avg       0.95      0.45      0.50       234
weighted avg       0.95      0.47      0.54       234
 samples avg       0.96      0.47      0.47       234

Loss = 0.3441
Validation Loss = 0.2171
Accuracy: 0.9060
              precision    recall  f1-score   support

    positive       0.92      0.97      0.94        68
    negative       0.90      0.92      0.91        60
     neutral       0.93      0.88      0.90       106

   micro avg       0.92      0.91      0.92       234
   macro avg       0.92      0.92      0.92       234
weighted avg       0.92      0.91      0.92       234
 samples avg       0.92      0.91      0.91       234

Loss = 0.1505
Vali

In [25]:
print(tokenizer.model_max_length)
print(model.config.id2label)
print(model.config.label2id)

128
{0: 'positive', 1: 'negative', 2: 'neutral'}
{'positive': 0, 'negative': 1, 'neutral': 2}


In [26]:
sentence = "This movie ws ok"

In [30]:
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=128, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [27]:
import torch

# Tokenization
tokenized_sentence = tokenize(sentence, tokenizer)
print(tokenized_sentence)

# Add special tokens
tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]

# Truncate if too long
max_len = 128  # Define max length
if len(tokenized_sentence) > tokenizer.model_max_length:
    tokenized_sentence = tokenized_sentence[:tokenizer.model_max_length - 1] + ["[SEP]"]

# Pad if too short
while len(tokenized_sentence) < tokenizer.model_max_length:
    tokenized_sentence.append("[PAD]")

print(tokenized_sentence)

# Convert tokens to IDs
attn_mask = [1 if token != "[PAD]" else 0 for token in tokenized_sentence]
ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)

# Convert lists to tensors
ids = torch.tensor([ids], dtype=torch.long).to(device)
attn_mask = torch.tensor([attn_mask], dtype=torch.long).to(device)

# Model inference
with torch.no_grad():  # Prevents computation graph buildup
    model = model.to(device)
    outputs = model(input_ids=ids, attention_mask=attn_mask)
    
    # Extract logits & apply sigmoid
    logits = outputs.logits
    preds = torch.sigmoid(logits).cpu().numpy()
    
    # Convert to binary predictions
    preds = (preds >= 0.5).astype(int)
    
    # Map predictions to labels using id2label
    predicted_labels = [model.config.id2label[i] for i, val in enumerate(preds[0]) if val == 1]
    
    print(f"Raw Predictions: {preds}")
    print(f"Predicted Labels: {predicted_labels}")

['this', 'movie', 'w', '##s', 'ok']
['[CLS]', 'this', 'movie', 'w', '##s', 'ok', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '

In [31]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [32]:
model_name = "bert-finetuned-single_class_classification"

# Upload tokenizer to the hub
tokenizer.push_to_hub(
    repo_id="ParitKansal/{}".format(model_name),  # Correct repo_id format
    commit_message="Add tokenizer",
    use_temp_dir=True,
)

# Upload model to the hub
model.push_to_hub(
    repo_id="ParitKansal/{}".format(model_name),  # Correct repo_id format
    commit_message="Add model",
    use_temp_dir=True,
)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ParitKansal/bert-finetuned-single_class_classification/commit/759ba0e88869b5d082906e07aa761f8f9652d7fb', commit_message='Add model', commit_description='', oid='759ba0e88869b5d082906e07aa761f8f9652d7fb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ParitKansal/bert-finetuned-single_class_classification', endpoint='https://huggingface.co', repo_type='model', repo_id='ParitKansal/bert-finetuned-single_class_classification'), pr_revision=None, pr_num=None)

In [33]:
model_ = model

In [34]:
tokenizer_ = tokenizer

In [35]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the pre-trained tokenizer and model
model_name = "ParitKansal/bert-finetuned-single_class_classification"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [37]:
import torch
sentence = "This movie ws ok"
# Tokenization
tokenized_sentence = tokenize(sentence, tokenizer)
print(tokenized_sentence)

# Add special tokens
tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]

# Truncate if too long
max_len = 128  # Define max length
if len(tokenized_sentence) > tokenizer.model_max_length:
    tokenized_sentence = tokenized_sentence[:tokenizer.model_max_length - 1] + ["[SEP]"]

# Pad if too short
while len(tokenized_sentence) < tokenizer.model_max_length:
    tokenized_sentence.append("[PAD]")

print(tokenized_sentence)

# Convert tokens to IDs
attn_mask = [1 if token != "[PAD]" else 0 for token in tokenized_sentence]
ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)

# Convert lists to tensors
ids = torch.tensor([ids], dtype=torch.long).to(device)
attn_mask = torch.tensor([attn_mask], dtype=torch.long).to(device)

# Model inference
with torch.no_grad():  # Prevents computation graph buildup
    model = model.to(device)
    outputs = model(input_ids=ids, attention_mask=attn_mask)
    
    # Extract logits & apply sigmoid
    logits = outputs.logits
    preds = torch.sigmoid(logits).cpu().numpy()
    
    # Convert to binary predictions
    preds = (preds >= 0.5).astype(int)
    
    # Map predictions to labels using id2label
    predicted_labels = [model.config.id2label[i] for i, val in enumerate(preds[0]) if val == 1]
    
    print(f"Raw Predictions: {preds}")
    print(f"Predicted Labels: {predicted_labels}")

['this', 'movie', 'w', '##s', 'ok']
['[CLS]', 'this', 'movie', 'w', '##s', 'ok', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '