# 🤖 Sentiment Analysis using BERT
This notebook fine-tunes a pre-trained BERT model (`bert-base-uncased`) for sentiment classification on Twitter data and saves the resulting model for later inference.

In [None]:
# 📦 Install dependencies (Run this in Colab or your local machine)
#!pip install transformers datasets scikit-learn
#!pip install transformers[torch]
#!pip install 'accelerate>=0.26.0'



In [2]:
# 📚 Import libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import torch

In [3]:
# 📁 Load dataset
# The CSV is read without a header row, so columns are addressed by position:
# column 3 is kept as "text" and column 2 as "label".
df = (
    pd.read_csv('twitter_training.csv', header=None)[[3, 2]]
    .rename(columns={3: "text", 2: "label"})
    .dropna(subset=["text", "label"])  # discard rows missing either field
    .assign(text=lambda frame: frame["text"].astype(str))  # force text to str
)
df.head()


Unnamed: 0,text,label
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


In [4]:
# 🏷️ Encode labels
# Keep the original labels in `label_name` and always fit the encoder on that
# column. Fitting on `df['label']` directly would, on a second run of this cell,
# see the already-encoded integers and replace `label_encoder.classes_` with
# numbers, losing the original label names.
label_encoder = LabelEncoder()
if 'label_name' not in df.columns:
    df['label_name'] = df['label']
df['label'] = label_encoder.fit_transform(df['label_name'])

In [5]:
# 🔀 Train-test split
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df.loc[:, 'text'],
    df.loc[:, 'label'],
    random_state=42,
    test_size=0.2,
)

In [6]:
# 🔄 Convert to HuggingFace Dataset
# Each split is handed to Dataset.from_dict as plain Python lists, one per column.
train_dataset = Dataset.from_dict(
    dict(text=train_texts.tolist(), label=train_labels.tolist())
)
test_dataset = Dataset.from_dict(
    dict(text=test_texts.tolist(), label=test_labels.tolist())
)

In [7]:
# 🧠 Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples, max_length=None):
    """Tokenize a batch of examples with the BERT tokenizer.

    Args:
        examples: Batch mapping whose 'text' entry holds the raw strings
            (what `Dataset.map(..., batched=True)` passes in).
        max_length: Fixed length every sequence is padded and truncated to.
            The default ``None`` leaves the choice to the tokenizer, which
            per the transformers documentation falls back to the model's
            maximum input length. That is the behaviour this cell had before
            the parameter existed. A smaller value shortens every padded
            sequence, at the cost of truncating longer texts.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )


# To use a shorter fixed length, add fn_kwargs={'max_length': N} to both calls.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/59196 [00:00<?, ? examples/s]

Map:   0%|          | 0/14800 [00:00<?, ? examples/s]

In [8]:
# 🎯 Prepare dataset for PyTorch
# Both splits expose the same three columns as torch tensors.
torch_columns = ('input_ids', 'attention_mask', 'label')
for split in (train_dataset, test_dataset):
    split.set_format(type='torch', columns=list(torch_columns))

In [9]:
# 🤖 Load BERT model
# Store the encoder's class names in the model config. Without this the config
# only knows the number of labels, so a model reloaded from disk cannot map a
# predicted class index back to its label name.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# ⚙️ Training configuration
# One epoch, batches of 8 per device for both training and evaluation,
# logging every 10 steps.
training_args = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=10,
    logging_dir='./logs',
    output_dir='./results',
)


In [11]:
# 🏋️‍♂️ Trainer setup
# The held-out split is registered as the evaluation set; this cell itself
# only launches training.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

trainer.train()

Step,Training Loss
10,1.3846
20,1.37
30,1.3603
40,1.3421
50,1.2629
60,1.2949
70,1.2631
80,1.1493
90,1.2347
100,1.1572


KeyboardInterrupt: 

In [12]:
# 💾 Save the trained model
model.save_pretrained("saved_bert_model")
tokenizer.save_pretrained("saved_bert_model")

# Also save just the model weights for Streamlit.
# Tensors are copied to CPU first: a state dict saved from a GPU-resident model
# records the GPU device, and loading it on a CPU-only machine then requires an
# explicit map_location. For a model already on CPU, .cpu() is a no-op.
cpu_state_dict = {name: tensor.cpu() for name, tensor in model.state_dict().items()}
torch.save(cpu_state_dict, "model.pth")
