In [None]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U bertviz
!pip install -U umap-learn
!pip install seaborn --upgrade

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV read in the next cell and the
# saved model folder both live under this mount point.
drive.mount('/content/drive')

In [None]:
import pandas as pd
# Load the IMDB reviews CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IMDB Dataset.csv")

In [None]:
# Column dtypes and non-null counts (info() prints its summary directly).
df.info()
# Missing values per column; rendered as the cell's output.
df.isna().sum()

In [None]:
# Number of rows for each distinct value of the `sentiment` column.
df['sentiment'].value_counts()

# Dataset Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Horizontal bar chart of how many reviews carry each sentiment label,
# smallest class first.
label_counts = df['sentiment'].value_counts(ascending=True)
fig, ax = plt.subplots()
label_counts.plot.barh(ax=ax)
ax.set_title("Frequency of Classes")
plt.show()

In [None]:
# Word count per review, grouped by sentiment. The rows are reviews (not
# tweets), so the column is labelled accordingly. The count is attached to a
# copy via assign() so `df` itself is not mutated; a helper column on `df`
# would otherwise be carried into every split built from it later.
review_lengths = df.assign(**{"Words per Review": df["review"].str.split().str.len()})
review_lengths.boxplot("Words per Review", by="sentiment")

# Text to Token Conversion

In [None]:
from transformers import AutoTokenizer

In [None]:
# Checkpoint name shared by the tokenizer here and the models loaded later.
model_ckpt = "bert-base-uncased"  # uncased checkpoint: "ENGLISH" and "english" are treated alike
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


# Encode one sample sentence and print the resulting encoding for inspection.
text = "We love to sleep! Winters are awesome!"
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Tokenizer facts: number of entries in the vocab mapping, the reported
# vocab_size, and the maximum sequence length the tokenizer is configured for.
len(tokenizer.vocab),tokenizer.vocab_size, tokenizer.model_max_length

# Data Loader and Train Test Split

In [None]:

from sklearn.model_selection import train_test_split

# Fixed seed so the split (and therefore every downstream metric) is the same
# on each Restart & Run All.
SPLIT_SEED = 42

# 70% train; the remaining 30% is split 2:1 into test (20%) and validation (10%).
# Both splits are stratified on the sentiment label to keep class balance.
train, holdout = train_test_split(
    df, test_size=0.3, stratify=df['sentiment'], random_state=SPLIT_SEED
)
test, validation = train_test_split(
    holdout, test_size=1/3, stratify=holdout['sentiment'], random_state=SPLIT_SEED
)

# No row may be lost or duplicated by the two-stage split.
assert len(train) + len(test) + len(validation) == len(df)
train.shape, test.shape, validation.shape

In [None]:
from datasets import Dataset, DatasetDict

# Wrap the three pandas splits as Hugging Face datasets. preserve_index=False
# keeps the shuffled pandas index from becoming an extra column.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (
        ("train", train),
        ("test", test),
        ("validation", validation),
    )
})

In [None]:
# Display the DatasetDict built in the previous cell.
dataset

# Tokenization of emotion/sentiment data

In [None]:
# Peek at the first training example.
dataset['train'][0]

In [None]:
def tokenize(batch):
    """Tokenize the ``review`` texts of a batch with the notebook's tokenizer.

    Args:
        batch: Mapping with a ``'review'`` entry holding the texts to encode.

    Returns:
        The tokenizer output for those texts, padded to the longest sequence
        in the batch, truncated where necessary, and including token type ids.
    """
    return tokenizer(
        batch['review'],
        padding=True,
        truncation=True,
        return_token_type_ids=True,
    )

# Smoke test on the first two training rows.
print(tokenize(dataset['train'][0:2]))

In [None]:
# Tokenize every split. batched=True passes lists of reviews to `tokenize`;
# batch_size=None hands each split over as one single batch (per the datasets
# docs), so padding=True inside `tokenize` pads to the longest review of that split.
emotion_encoded=dataset.map(tokenize, batched=True, batch_size=None)

In [None]:
# Build real label <-> id mappings from the classes present in the training
# split (previously every label was mapped to itself, and every row was
# visited to do so).
# Reverse alphabetical order gives positive -> 0, negative -> 1, the same
# integer encoding the following cell applies to the splits.
sentiment_classes = sorted(dataset['train'].unique('sentiment'), reverse=True)
label2id = {name: idx for idx, name in enumerate(sentiment_classes)}
id2label = {idx: name for name, idx in label2id.items()}
label2id, id2label

In [None]:
# Integer id for each sentiment class.
# NOTE(review): assumes the CSV only contains "positive" and "negative" (check
# the value_counts output near the top). The former "neutral" key inflated
# len(label2id) and added a confusion-matrix tick label with no matching row/column.
label2id = {"positive": 0, "negative": 1}
# Keep the inverse mapping in sync; it is handed to the model config later.
id2label = {idx: name for name, idx in label2id.items()}

# Apply the mapping to all splits (an unknown label raises KeyError here
# instead of being silently mis-encoded).
emotion_encoded = emotion_encoded.map(lambda x: {"sentiment": label2id[x["sentiment"]]})

# Model Building

In [None]:
from transformers import AutoModel
import torch

In [None]:
# Load the bare pretrained encoder for inspection; `model` is reassigned to a
# sequence-classification model a few cells further down.
model= AutoModel.from_pretrained(model_ckpt)

In [None]:
# Show the module structure of the loaded model.
model

In [None]:
# Name of an alternative (cased) checkpoint; not used by any later cell shown here.
model_two = "bert-base-cased"
# Kept as the last expression so the configuration is actually rendered
# (a bare expression followed by another statement displays nothing).
model.config

In [None]:
from transformers import AutoModelForSequenceClassification, AutoConfig

Num_labels = len(label2id)
# Derive the inverse mapping right here so the config always receives integer
# ids -> label names that agree with label2id, whatever an earlier cell left
# in `id2label`. The size of the classification head follows from id2label.
id2label = {idx: name for name, idx in label2id.items()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = AutoConfig.from_pretrained(model_ckpt, label2id=label2id, id2label=id2label)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)

In [None]:
# Show which device was selected (cuda when available, otherwise cpu).
device


In [None]:
# Show the structure of the sequence-classification model created above.
model

In [None]:
from transformers import TrainingArguments

batch_size=8
training_dir = "bert_base_training_dir"

# Hyper-parameters for fine-tuning; evaluation runs once at the end of each epoch.
training_args = TrainingArguments(
    output_dir=training_dir,
    overwrite_output_dir=True,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # `evaluation_strategy` was renamed to `eval_strategy`; the first cell
    # installs the latest transformers, where the old keyword is rejected.
    eval_strategy='epoch',
    disable_tqdm=False,
)

In [None]:
! pip install evaluate

In [None]:
# Inspect the column names and feature types of the encoded training split.
print(emotion_encoded["train"].features)

In [None]:
# Rename the target column from `sentiment` to `labels`.
# NOTE(review): presumably because the Trainer/model look for a `labels`
# column — confirm. Not idempotent: running this cell a second time fails
# because the `sentiment` column no longer exists.
emotion_encoded = emotion_encoded.rename_column("sentiment", "labels")


Other than the above method, you can also use scikit-learn to calculate accuracy.

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; argmax over the last axis
            gives the predicted class id).

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }

# Build Model and Trainer

In [None]:
from transformers import Trainer

# Train on the train split and evaluate on the validation split; the test
# split is held back for the final predict() call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotion_encoded['train'],
    eval_dataset=emotion_encoded['validation'],
    compute_metrics=compute_metrics,
    # `tokenizer=` has been superseded by `processing_class=` in current
    # transformers releases (the notebook installs the latest version).
    processing_class=tokenizer,
)

In [None]:
# Fine-tune the model with the arguments configured above; evaluation uses the
# validation split that was passed to the Trainer.
trainer.train()

In [None]:
# Run the fine-tuned model on the held-out test split (the missing `.` in
# `trainer.predict` was a syntax error) and show the resulting metrics.
preds_output = trainer.predict(emotion_encoded['test'])
preds_output.metrics

In [None]:
# Raw model outputs for the test split; the argmax over these is taken in the next cell.
preds_output.predictions

In [None]:
import numpy as np  # was never imported anywhere in the notebook

# Predicted class id = index of the highest score in each row.
y_pred = np.argmax(preds_output.predictions, axis=1)
# The target column was renamed to `labels` earlier, so `['label']` raised a
# KeyError. Selecting the column directly also avoids materialising every
# tokenized column just to read the labels.
y_true = list(emotion_encoded['test']['labels'])

In [None]:
from sklearn.metrics import classification_report
# Text report of per-class metrics comparing the test labels with the predictions.
print(classification_report(y_true, y_pred))

In [None]:
# Show the label -> id mapping; its keys label the confusion-matrix axes below.
label2id

In [None]:
from sklearn.metrics import confusion_matrix  # was never imported

# Fix the row/column order to the ids in label2id so the tick labels are
# guaranteed to line up with the matrix (and its size matches the label count).
class_ids = list(label2id.values())
tick_labels = list(label2id.keys())
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

plt.figure(figsize=(5,5))
sns.heatmap(cm, annot=True, xticklabels=tick_labels, yticklabels=tick_labels, fmt='d', cbar=False, cmap='Reds')
plt.ylabel("Actual")  # was `plt.yabel`, which raises AttributeError
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.show()

# Build Prediction Function and Save Model

In [None]:
# Drive folder from which the fine-tuned model and its tokenizer are loaded below.
model_path = '/content/drive/MyDrive/Colab Notebooks/bert-base-uncased-sentiment-model'


In [None]:
import os

from transformers import BertForSequenceClassification, BertTokenizer

# On a fresh run nothing has been written to `model_path` yet, so loading
# would fail. Persist the fine-tuned model first in that case; an already
# saved model is left untouched.
# NOTE(review): relies on the Trainer also writing the tokenizer files into
# the folder — confirm for the installed transformers version.
if not os.path.isdir(model_path):
    trainer.save_model(model_path)

# Load the model
model = BertForSequenceClassification.from_pretrained(model_path)

# Load the tokenizer (optional, if needed for inference)
tokenizer = BertTokenizer.from_pretrained(model_path)


In [None]:
import torch

# Sample text
# Classify one hand-written sentence with the reloaded model.
text = "the movie was good!"

# Encode as PyTorch tensors, capped at 512 tokens.
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# Class id with the highest logit; id 0 is reported as positive, anything else as negative.
predicted_class = torch.argmax(outputs.logits, dim=-1).item()
print("Positive" if predicted_class == 0 else "Negative")



In [None]:
# Class id -> display name. Training encoded "positive" as 0 and "negative" as 1
# (and the cell above prints "Positive" for class 0), so the previous
# {1: 'Positive', 0: 'Negative'} reported every prediction inverted.
id2label = {0: 'Positive', 1: 'Negative'}

In [None]:
text = "I am super happy today"
def get_prediction(text):
    """Return the sentiment name predicted for a single text.

    Args:
        text: The text to classify.

    Returns:
        The entry of ``id2label`` for the class with the highest logit.
    """
    # truncation=True keeps long inputs (e.g. full reviews) within the model's
    # maximum sequence length instead of failing in the forward pass.
    input_encoded = tokenizer(text, return_tensors='pt', truncation=True)
    # Put the inputs on the same device as the model weights.
    input_encoded = input_encoded.to(model.device)
    with torch.no_grad():
        outputs = model(**input_encoded)
    pred = torch.argmax(outputs.logits, dim=1).item()
    return id2label[pred]
get_prediction(text)

In [None]:
# Save to the Drive folder that the loading cells read from. The previous
# relative folder name was a different location from every load path used in
# this notebook.
trainer.save_model(model_path)

In [None]:
from transformers import pipeline, AutoTokenizer, BertForSequenceClassification

# Folder holding the saved model and tokenizer. It is defined once and used
# for both loads (previously this variable pointed at a different folder and
# was never used, while the real path was repeated in each call).
model_path = '/content/drive/MyDrive/Colab Notebooks/bert-base-uncased-sentiment-model'

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Create the pipeline
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

# Perform predictions on a small batch of example sentences
text = "This is a great day!"
predictions = classifier([text, 'A very good day indeed', "We loved MTH111", "I am feeling anxious"])

# Print predictions, one result per input
for prediction in predictions:
    print(prediction)
