# **Sentiments Analyzer**

In [16]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

from datasets import Dataset, DatasetDict

# ✅ Load dataset directly from Kaggle mount path
df = pd.read_csv('/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)
df.columns = ['target','id','date','flag','user','text']

# ✅ Convert target to label (3 classes)
df['label'] = df['target'].replace({0:0, 2:1, 4:2})
df = df[['text','label']]

# ✅ Use a subset for speed
df = df.sample(30000, random_state=42).reset_index(drop=True)

# ✅ Train/test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

train_dataset = Dataset.from_dict({'text': train_texts.tolist(), 'label': train_labels.tolist()})
val_dataset = Dataset.from_dict({'text': val_texts.tolist(), 'label': val_labels.tolist()})
dataset = DatasetDict({'train': train_dataset, 'validation': val_dataset})

# ✅ Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ✅ Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# ✅ Metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# ✅ Training Arguments
training_args = TrainingArguments(
    output_dir="./test-output",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    load_best_model_at_end=True,
)

# ✅ Trainer
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['validation'],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

  trainer = Trainer(


In [9]:

# ✅ Train and Evaluate
trainer.train()
trainer.evaluate()

# ✅ Save Model
trainer.save_model('bert_sentiment_model')
tokenizer.save_pretrained("bert_sentiment_model")

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4338,0.418425,0.812167,0.811908,0.814979,0.812167
2,0.2705,0.737286,0.824667,0.824664,0.824663,0.824667
3,0.1257,0.945715,0.822667,0.822623,0.822744,0.822667


('bert_sentiment_model/tokenizer_config.json',
 'bert_sentiment_model/special_tokens_map.json',
 'bert_sentiment_model/vocab.txt',
 'bert_sentiment_model/added_tokens.json')

In [12]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load model and tokenizer from saved directory
model = BertForSequenceClassification.from_pretrained("bert_sentiment_model")
tokenizer = BertTokenizer.from_pretrained("bert_sentiment_model")

# Set model to evaluation mode
model.eval()
def predict_sentiment(text):
    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()

    # Optional: label mapping (0: negative, 1: neutral, 2: positive)
    label_map = {0: "negative", 1: "neutral", 2: "positive"}
    return label_map[predicted_class_id]
print(predict_sentiment("I hate this product! It works perfectly."))
print(predict_sentiment("umm i am neutral"))
print(predict_sentiment("Worst experience ever. Totally disappointed."))


negative
positive
negative


In [13]:
import os
os.listdir("bert_sentiment_model")

['config.json',
 'vocab.txt',
 'tokenizer_config.json',
 'training_args.bin',
 'model.safetensors',
 'special_tokens_map.json']