<a href="https://colab.research.google.com/github/ShivangiChy/Sentiment-Analysis/blob/main/Sentiment%20Analysis%20using%20BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets
!pip install scikit-learn


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch


In [None]:
from datasets import load_dataset

# Load the dataset published under the name "imdb".
dataset = load_dataset("imdb")

# Print a summary of the loaded dataset object.
print(dataset)


In [None]:
from transformers import BertTokenizer

# Tokenizer for the 'bert-base-uncased' checkpoint (the same checkpoint the model is loaded from later).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize the ``'text'`` entries of a batch of examples.

    The module-level ``tokenizer`` is called with ``'max_length'`` padding and
    truncation enabled, and its result is returned unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Run the tokenizer over the dataset in batches and keep the result under the same name.
dataset = dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and the label, formatted as torch tensors.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Sanity check: show the first formatted training example.
print(dataset['train'][0])


In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import classification_report, confusion_matrix
import torch
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Report how many examples the train and test splits contain.
print(f"Train Set Size: {len(dataset['train'])}")
print(f"Test Set Size: {len(dataset['test'])}")


In [None]:
# Load the 'bert-base-uncased' checkpoint as a sequence classifier with two output labels.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Print the model's module structure.
print(model)


In [None]:
import inspect

# The keyword that selects the evaluation schedule is named `eval_strategy` in
# newer transformers releases and `evaluation_strategy` in older ones. The
# install cell does not pin a version, so pick whichever name the installed
# release accepts instead of failing with a TypeError on an unknown keyword.
_training_arg_names = inspect.signature(TrainingArguments).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    save_strategy="epoch",           # Save model every epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay for optimization
    logging_dir='./logs',            # Logging directory
    **{_eval_strategy_key: "epoch"}, # Evaluate every epoch
)


In [None]:
import inspect


def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Without this callback the Trainer's evaluation only reports the loss.

    Parameters
    ----------
    eval_pred
        Object exposing ``predictions`` (the per-class scores, one row per
        example) and ``label_ids`` (the reference labels).

    Returns
    -------
    dict
        ``{"accuracy": ..., "f1": ...}`` computed with scikit-learn.
    """
    # The predicted class is the index of the highest score in each row.
    predicted_labels = np.argmax(eval_pred.predictions, axis=-1)
    return {
        "accuracy": accuracy_score(eval_pred.label_ids, predicted_labels),
        "f1": f1_score(eval_pred.label_ids, predicted_labels),
    }


# Newer transformers releases take the tokenizer through `processing_class`;
# older ones only know `tokenizer`. Use whichever the installed release accepts.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,                         # The model to train
    args=training_args,                  # Training arguments
    train_dataset=dataset['train'],      # Training dataset
    eval_dataset=dataset['test'],        # Evaluation dataset
    compute_metrics=compute_metrics,     # Report accuracy/F1 in addition to loss
    **{_tokenizer_key: tokenizer},       # Tokenizer used to preprocess the text
)


In [None]:
# Fine-tune the model using the Trainer configured above.
trainer.train()


In [None]:
# Evaluate on the Trainer's eval_dataset (the test split passed to Trainer above).
eval_results = trainer.evaluate()

# Show the returned evaluation results.
print(eval_results)


In [None]:

# Run the trained model over the test split.
predictions = trainer.predict(dataset['test'])

# Reference labels, and predicted labels taken as the index of the largest score per row.
true_labels = predictions.label_ids
preds = np.argmax(predictions.predictions, axis=1)

# Class names in label order: 0 -> negative, 1 -> positive.
class_names = ['negative', 'positive']

# Per-class precision / recall / F1.
print(classification_report(true_labels, preds, target_names=class_names))

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(true_labels, preds)

# Draw the matrix as an annotated heatmap on an explicit Axes object.
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
# Save the model and the tokenizer to local directories so they can be reloaded later
# (the inference cell at the end of the notebook loads './sentiment_model').
model.save_pretrained('./sentiment_model')
tokenizer.save_pretrained('./sentiment_tokenizer')


In [None]:
# Check the dataset structure (at this point `dataset` is the tokenized, torch-formatted version).
print(dataset)

In [None]:
# Display the first 5 rows of the training dataset.
# Because of the earlier set_format call, only 'input_ids', 'attention_mask' and 'label' are returned.
print(dataset['train'][:5])


In [None]:
from collections import Counter


def _count_labels(split):
    """Return a Counter mapping each integer label in ``split['label']`` to its frequency.

    Iterating and converting with ``int()`` works whether the column comes
    back as a tensor or as a plain list of ints. The previous
    ``sum(column == 1)`` only worked for tensors (and printed ``tensor(...)``);
    for a list, ``column == 1`` is ``False`` and ``sum(False)`` raises.
    """
    return Counter(int(label) for label in split['label'])


# Count positive (1) and negative (0) sentiment labels in the training data
train_label_counts = _count_labels(dataset['train'])
positive_count_train = train_label_counts[1]
negative_count_train = train_label_counts[0]

# Count positive (1) and negative (0) sentiment labels in the test data
test_label_counts = _count_labels(dataset['test'])
positive_count_test = test_label_counts[1]
negative_count_test = test_label_counts[0]

print(f"Training set - Positive: {positive_count_train}, Negative: {negative_count_train}")
print(f"Test set - Positive: {positive_count_test}, Negative: {negative_count_test}")


In [None]:
from datasets import load_dataset

# Step 1: Load the IMDb dataset
# NOTE: this rebinds `dataset`, replacing the tokenized, torch-formatted version used for training above.
dataset = load_dataset('imdb')

# Step 2: Map the label (0, 1) to sentiment (negative, positive)
def map_sentiment(label):
    """Return "positive" when ``label`` equals 1, and "negative" for any other value."""
    return "positive" if label == 1 else "negative"

# Step 3: Add a 'sentiment' column (derived from 'label') to the train and test splits
def _attach_sentiment(example):
    """Build the extra ``'sentiment'`` field for a single example from its ``'label'``."""
    return {'sentiment': map_sentiment(example['label'])}


for split_name in ('train', 'test'):
    dataset[split_name] = dataset[split_name].map(_attach_sentiment)

# Preview the first 5 training entries: review text, sentiment, then a separator line.
for idx in range(5):
    review = dataset['train'][idx]
    print(
        f"Review: {review['text']}",
        f"Sentiment: {review['sentiment']}",
        '-' * 100,
        sep="\n",
    )


In [None]:
from datasets import load_dataset

# Load the IMDb dataset again; this rebinds `dataset` to a copy without the 'sentiment' column.
dataset = load_dataset('imdb')

def map_sentiment(label):
    """Convert a numeric label to a sentiment name.

    A label equal to 1 gives "positive"; every other value gives "negative".
    """
    is_positive = label == 1
    if is_positive:
        return "positive"
    return "negative"

# Add the 'sentiment' column to both splits.
for split_name in ('train', 'test'):
    dataset[split_name] = dataset[split_name].map(
        lambda x: {'sentiment': map_sentiment(x['label'])}
    )


def _show_review(review):
    """Print one review's text and sentiment, followed by a separator line."""
    print(f"Review: {review['text']}")
    print(f"Sentiment: {review['sentiment']}")
    print('-' * 100)


# Running totals of how many reviews of each sentiment have been printed.
positive_count = 0
negative_count = 0

# Walk the training split, printing at most 5 reviews per sentiment,
# and stop as soon as both quotas have been reached.
for review in dataset['train']:
    sentiment = review['sentiment']

    if sentiment == 'positive' and positive_count < 5:
        _show_review(review)
        positive_count += 1
    elif sentiment == 'negative' and negative_count < 5:
        _show_review(review)
        negative_count += 1

    if min(positive_count, negative_count) >= 5:
        break


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import numpy as np

# Load the artefacts written by the save cell. The tokenizer is read from
# './sentiment_tokenizer' rather than the hub checkpoint, so inference uses
# exactly the files that were saved alongside the fine-tuned model.
tokenizer = BertTokenizer.from_pretrained('./sentiment_tokenizer')
model = BertForSequenceClassification.from_pretrained('./sentiment_model')  # Path to your trained model directory

# Make inference mode explicit so dropout is disabled during prediction.
model.eval()

def predict_sentiment(text):
    """Classify a single piece of text as "positive" or "negative".

    Relies on the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    text : str
        The text to classify. A single string is expected, because the
        predicted class is extracted with ``.item()``.

    Returns
    -------
    str
        "positive" when the highest-scoring class index is 1, otherwise "negative".
    """
    # Tokenize and encode the input text as PyTorch tensors, truncating at 512 tokens.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**inputs)

    # The predicted class id is the index of the largest logit.
    prediction = torch.argmax(outputs.logits, dim=1).item()

    # Map the class id to its sentiment name.
    return "positive" if prediction == 1 else "negative"

# Test the classifier on a custom sentence and print the predicted sentiment.
new_statement = "Wow, another groundbreaking movie—if groundbreaking means incredibly boring."
predicted_sentiment = predict_sentiment(new_statement)

print(f"Sentiment: {predicted_sentiment}")
