<a href="https://colab.research.google.com/github/VatsalRaina01/Sentiment-analysis-with-BERT/blob/main/sentimentanalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import pandas as pd
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Download the IMDb review archive (cached under the current working
# directory) and extract it.
current_folder = os.getcwd()
dataset = tf.keras.utils.get_file(
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    fname="aclImdb.tar.gz",
    extract=True,
    cache_dir=current_folder,
)

# Derive the train/test folder paths from the location of the downloaded file.
dataset_path = os.path.dirname(dataset)
dataset_dir = os.path.join(dataset_path, 'aclImdb')
train_dir, test_dir = (
    os.path.join(dataset_dir, split_name) for split_name in ('train', 'test')
)

# Load dataset
def load_dataset(directory):
    """Read labelled reviews from the ``pos`` and ``neg`` subfolders of *directory*.

    Args:
        directory: Path to a folder that may contain ``pos`` and/or ``neg``
            subfolders holding UTF-8 text files. Other entries are ignored.

    Returns:
        A ``pandas.DataFrame`` with a ``sentence`` column (file contents) and
        a ``sentiment`` column (1 for ``pos``, 0 for ``neg``). Rows are
        ordered ``pos`` first, then ``neg``, each sorted by file name. A
        missing subfolder contributes no rows.

    Raises:
        OSError: If *directory* cannot be listed or a review file cannot be
            read.
    """
    data = {"sentence": [], "sentiment": []}
    # Listing the parent first keeps the original contract: a missing
    # *directory* raises, while a missing pos/neg subfolder is simply skipped.
    entries = set(os.listdir(directory))
    for folder, label in (("pos", 1), ("neg", 0)):
        if folder not in entries:
            continue
        label_dir = os.path.join(directory, folder)
        # os.listdir order is arbitrary; sort so the frame is reproducible.
        for text_file in sorted(os.listdir(label_dir)):
            with open(os.path.join(label_dir, text_file), "r", encoding="utf-8") as f:
                data["sentence"].append(f.read())
            data["sentiment"].append(label)
    return pd.DataFrame(data)

train_df = load_dataset(train_dir)
test_df = load_dataset(test_dir)

# Plot sentiment counts
sentiment_counts = train_df['sentiment'].value_counts()
# value_counts() orders rows by frequency, not by label value, so the name of
# each bar is looked up from the index rather than assumed to be 0-then-1.
sentiment_labels = sentiment_counts.index.map({0: 'Negative', 1: 'Positive'}).tolist()
fig = px.bar(
    x=sentiment_labels,
    y=sentiment_counts.values,
    # String labels make the colour categorical, which is what
    # color_discrete_sequence applies to.
    color=sentiment_labels,
    color_discrete_sequence=px.colors.qualitative.Dark24,
    title='<b>Sentiment Counts'
)
fig.update_layout(template='plotly_dark')
# Write the chart to an HTML file and open it in the browser.
pyo.plot(fig, filename='Sentiments_Counts.html', auto_open=True)

# Clean text
def text_cleaning(text):
    """Strip HTML markup, bracketed spans and most punctuation from *text*.

    Args:
        text: Raw review text, possibly containing HTML tags.

    Returns:
        The text with tags removed, every ``[...]`` span deleted, and all
        characters other than ASCII letters, digits, whitespace, commas and
        apostrophes dropped.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Join text nodes with a space so the words on either side of a tag such
    # as <br /> are not glued together once the tag is removed.
    plain = soup.get_text(separator=" ")
    plain = re.sub(r'\[[^]]*\]', '', plain)
    return re.sub(r"[^a-zA-Z0-9\s,']", '', plain)

# Store a cleaned copy of every review next to the raw text in both frames.
for _frame in (train_df, test_df):
    _frame['Cleaned_sentence'] = _frame['sentence'].apply(text_cleaning)

# Word clouds
def generate_wordcloud(text, title):
    """Render a word cloud for a collection of strings and show it.

    Args:
        text: Iterable of strings; they are joined with single spaces before
            the cloud is generated.
        title: Title placed above the figure.
    """
    corpus = " ".join(text)
    cloud_builder = WordCloud(
        width=800,
        height=400,
        stopwords=set(STOPWORDS),
        background_color='black',
    )
    cloud_image = cloud_builder.generate(corpus)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.title(title)
    # Hide the axes so only the cloud image and its title are drawn.
    plt.axis("off")
    plt.show()

# One word cloud per class: positive (1) first, then negative (0).
for _sentiment, _title in ((1, 'Positive Review'), (0, 'Negative Review')):
    _class_reviews = train_df.loc[train_df['sentiment'] == _sentiment, 'Cleaned_sentence']
    generate_wordcloud(_class_reviews, _title)

# Prepare train/val/test
Reviews = train_df['Cleaned_sentence']
Target = train_df['sentiment']
test_reviews = test_df['Cleaned_sentence']
test_targets = test_df['sentiment']

# Split the held-out reviews 50/50 into validation and test sets while keeping
# the class balance. The fixed seed makes the split, and therefore the
# reported validation/test metrics, reproducible between runs.
x_val, x_test, y_val, y_test = train_test_split(
    test_reviews,
    test_targets,
    test_size=0.5,
    stratify=test_targets,
    random_state=42,
)

# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

max_len = 128
# All three splits are encoded with the same options: pad, truncate to at most
# max_len tokens, and return TensorFlow tensors.
_encode_options = dict(padding=True, truncation=True, max_length=max_len, return_tensors='tf')
X_train_encoded = tokenizer.batch_encode_plus(Reviews.tolist(), **_encode_options)
X_val_encoded = tokenizer.batch_encode_plus(x_val.tolist(), **_encode_options)
X_test_encoded = tokenizer.batch_encode_plus(x_test.tolist(), **_encode_options)

# Model (TF-ready): load the checkpoint with a two-label classification head.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased-finetuned-sst-2-english', num_labels=2, from_pt=False
)

# Compile: integer class labels are compared against the model's raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(
    loss=loss,
    metrics=[metric],
    optimizer=optimizer,
)

# Train: the model is fed [input_ids, attention_mask] pairs for each split.
_train_inputs = [X_train_encoded['input_ids'], X_train_encoded['attention_mask']]
_val_inputs = [X_val_encoded['input_ids'], X_val_encoded['attention_mask']]
history = model.fit(
    _train_inputs,
    Target,
    validation_data=(_val_inputs, y_val),
    epochs=2,
    batch_size=32,
)

# Evaluate on the test half of the held-out reviews.
_test_inputs = [X_test_encoded['input_ids'], X_test_encoded['attention_mask']]
test_loss, test_accuracy = model.evaluate(_test_inputs, y_test)
print(f'Test loss: {test_loss}, Test accuracy: {test_accuracy}')

# Save the tokenizer and the trained model into sibling folders under `path`.
path = '/content'
_tokenizer_dir = f'{path}/Tokenizer'
_model_dir = f'{path}/Model'
tokenizer.save_pretrained(_tokenizer_dir)
model.save_pretrained(_model_dir)

# Load back from the folders that were just written.
distil_tokenizer = DistilBertTokenizer.from_pretrained(_tokenizer_dir)
distil_model = TFDistilBertForSequenceClassification.from_pretrained(_model_dir, from_pt=False)

# Prediction example
def Get_sentiment(Review, Tokenizer=distil_tokenizer, Model=distil_model):
    """Predict the sentiment label of one or more reviews.

    Args:
        Review: A single review string, or an iterable of review strings
            (list, tuple, pandas Series, ...).
        Tokenizer: Tokenizer used to encode the reviews; defaults to the
            tokenizer reloaded from disk.
        Model: Sequence-classification model used for prediction; defaults to
            the model reloaded from disk.

    Returns:
        A list with one ``'positive'`` or ``'negative'`` string per review, in
        input order. An empty input yields an empty list.
    """
    # Only a bare string is a single review; any other iterable is a batch.
    if isinstance(Review, str):
        Review = [Review]
    else:
        Review = list(Review)
    if not Review:
        # Nothing to encode, so skip the tokenizer and the model entirely.
        return []
    encoded = Tokenizer.batch_encode_plus(Review, padding=True, truncation=True, max_length=128, return_tensors='tf')
    prediction = Model.predict([encoded['input_ids'], encoded['attention_mask']])
    label_map = {1: 'positive', 0: 'negative'}
    # The index of the larger logit in each row is the predicted class id.
    pred_labels = tf.argmax(prediction.logits, axis=1).numpy().tolist()
    return [label_map[i] for i in pred_labels]

# Sample three-line review used to try out the reloaded model.
Review = (
    "Bahubali is a blockbuster Indian movie that was released in 2015.\n"
    "It is the first part of a two-part epic saga that tells the story of a legendary hero who fights for his kingdom and his love.\n"
    "The movie has received rave reviews from critics and audiences alike for its stunning visuals, spectacular action scenes, and captivating storyline."
)
print(Get_sentiment(Review))
