<a href="https://colab.research.google.com/github/SeniyaSultan/SentimentClassifier/blob/main/app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# import sys
# if "google.colab" in sys.modules:
#     !pip install transformers scikit-learn gradio torch matplotlib


In [3]:
# STEP 2: Import libraries
import random
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

In [4]:
#  STEP 3: Generate synthetic eco-tweets
positive_data = [f"This eco-friendly product is amazing! #{i}" for i in range(500)]
negative_data = [f"This green product is a total scam. #{i}" for i in range(500)]

df_pos = pd.DataFrame({'text': positive_data, 'label': 1})
df_neg = pd.DataFrame({'text': negative_data, 'label': 0})
df = pd.concat([df_pos, df_neg]).sample(frac=1).reset_index(drop=True)

In [5]:
#  STEP 4: Split train/val
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

In [6]:
# STEP 5: Tokenization with BERT
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
#  STEP 6: Dataset class
class EcoTweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} | {
            'labels': torch.tensor(self.labels[idx])
        }

    def __len__(self):
        return len(self.labels)

train_dataset = EcoTweetDataset(train_encodings, train_labels)
val_dataset = EcoTweetDataset(val_encodings, val_labels)

In [8]:

# STEP 7: Load BERT
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
#  STEP 8: Training setup
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,                # Can be 1 for fast test
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"                  # no wandb
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [10]:
# STEP 9: Train the model
trainer.train()

  return forward_call(*args, **kwargs)


Step,Training Loss
10,0.3551
20,0.0238
30,0.0029
40,0.001
50,0.0006
60,0.0004
70,0.0004
80,0.0003
90,0.0003
100,0.0003


TrainOutput(global_step=150, training_loss=0.025753703856219848, metrics={'train_runtime': 345.1648, 'train_samples_per_second': 6.953, 'train_steps_per_second': 0.435, 'total_flos': 17266663008000.0, 'train_loss': 0.025753703856219848, 'epoch': 3.0})

In [11]:
# Save model and tokenizer to local folder
model.save_pretrained("eco-sentiment-bert")
tokenizer.save_pretrained("eco-sentiment-bert")


('eco-sentiment-bert/tokenizer_config.json',
 'eco-sentiment-bert/special_tokens_map.json',
 'eco-sentiment-bert/vocab.txt',
 'eco-sentiment-bert/added_tokens.json',
 'eco-sentiment-bert/tokenizer.json')

In [12]:
!pip install gradio transformers torch matplotlib --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [13]:
from transformers import pipeline
import gradio as gr
import matplotlib.pyplot as plt

In [14]:
classifier = pipeline("sentiment-analysis")

label_emojis = {
    "POSITIVE": "🌿 Positive",
    "NEGATIVE": "🚫 Negative"
}


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


In [15]:
# Generate the bar chart plot
def create_bar_plot(scores):
    fig, ax = plt.subplots()
    ax.bar(scores.keys(), scores.values(), color=['green', 'red'])
    ax.set_ylabel('Confidence (%)')
    ax.set_ylim([0, 100])
    ax.set_title("Sentiment Confidence Chart")
    return fig

# Main prediction function
def classify_sentiment(text):
    result = classifier(text)[0]
    label = result['label']
    score = round(result['score'] * 100, 2)

    # Assign confidence to both sentiments for plotting
    scores = {
        "Positive": score if label == "POSITIVE" else 100 - score,
        "Negative": 100 - score if label == "POSITIVE" else score
    }

    # Explanation sentence
    explanation = f"📝 The model predicts the text is **{label_emojis[label]}** with **{score}%** confidence."

    return (
        f"{label_emojis[label]}",
        f"{score}%",
        create_bar_plot(scores),
        explanation
    )

# Examples
examples = [
    ["This biodegradable plastic is incredible!"],
    ["This product is just another green scam."],
    ["I love this eco toothbrush."],
    ["This company cares about the environment."],
    ["Not impressed with this 'eco' bag."]
]


In [16]:
# Gradio Interface
interface = gr.Interface(
    fn=classify_sentiment,
    inputs=gr.Textbox(lines=3, placeholder="Enter your review or comment here...", label="Input Text"),
    outputs=[
        gr.Textbox(label="Sentiment"),
        gr.Textbox(label="Confidence"),
        gr.Plot(label="Confidence Chart"),
        gr.Markdown(label="Explanation")
    ],
    examples=examples,
    title="🌍 Sentiment Classifier for Eco Product Reviews",
    description="Analyze sentiment (Positive or Negative) about eco-friendly product reviews with confidence score and chart."
)

interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://21cc924d1eaea4e6b0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


