In [None]:
# Install necessary libraries
!pip install transformers datasets torch scikit-learn

# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Read the labelled sentiment CSV and preview its first rows
df = pd.read_csv("/content/sentiment_analysis_1 (1).csv")
print(df.head())

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Tokenize both splits with the DistilBERT tokenizer
# (truncation and padding enabled, at most 128 tokens per text)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(
    train_texts.tolist(), truncation=True, padding=True, max_length=128
)
val_encodings = tokenizer(
    val_texts.tolist(), truncation=True, padding=True, max_length=128
)

# Create a PyTorch Dataset
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            collection (as produced by calling the tokenizer on a list of texts).
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoded field of example `idx` to a tensor,
        # then attach the label under the key 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in the Dataset class
train_dataset = SentimentDataset(encodings=train_encodings, labels=list(train_labels))
val_dataset = SentimentDataset(encodings=val_encodings, labels=list(val_labels))

# Load pretrained DistilBERT with a 3-way classification head
# (assumed sentiment classes: Negative, Neutral, Positive)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3
)

# Import necessary libraries
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments

# Pick the training device first so the precision setting below can depend on it
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU. Consider using a GPU for faster training.")

# Move the model to the selected device
model = model.to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",            # Directory to save model checkpoints
    num_train_epochs=5,                # Max number of epochs
    per_device_train_batch_size=32,    # Batch size for training
    per_device_eval_batch_size=64,     # Batch size for evaluation
    warmup_steps=500,                  # Warmup steps for learning rate scheduler
    weight_decay=0.01,                 # Weight decay for regularization
    logging_dir="./logs",              # Directory for logging
    evaluation_strategy="epoch",       # Evaluate at the end of each epoch
    save_strategy="epoch",             # Save model at the end of each epoch
    save_total_limit=2,                # Keep only the 2 latest checkpoints
    load_best_model_at_end=True,       # Load the best model at the end of training
    logging_steps=100,
    report_to="none",                  # Disable reporting for now
    # Mixed precision is only enabled when a CUDA device was selected above;
    # requesting it unconditionally makes the CPU fallback path unusable.
    fp16=(device.type == "cuda"),
)

# Compute metrics function
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (scores whose last axis is arg-maxed to get the predicted class).

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    true_labels = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(true_labels, predicted),
        "f1": f1_score(true_labels, predicted, average="weighted"),
    }

# Assemble the Trainer; the early-stopping callback is configured with a patience of 2
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Run fine-tuning
trainer.train()

# Run evaluation on the validation dataset and report the metrics
results = trainer.evaluate()
print("Validation results:", results)


                       text  label
0            ['borderland']      2
1                ['border']      2
2            ['borderland']      2
3  ['borderland', 'murder']      2
4            ['borderland']      2


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Using GPU: Tesla T4


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7107,0.661464,0.731128,0.729193
2,0.4442,0.478422,0.822396,0.822271
3,0.2667,0.434037,0.850906,0.850985


In [9]:
# Persist the fine-tuned model and its tokenizer into the same folder so that
# the folder can later be reloaded with `from_pretrained(model_path)`.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.
model_path = "/content/drive/MyDrive/Distilbert-base-uncased"
trainer.save_model(model_path)  # Save model
tokenizer.save_pretrained(model_path)  # Save tokenizer

('/content/drive/MyDrive/Distilbert-base-uncased/tokenizer_config.json',
 '/content/drive/MyDrive/Distilbert-base-uncased/special_tokens_map.json',
 '/content/drive/MyDrive/Distilbert-base-uncased/vocab.txt',
 '/content/drive/MyDrive/Distilbert-base-uncased/added_tokens.json',
 '/content/drive/MyDrive/Distilbert-base-uncased/tokenizer.json')

In [10]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F

# Reload the fine-tuned checkpoint (tokenizer and model) from the saved folder
model_path = "/content/drive/MyDrive/Distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [11]:
def predict_sentiment(texts):
    """Return per-class sentiment probabilities for one or more texts.

    Args:
        texts: A single string or an iterable of strings to classify.

    Returns:
        A list with one dict per input text, shaped as
        ``{"Text": <input>, "Probabilities": {"Negative": p, "Neutral": p, "Positive": p}}``
        where each probability is rounded to 4 decimals.
        An empty input yields an empty list.
    """
    # A bare string would be tokenized as ONE text while `texts[i]` indexed its
    # characters, so normalise it to a one-element batch.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)
    if not texts:
        return []

    # Tokenize input texts
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Keep the input tensors on the same device as the model
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Get model predictions without tracking gradients
    with torch.no_grad():
        logits = model(**inputs).logits

    # Apply softmax over the class axis to get probabilities
    probs = F.softmax(logits, dim=-1)

    # Class order assumed to follow the training labels: 0, 1, 2
    label_names = ("Negative", "Neutral", "Positive")
    return [
        {
            "Text": text,
            "Probabilities": {
                name: round(p.item(), 4) for name, p in zip(label_names, row)
            },
        }
        for text, row in zip(texts, probs)
    ]


In [12]:
# Three sample sentences to sanity-check the reloaded model
texts = [
    "I love this product!",
    "This is the worst experience ever.",
    "guvi has the best data science road map in industry"
]

predictions = predict_sentiment(texts)

# Print each text followed by its class probabilities and a blank separator line
for prediction in predictions:
    lines = [f"Text: {prediction['Text']}"]
    lines.extend(
        f"  {sentiment}: {prob}"
        for sentiment, prob in prediction["Probabilities"].items()
    )
    lines.append("")
    print("\n".join(lines))


Text: I love this product!
  Negative: 0.0445
  Neutral: 0.0423
  Positive: 0.9133

Text: This is the worst experience ever.
  Negative: 0.8653
  Neutral: 0.0566
  Positive: 0.078

Text: guvi has the best data science road map in industry
  Negative: 0.0302
  Neutral: 0.2911
  Positive: 0.6788



In [13]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.9.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.5.2 (from gradio)
  Downloading gradio_client-1.5.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [1]:
#!pip install gradio flask

import gradio as gr
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from flask import request
import logging
from datetime import datetime

# Reload the fine-tuned checkpoint (tokenizer and model) that the app will serve
model_path = "/content/drive/MyDrive/Distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Set up logging
logging.basicConfig(
    filename="user_logs.txt",  # Logs will be saved to this file
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
)

def log_user_details(user_ip, input_text):
    """Record one request (client IP and submitted text) and echo it to stdout.

    Args:
        user_ip: Client address to record (any value usable in an f-string).
        input_text: Text submitted by the user; treated as untrusted.

    Returns:
        None. The message is emitted at INFO level through the root logger
        and printed.
    """
    # The text comes from a public form: escape line breaks so one submission
    # cannot forge additional, authentic-looking log entries.
    safe_text = str(input_text).replace("\r", "\\r").replace("\n", "\\n")
    log_message = f"IP: {user_ip} | Input: {safe_text}"
    logging.info(log_message)
    print(log_message)  # Optional: Print logs for testing in Colab

def predict_sentiment(text, request: "gr.Request" = None):
    """Classify one text and return its sentiment probabilities.

    Args:
        text: The text to classify.
        request: Filled in by Gradio because of the `gr.Request` annotation;
            stays None when the function is called directly. The previous
            implementation read Flask's `request` proxy, which raises outside
            a Flask request context (Gradio does not serve through Flask).

    Returns:
        Dict with the keys "Negative", "Neutral" and "Positive", each mapped
        to a probability rounded to 4 decimals.
    """
    # Resolve the client address if a request object was supplied.
    # NOTE(review): behind the public share link this may be a proxy address — confirm.
    client = getattr(request, "client", None)
    user_ip = getattr(client, "host", None) or "Unknown IP"
    log_user_details(user_ip, text)

    # Predict sentiment
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Keep the input tensors on the same device as the model
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    # Single input text, so only the first row of probabilities is used
    probs = F.softmax(outputs.logits, dim=-1)[0]
    return {
        "Negative": round(probs[0].item(), 4),
        "Neutral": round(probs[1].item(), 4),
        "Positive": round(probs[2].item(), 4),
    }

# Gradio UI: a two-line text box as input, the probability dict rendered as JSON
iface = gr.Interface(
    title="Sentiment Analysis",
    description="Enter some text and get the predicted sentiment probabilities.",
    fn=predict_sentiment,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs=gr.JSON(),
)

# `share=True` additionally creates a public URL for the app
iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e2d74908a9317b1f56.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [2]:
# Show the contents of the request log, or a notice if it has not been created
log_file_path = "user_logs.txt"

try:
    log_file = open(log_file_path, "r")
except FileNotFoundError:
    print(f"Log file '{log_file_path}' not found.")
else:
    with log_file:
        logs = list(log_file)
    print("User Logs:")
    for line in logs:
        # Strip surrounding whitespace, including the trailing newline
        print(line.strip())


Log file 'user_logs.txt' not found.
