<a href="https://colab.research.google.com/github/SrinathMLOps/MLPractise/blob/main/Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install and import everything we need
%pip install transformers datasets scikit-learn torch gradio

import torch
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
from sklearn.metrics import accuracy_score
import gradio as gr
import os

# Report the environment once the imports above have succeeded.
_cuda_ready = torch.cuda.is_available()
for _status_line in (
    "✅ All libraries imported successfully!",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {_cuda_ready}",
):
    print(_status_line)

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
if _cuda_ready:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


In [None]:
# Create a simple dataset for training
print("📊 Creating sample dataset...")

# How many times each unique review is repeated to pad out the toy dataset.
REPEAT_FACTOR = 5

# Sample movie reviews with sentiment labels (unique reviews only).
base_texts = [
    "This movie is absolutely fantastic! I loved every minute of it.",
    "Terrible film, waste of time and money.",
    "Amazing cinematography and brilliant acting throughout.",
    "Boring and predictable storyline, very disappointing.",
    "One of the best movies I've ever seen, highly recommended!",
    "Poor script and bad direction, couldn't wait for it to end.",
    "Great entertainment value, fun for the whole family.",
    "Awful movie, terrible acting and confusing plot.",
    "Beautifully crafted story with excellent character development.",
    "Complete disaster, worst movie of the year.",
    "Exceptional film with outstanding performances.",
    "Dull and uninteresting, fell asleep halfway through.",
    "Masterpiece of cinema, truly inspiring and moving.",
    "Horrible experience, regret watching this garbage.",
    "Wonderful storytelling and beautiful visuals.",
    "Pathetic attempt at filmmaking, completely unwatchable.",
    "Incredible movie that touched my heart deeply.",
    "Stupid plot with terrible special effects.",
    "Brilliant direction and superb soundtrack.",
    "Waste of talent, poor execution throughout.",
]

# Labels: 1 = Positive, 0 = Negative (aligned index-for-index with base_texts)
base_labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

if len(base_texts) != len(base_labels):
    raise ValueError(
        f"texts/labels length mismatch: {len(base_texts)} vs {len(base_labels)}"
    )

# Split the UNIQUE reviews first and only then repeat each split.
# Repeating before splitting (as a plain `list * 5` followed by a slice would)
# puts every test review into the training set too, so the evaluation score
# would only measure memorisation.
split_index = int(0.8 * len(base_texts))
train_texts = base_texts[:split_index] * REPEAT_FACTOR
train_labels = base_labels[:split_index] * REPEAT_FACTOR
test_texts = base_texts[split_index:] * REPEAT_FACTOR
test_labels = base_labels[split_index:] * REPEAT_FACTOR

# Combined views and the training-set size, kept under their original names.
texts = train_texts + test_texts
labels = train_labels + test_labels
train_size = len(train_texts)

positive_count = sum(labels)
print("✅ Dataset created!")
print(f"   Train samples: {len(train_texts)}")
print(f"   Test samples: {len(test_texts)}")
print(f"   Positive samples: {positive_count}")
print(f"   Negative samples: {len(labels) - positive_count}")


In [None]:
# Fetch the pretrained checkpoint and its matching tokenizer.
print("🤖 Loading model and tokenizer...")

MODEL_NAME = "distilbert-base-uncased"

# The classifier is configured for two output labels (negative / positive).
_classifier_options = {"num_labels": 2}
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, **_classifier_options
)

print("✅ Model and tokenizer loaded!")

# Next stage: turn the raw review strings into model inputs.
print("🔤 Tokenizing datasets...")

def tokenize_data(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Truncation and padding are enabled, sequences are capped at 128 tokens,
    and the result is requested as PyTorch tensors (``return_tensors="pt"``).
    """
    encode_options = {
        "truncation": True,
        "padding": True,
        "max_length": 128,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)

# Encode each split, then wrap the encodings and labels as Dataset objects.
train_encodings = tokenize_data(train_texts)
test_encodings = tokenize_data(test_texts)


def _as_dataset(encodings, split_labels):
    """Bundle token ids, attention mask and labels into a ``Dataset``."""
    columns = {key: encodings[key] for key in ('input_ids', 'attention_mask')}
    columns['labels'] = split_labels
    return Dataset.from_dict(columns)


train_dataset = _as_dataset(train_encodings, train_labels)
test_dataset = _as_dataset(test_encodings, test_labels)

print("✅ Tokenization completed!")
print(f"   Train dataset: {train_dataset}")
print(f"   Test dataset: {test_dataset}")


In [None]:
# Version-tolerant fine-tuning: build arguments once, train once, save, evaluate.
print("🚀 Starting training...")


def compute_metrics(eval_pred):
    """Compute accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds one
            score per class for each sample and is reduced with ``argmax``
            over axis 1.

    Returns:
        dict: ``{'accuracy': value}`` as computed by ``accuracy_score``.
    """
    predictions, labels = eval_pred
    predicted_ids = np.argmax(predictions, axis=1)
    return {'accuracy': accuracy_score(labels, predicted_ids)}


def _build_training_args():
    """Create ``TrainingArguments`` accepted by the installed transformers.

    The per-epoch evaluation keyword is spelled ``eval_strategy`` in newer
    releases and ``evaluation_strategy`` in older ones. Each spelling is tried
    in turn; only the ``TypeError`` raised for an unknown keyword is treated
    as "wrong spelling". If neither is accepted, a minimal configuration
    without per-epoch evaluation is returned.
    """
    base_kwargs = dict(
        output_dir='./results',
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        num_train_epochs=2,
        # The string "none" disables logging integrations. Passing None does
        # not: transformers 4.x treats it as "all installed integrations".
        report_to="none",
    )
    full_kwargs = dict(
        save_strategy='epoch',
        per_device_eval_batch_size=4,
        weight_decay=0.01,
        logging_steps=10,
        save_total_limit=2,
    )

    for strategy_kwarg in ('eval_strategy', 'evaluation_strategy'):
        print(f"Trying TrainingArguments with {strategy_kwarg} parameter...")
        try:
            return TrainingArguments(
                **base_kwargs,
                **full_kwargs,
                **{strategy_kwarg: 'epoch'},
            )
        except TypeError as e:
            print(f"{strategy_kwarg} not accepted: {e}")

    print("Falling back to minimal training arguments...")
    return TrainingArguments(**base_kwargs)


training_success = False

# Training runs exactly once. Retrying after a mid-training failure would
# continue from partially updated weights of the same `model` object, so a
# genuine training error is reported instead of being retried.
try:
    training_args = _build_training_args()

    # eval_dataset is always supplied so trainer.evaluate() below works even
    # when the minimal fallback arguments are in use.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()
except Exception as e:  # notebook boundary: report and leave the flag False
    print(f"❌ Training failed: {e}")
else:
    training_success = True

if training_success:
    print("🎉 Training completed successfully!")

    # Save the model
    model.save_pretrained('./trained_model')
    tokenizer.save_pretrained('./trained_model')
    print("✅ Model saved to './trained_model'")

    # Evaluate
    eval_results = trainer.evaluate()
    print(f"📊 Evaluation results: {eval_results}")
else:
    print("❌ Training did not complete; no model was saved.")


In [None]:
# Reload the saved checkpoint from disk for inference.
print("📥 Loading trained model for inference...")

# Model and tokenizer both come from the directory written after training.
_checkpoint_dir = './trained_model'
inference_model = AutoModelForSequenceClassification.from_pretrained(_checkpoint_dir)
inference_tokenizer = AutoTokenizer.from_pretrained(_checkpoint_dir)

# Switch the model to evaluation mode before serving predictions.
inference_model.eval()
print("✅ Model loaded for inference!")

# Define prediction function
def predict_sentiment(text):
    """Predict the sentiment of a single piece of text.

    Args:
        text: The text to classify. ``None``, empty, or whitespace-only input
            is rejected without running the model.

    Returns:
        tuple: ``(sentiment, confidence)``. ``sentiment`` is ``"Positive 😊"``
        when the predicted class id is 1 and ``"Negative 😞"`` otherwise;
        ``confidence`` is the softmax probability of the predicted class as a
        float. Rejected input yields ``("Please enter some text", 0.0)``.
    """
    # Guard against None as well as blank strings: None has no .strip().
    if text is None or not text.strip():
        return "Please enter some text", 0.0

    # Tokenize input
    inputs = inference_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    )
    # Keep the inputs on the same device as the model so the forward pass
    # also works if the model has been moved off the CPU.
    model_device = inference_model.device
    inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

    # Make prediction (no gradients needed for inference)
    with torch.no_grad():
        outputs = inference_model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        confidence, predicted_class = torch.max(probabilities, dim=-1)

    sentiment = "Positive 😊" if predicted_class.item() == 1 else "Negative 😞"
    confidence_score = confidence.item()

    return sentiment, confidence_score

# Test the model on a few hand-written reviews.
# Deliberately named sample_texts rather than test_texts: reusing test_texts
# would overwrite the held-out test split created earlier in the file, and
# re-running the tokenization cell would then pair these 5 strings with the
# 20 test labels.
sample_texts = [
    "This movie is absolutely fantastic!",
    "I hated every moment of this film.",
    "It was okay, nothing special.",
    "Amazing story with great acting!",
    "Boring and predictable plot."
]

print("\n🧪 Testing the model:")
print("=" * 50)
for sample in sample_texts:
    sentiment, confidence = predict_sentiment(sample)
    print(f"Text: '{sample}'")
    print(f"Prediction: {sentiment} (Confidence: {confidence:.3f})")
    print("-" * 50)

print("✅ Inference testing completed!")


In [None]:
# Create Gradio interface
print("🎭 Creating Gradio interface...")


def gradio_predict(text):
    """Run sentiment analysis for the Gradio UI.

    Returns a ``(markdown, confidence)`` pair. Blank input yields a prompt
    message with confidence 0.0; any exception raised while predicting or
    formatting is reported as ``"Error: <message>"`` with confidence 0.0.
    """
    stripped = text.strip()
    if not stripped:
        return "Please enter some text to analyze.", 0.0

    try:
        label, score = predict_sentiment(text)

        # Markdown report shown in the result pane.
        report = f"""
### 🎭 Sentiment Analysis Result

**Text:** "{text}"

**Prediction:** {label}

**Confidence:** {score:.1%}
        """
    except Exception as err:
        return f"Error: {str(err)}", 0.0

    return report, score

# Build the UI: header, input column, confidence column, result pane, examples.
_example_reviews = [
    "This movie is absolutely amazing! Best film I've ever seen.",
    "Terrible movie, complete waste of time and money.",
    "It was okay, nothing too special but not bad either.",
    "Brilliant acting and fantastic storyline throughout.",
    "Boring plot with poor character development.",
]

with gr.Blocks(title="Sentiment Analysis", theme=gr.themes.Soft()) as demo:
    gr.HTML("""
    <div style="text-align: center; padding: 20px;">
        <h1>🎭 Movie Review Sentiment Analysis</h1>
        <p>Enter any text to analyze its sentiment (Positive or Negative)</p>
    </div>
    """)

    with gr.Row():
        # Left column: free-text input and the trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Enter text to analyze",
                placeholder="Type your movie review or any text here...",
                lines=3
            )
            predict_btn = gr.Button("Analyze Sentiment", variant="primary")

        # Right column: numeric confidence readout.
        with gr.Column(scale=1):
            confidence_output = gr.Number(label="Confidence Score", precision=3)

    result_output = gr.Markdown(
        label="Analysis Result",
        value="Enter text above and click 'Analyze Sentiment' to see results."
    )

    # Examples: one single-column row per sample review.
    gr.Examples(
        examples=[[review] for review in _example_reviews],
        inputs=text_input
    )

    # Event handlers: the button click and pressing Enter in the textbox
    # both run the same prediction and fill the same two outputs.
    for _register_event in (predict_btn.click, text_input.submit):
        _register_event(
            fn=gradio_predict,
            inputs=text_input,
            outputs=[result_output, confidence_output]
        )

print("✅ Gradio interface created!")
print("🚀 Launching interface...")

# Launch the interface.
# No fixed server_port: Gradio then searches for a free port starting at 7860.
# Pinning the port makes re-running this cell fail while the server from a
# previous run in the same kernel still holds it.
demo.launch(
    share=True,  # Creates a public URL
    server_name="0.0.0.0",  # listen on all network interfaces
    show_error=True
)
