<a href="https://colab.research.google.com/github/Ranjithakishore/guvi-ml-project/blob/main/ml_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')
# from google.colab import drive
# drive.flush_and_unmount()

Mounted at /content/drive


In [None]:
# 🛠️ Install required packages (Run this cell first)
!pip install fastapi uvicorn transformers torch nest-asyncio pyngrok

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.8-py3-none-any.whl.metadata (10 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cu

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import os
import time
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss, accuracy_score, f1_score, roc_auc_score

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        self.labels = labels.tolist() if hasattr(labels, 'tolist') else list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.FloatTensor(self.labels[idx])
        }

def preprocess_text(text):
    """Clean and normalize text data."""
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

def create_labels(df):
    """Create multi-label categories based on review content with 10 categories."""
    labels = []

    for _, row in df.iterrows():
        review = row['combined_text'].lower()
        # Initialize 10 categories: [Product Quality, Customer Service, Price, Functionality,
        # Technical Issues, Shipping/Delivery, User Experience, Product Compatibility,
        # Product Features, Others]
        label = [0] * 10

        # 1. Product Quality
        if any(word in review for word in [
            'quality', 'durable', 'reliable', 'broken', 'defect', 'build', 'material',
            'sturdy', 'flimsy', 'solid', 'construction', 'craftsmanship', 'made well',
            'poor quality', 'high quality', 'wear and tear', 'lasting'
        ]):
            label[0] = 1

        # 2. Customer Service
        if any(word in review for word in [
            'service', 'support', 'help', 'customer', 'return', 'refund', 'warranty',
            'representative', 'agent', 'response', 'contact', 'assistance', 'helpful',
            'unhelpful', 'responsive', 'communication', 'customer care', 'service desk'
        ]):
            label[1] = 1

        # 3. Price
        if any(word in review for word in [
            'price', 'cost', 'expensive', 'cheap', 'value', 'worth', 'affordable',
            'overpriced', 'bargain', 'discount', 'deal', 'money', 'pricing',
            'investment', 'budget', 'premium', 'economical', 'pricey'
        ]):
            label[2] = 1

        # 4. Functionality
        if any(word in review for word in [
            'work', 'function', 'feature', 'performance', 'capability', 'operates',
            'working', 'functional', 'operation', 'performing', 'works well',
            'doesn\'t work', 'stopped working', 'malfunctioning', 'operational'
        ]):
            label[3] = 1

        # 5. Technical Issues
        if any(word in review for word in [
            'bug', 'error', 'crash', 'glitch', 'problem', 'issue', 'malfunction',
            'freeze', 'stuck', 'technical', 'software', 'hardware', 'failure',
            'not working', 'broken', 'repair', 'fix', 'troubleshoot'
        ]):
            label[4] = 1

        # 6. Shipping/Delivery
        if any(word in review for word in [
            'shipping', 'delivery', 'arrived', 'package', 'shipment', 'late',
            'damaged', 'tracking', 'carrier', 'box', 'packaging', 'shipped',
            'transit', 'arrival', 'delayed', 'on time', 'shipping speed'
        ]):
            label[5] = 1

        # 7. User Experience
        if any(word in review for word in [
            'easy', 'difficult', 'simple', 'complicated', 'intuitive', 'user friendly',
            'confusing', 'straightforward', 'complex', 'learning curve', 'usability',
            'convenient', 'inconvenient', 'experience', 'interface', 'accessibility'
        ]):
            label[6] = 1

        # 8. Product Compatibility
        if any(word in review for word in [
            'compatible', 'compatibility', 'works with', 'fit', 'fits', 'connection',
            'connect', 'paired', 'sync', 'integration', 'supported', 'incompatible',
            'version', 'system', 'device', 'platform', 'setup'
        ]):
            label[7] = 1

        # 9. Product Features
        if any(word in review for word in [
            'feature', 'specification', 'specs', 'capability', 'option', 'setting',
            'configuration', 'customization', 'design', 'functionality', 'built-in',
            'included', 'additional', 'extra', 'advanced', 'basic', 'innovative'
        ]):
            label[8] = 1

        # 10. Others (catch-all category for reviews that don't fit above categories
        # or contain general feedback)
        if (sum(label) == 0) or any(word in review for word in [
            'recommend', 'suggestion', 'feedback', 'general', 'overall',
            'impression', 'thought', 'opinion', 'review', 'comment',
            'miscellaneous', 'other', 'else', 'additional'
        ]):
            label[9] = 1

        # print(f"Processing review: {review}", label)

        labels.append(label)

        # print(f"Processed final labels:", labels)

    return np.array(labels)

def train_model(model, train_loader, val_loader, device, num_epochs=1):
    """Train the BERT model."""
    optimizer = AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)
                val_loss += loss.item()

        print(f'Epoch {epoch + 1}:')
        print(f'Training Loss: {total_loss/len(train_loader):.4f}')
        print(f'Validation Loss: {val_loss/len(val_loader):.4f}')

def evaluate_model(model, test_loader, device):
    """Evaluate the model using various metrics."""
    model.eval()
    all_preds = []
    all_labels = []
    start_time = time.time()

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.sigmoid(outputs.logits)
            preds = (preds > 0.5).float()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Calculate metrics
    hamming = hamming_loss(all_labels, all_preds)
    subset_acc = accuracy_score(all_labels, all_preds)
    micro_f1 = f1_score(all_labels, all_preds, average='micro')
    macro_f1 = f1_score(all_labels, all_preds, average='macro')

    # Calculate AUC-ROC for each category
    auc_scores = []
    for i in range(all_labels.shape[1]):
        auc = roc_auc_score(all_labels[:, i], all_preds[:, i])
        auc_scores.append(auc)

    inference_time = time.time() - start_time

    return {
        'hamming_loss': hamming,
        'subset_accuracy': subset_acc,
        'micro_f1': micro_f1,
        'macro_f1': macro_f1,
        'auc_scores': auc_scores,
        'inference_time': inference_time
    }

def main():
    # Load and preprocess data
    print("Loading data...")
    # json_file_path = os.path.join('data', 'electronics_reviews.json')
    json_file_path = '/content/drive/MyDrive/electronics_reviews.json'
    df = pd.read_json(json_file_path, lines=True)

    # Use only 50000 records for testing
    df = df.sample(n=50000, random_state=42)
    print(f"Using {len(df)} records for testing")

    # Combine review text and summary
    df['combined_text'] = df['reviewText'] + ' ' + df['summary']

    # Clean text
    df['combined_text'] = df['combined_text'].apply(preprocess_text)

    # Create labels
    labels = create_labels(df)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        df['combined_text'], labels, test_size=0.2, random_state=42
    )

    # Initialize tokenizer and model
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=10,
    problem_type="multi_label_classification"
    )
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # model = BertForSequenceClassification.from_pretrained(
    #     'bert-base-uncased',
    #     num_labels=10,
    #     problem_type="multi_label_classification"
    # )

    # Create datasets
    train_dataset = ReviewDataset(X_train, y_train, tokenizer)
    test_dataset = ReviewDataset(X_test, y_test, tokenizer)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8)

    print(f"Training samples: {len(train_dataset)}")
    print(f"Testing samples: {len(test_dataset)}")

    # Train model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    print("Training model...")
    train_model(model, train_loader, test_loader, device)

    # Evaluate model
    print("Evaluating model...")
    metrics = evaluate_model(model, test_loader, device)

    print("\nModel Evaluation Results:")
    print(f"Hamming Loss: {metrics['hamming_loss']:.4f}")
    print(f"Subset Accuracy: {metrics['subset_accuracy']:.4f}")
    print(f"Micro F1 Score: {metrics['micro_f1']:.4f}")
    print(f"Macro F1 Score: {metrics['macro_f1']:.4f}")
    print("\nAUC-ROC Scores for each category:")
    categories = ['Product Quality', 'Customer Service', 'Price', 'Functionality', 'Technical Issues', 'Shipping/Delivery', 'User Experience', 'Product Compatibility', 'Product Features', 'Others']
    for cat, score in zip(categories, metrics['auc_scores']):
        print(f"{cat}: {score:.4f}")
    print(f"\nInference Time: {metrics['inference_time']:.2f} seconds")

    # Save the model
    print("\nSaving model...")
    model.save_pretrained('trained_model')
    tokenizer.save_pretrained('trained_model')
    print("Model saved successfully!")

if __name__ == "__main__":
    main()


Loading data...
Using 50000 records for testing


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training samples: 40000
Testing samples: 10000
Training model...
Epoch 1:
Training Loss: 0.2309
Validation Loss: 0.1425
Evaluating model...

Model Evaluation Results:
Hamming Loss: 0.0612
Subset Accuracy: 0.7289
Micro F1 Score: 0.9001
Macro F1 Score: 0.8845

AUC-ROC Scores for each category:
Product Quality: 0.9061
Customer Service: 0.8497
Price: 0.9125
Functionality: 0.9376
Technical Issues: 0.8885
Shipping/Delivery: 0.8394
User Experience: 0.8911
Product Compatibility: 0.9325
Product Features: 0.8846
Others: 0.9293

Inference Time: 48.80 seconds

Saving model...
Model saved successfully!


In [6]:
from pyngrok import ngrok
ngrok.set_auth_token("2x5wSYflpa9b37pfbw5gQRHxgYR_WAWzKdUuM6S8UCQ4vrKg")

In [7]:
# 🧠 Main FastAPI App with BERT
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import nest_asyncio
from pyngrok import ngrok
import uvicorn

# Enable nested event loop for Colab
nest_asyncio.apply()

# Start FastAPI app
app = FastAPI()

# Load your model (make sure the path is correct or use HF hub model)
model_path = '/content/trained_model'  # Replace or mount from Drive if needed
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model.eval()

# Define category labels
categories = [
    'Product Quality', 'Customer Service', 'Price', 'Functionality',
    'Technical Issues', 'Shipping/Delivery', 'User Experience',
    'Product Compatibility', 'Product Features', 'Others'
]

# Request/Response schema
class PredictionRequest(BaseModel):
    text: str

class PredictionResponse(BaseModel):
    categories: dict

@app.post("/predict", response_model=PredictionResponse)
async def predict(request: PredictionRequest):
    # Tokenize input text
    inputs = tokenizer(
        request.text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.sigmoid(outputs.logits)
        predictions = (predictions > 0.5).float()
        print('pred:', predictions, 'out:', outputs);

    # Format result
    result = {
        categories[i]: bool(predictions[0][i])
        for i in range(len(categories))
    }

    return {"categories": result}

In [None]:
# 4️⃣ Run the server and expose it with ngrok (run this last and DON’T STOP it)
import nest_asyncio
import uvicorn

nest_asyncio.apply()

# Expose the app
public_url = ngrok.connect(8000)
print(f"🚀 Your FastAPI app is available at: {public_url}/docs")

# Start the app (this keeps running – don’t interrupt it)
uvicorn.run(app, host="0.0.0.0", port=8000)


INFO:     Started server process [1136]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


🚀 Your FastAPI app is available at: NgrokTunnel: "https://e0cc-34-16-232-204.ngrok-free.app" -> "http://localhost:8000"/docs
INFO:     149.34.244.158:0 - "GET / HTTP/1.1" 404 Not Found
INFO:     149.34.244.158:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO:     149.34.244.158:0 - "GET /docs HTTP/1.1" 200 OK
INFO:     149.34.244.158:0 - "GET /openapi.json HTTP/1.1" 200 OK
pred: tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]]) out: SequenceClassifierOutput(loss=None, logits=tensor([[-4.8173,  3.7862, -5.7159, -4.2057, -4.1174, -4.6465, -5.1989, -6.1129,
         -4.9765, -4.4091]]), hidden_states=None, attentions=None)
INFO:     149.34.244.158:0 - "POST /predict HTTP/1.1" 200 OK
pred: tensor([[1., 1., 0., 0., 0., 0., 0., 0., 0., 0.]]) out: SequenceClassifierOutput(loss=None, logits=tensor([[ 4.4158,  2.9925, -5.0758, -3.7354, -3.8456, -3.6787, -4.6405, -5.2514,
         -4.4273, -5.0812]]), hidden_states=None, attentions=None)
INFO:     149.34.244.158:0 - "POST /predict HTTP/1.1" 2