### Step 1: Environment Setup & Imports

This cell installs necessary libraries including Transformers and Scikit-Learn, imports required modules for data processing and modeling, and sets random seeds to ensure reproducibility of results.

In [None]:
!pip install -Uq protobuf==3.20.3
!pip install -Uq transformers==4.48.0 accelerate==0.27.0 peft==0.8.2 sentencepiece emoji datasets scikit-learn seaborn

import gc
import html
import json
import os
import random
import re
import warnings

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch

# Suppress every Python warning for the rest of the session. Note that this
# also hides deprecation notices emitted by the libraries imported below.
warnings.filterwarnings("ignore")
# Set before the transformers/datasets imports that follow, so the variable is
# already in the environment when those libraries load.
# NOTE(review): presumably this silences the tokenizers fork/parallelism
# warning during Dataset.map — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    accuracy_score, 
    precision_recall_fscore_support, 
    confusion_matrix, 
    classification_report,
    matthews_corrcoef
)
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from datasets import Dataset

# --- Run configuration -------------------------------------------------------
SAMPLE_SIZE = 1_000_000                          # total rows drawn from the dataset
MODEL_CHECKPOINT = "microsoft/deberta-v3-base"   # Hugging Face checkpoint id
RANDOM_SEED = 42                                 # seed shared by every stochastic step


def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    Args:
        seed: Integer seed applied to each generator (defaults to 42).
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # The CUDA generators are only seeded when a GPU is actually available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(RANDOM_SEED)
print("‚úÖ System Initialized. Ready for 1M Run.")

### Step 2: Data Loading & Sampling

This cell loads the Sentiment140 dataset, performs stratified sampling to create a balanced dataset of 1 million samples (500k positive, 500k negative), maps sentiment labels to binary values, and optimizes memory usage by deleting unused dataframes.

In [None]:
print(f">> Loading Data (Target: {SAMPLE_SIZE:,} rows)...")


def _find_training_csv(root):
    """Return the first file under `root` whose name contains 'training' and 'csv'.

    The search stops at the first match (in os.walk order), so a later
    directory can no longer overwrite an earlier hit. Returns "" when nothing
    matches.
    """
    for dirname, _, filenames in os.walk(root):
        for filename in filenames:
            if 'training' in filename and 'csv' in filename:
                return os.path.join(dirname, filename)
    return ""


INPUT_FILE = _find_training_csv('/kaggle/input')

if not INPUT_FILE:
    raise FileNotFoundError("‚ùå ERROR: Add 'sentiment140' dataset to Kaggle!")

# Read errors are deliberately left to propagate: the full traceback is more
# useful than a one-line message, and calling exit() inside a notebook would
# tear down the kernel instead of just failing this cell.
df_full = pd.read_csv(
    INPUT_FILE,
    encoding='latin-1',
    header=None,
    names=['sentiment', 'id', 'date', 'flag', 'user', 'text']
)

# Raw labels in the file: 0 = negative, 4 = positive.
neg_df = df_full[df_full['sentiment'] == 0]
pos_df = df_full[df_full['sentiment'] == 4]

half_sample = SAMPLE_SIZE // 2

# Fail early with an explicit message when a class cannot supply its half of
# the requested sample (sampling is done without replacement).
class_counts = {"negative": len(neg_df), "positive": len(pos_df)}
for class_name, available in class_counts.items():
    if available < half_sample:
        raise ValueError(
            f"Requested {half_sample:,} {class_name} rows but only "
            f"{available:,} are available; lower SAMPLE_SIZE."
        )

neg_sample = neg_df.sample(n=half_sample, random_state=RANDOM_SEED)
pos_sample = pos_df.sample(n=half_sample, random_state=RANDOM_SEED)

# Combine the two balanced halves and shuffle so the classes are interleaved.
df = pd.concat([neg_sample, pos_sample])
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Remap the raw labels to binary targets: 0 -> 0 (negative), 4 -> 1 (positive).
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})

# Drop the large intermediates before the memory-hungry steps that follow.
del df_full, neg_df, pos_df, neg_sample, pos_sample
gc.collect()

print(f"‚úÖ Data Loaded. Shape: {df.shape}")
print(df['sentiment'].value_counts())

### Step 3: Data Preprocessing & Splitting

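This cell defines a text-cleaning function that decodes HTML entities (e.g. `&amp;` → `&`), converts emojis to their text aliases so their meaning is retained, and strips URLs and @user mentions. It then applies this cleaning to the dataset, holds out the last 10,000 rows as a test set for final evaluation, and splits the remaining rows into training (95%) and validation (5%) sets, stratified by label.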

In [None]:
print(">> Applying Advanced Sanitization (Demojization + Cleaning)...")

# URLs: an explicit http(s):// scheme, or a "www." prefix that starts at a word
# boundary. Requiring the boundary and the dot keeps ordinary words such as
# "awwww" intact (the previous `www\S+` alternative truncated them to "a").
URL_PATTERN = re.compile(r'https?://\S+|\bwww\.\S+', re.IGNORECASE)
# Mentions: an "@handle" that is not preceded by a word character, so e-mail
# addresses like "joe@example.com" are left untouched.
USER_PATTERN = re.compile(r'(?<!\w)@\w+')

def clean_text(text):
    """Normalize one raw tweet for modelling.

    Steps, in order: coerce to str, decode HTML entities, replace emojis with
    their text aliases via `emoji.demojize`, remove URLs, remove @mentions,
    and strip leading/trailing whitespace. Whitespace left inside the text by
    the removals is not collapsed.

    Args:
        text: The raw tweet; any object is accepted and passed through str().

    Returns:
        The cleaned string (possibly empty).
    """
    text = html.unescape(str(text))
    text = emoji.demojize(text)
    text = URL_PATTERN.sub('', text)
    text = USER_PATTERN.sub('', text)
    return text.strip()

# Sanitize every tweet; the 'text' column is overwritten with the cleaned text.
df['text'] = df['text'].map(clean_text)

# Hold out the final rows of the frame as the test set; everything before them
# is shared between training and validation.
test_size = 10000
test_df = df.tail(test_size).copy()
train_val_df = df.iloc[:-test_size]

# 95/5 train/validation split, stratified on the label so both parts keep the
# same class balance.
train_val_text = train_val_df['text']
train_val_label = train_val_df['sentiment']
X_train, X_val, y_train, y_val = train_test_split(
    train_val_text,
    train_val_label,
    stratify=train_val_label,
    test_size=0.05,
    random_state=RANDOM_SEED,
)

print(f"‚úÖ Preprocessing Done.")
print(f"   Train Corpus: {len(X_train):,}")
print(f"   Val Corpus:   {len(X_val):,}")
print(f"   Test Corpus:  {len(test_df):,}")

### Step 4: Baseline Model Training

This cell establishes a baseline performance metric using a TF-IDF vectorizer and a Logistic Regression classifier. This provides a benchmark to compare against the deep learning model's performance.

In [None]:
print(">> Training TF-IDF + Logistic Regression Baseline (1M Rows)...")

# Unigram + bigram TF-IDF features (vocabulary capped at 25,000 terms) feeding
# a liblinear logistic-regression classifier.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), max_features=25000)
logreg_step = LogisticRegression(solver='liblinear', max_iter=1000)
baseline_pipeline = make_pipeline(tfidf_step, logreg_step)

# Fit on the training split only, then score on the held-out test rows.
baseline_pipeline.fit(X_train, y_train)
base_preds = baseline_pipeline.predict(test_df['text'])

baseline_truth = test_df['sentiment']
base_acc = accuracy_score(baseline_truth, base_preds)
base_mcc = matthews_corrcoef(baseline_truth, base_preds)

print(f"\n BASELINE RESULTS:")
print(f"   Accuracy: {base_acc:.4f}")
print(f"   MCC:      {base_mcc:.4f}")
print(">> Note: DeBERTa is expected to significantly outperform this.")

### Step 5: Tokenization

This cell initializes the DeBERTa V3 tokenizer and processes the text data by truncating sequences to a maximum length. It then converts the pandas DataFrames into Hugging Face Datasets and prepares a data collator for efficient batching.

In [None]:
print(f">> Loading Tokenizer: {MODEL_CHECKPOINT}...")

# use_fast=False asks AutoTokenizer for the non-"fast" tokenizer class of this
# checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_CHECKPOINT,
    use_fast=False,
)

def tokenize_function(examples):
    """Tokenize the 'text' entry of `examples` with the notebook-level tokenizer.

    Args:
        examples: Mapping with a 'text' entry; a whole batch of texts when
            called through `Dataset.map(..., batched=True)`.

    Returns:
        Whatever `tokenizer(...)` returns for those texts, truncated to at most
        128 tokens. No padding is applied here.
    """
    batch_texts = examples['text']
    encoded = tokenizer(batch_texts, truncation=True, max_length=128)
    return encoded

def _as_hf_dataset(texts, labels):
    """Wrap parallel text/label columns in a Hugging Face Dataset."""
    return Dataset.from_dict({'text': texts, 'label': labels})


train_dset = _as_hf_dataset(X_train, y_train)
val_dset = _as_hf_dataset(X_val, y_val)
test_dset_hf = _as_hf_dataset(test_df['text'], test_df['sentiment'])

print(">> Tokenizing 1M rows (This may take 2-3 mins)...")
# Tokenize the three splits in train -> val -> test order, batch-wise.
train_tokenized, val_tokenized, test_tokenized = [
    split.map(tokenize_function, batched=True)
    for split in (train_dset, val_dset, test_dset_hf)
]

# Padding is deferred to batch time via the collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print("‚úÖ Data ready for Tensor Cores.")

### Step 6: Model Configuration

This cell loads the pre-trained DeBERTa V3 model for sequence classification. It configures the training parameters, including batch size, learning rate, and evaluation strategy, and initializes the Trainer object to manage the training process.

In [None]:
# Binary classification head on top of the pre-trained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)

# Evaluation and checkpointing share one cadence: load_best_model_at_end needs
# a saved checkpoint for every evaluated step it might pick as "best".
EVAL_SAVE_STEPS = 3000

training_args = TrainingArguments(
    output_dir='/kaggle/working/results',

    # Schedule: 32 samples x 4 accumulation steps = 128 samples per optimizer
    # step on each device.
    num_train_epochs=2,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=64,

    # Optimization.
    learning_rate=2e-5,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,

    # Evaluate and checkpoint on the same step interval; keep disk usage low.
    eval_strategy="steps",
    eval_steps=EVAL_SAVE_STEPS,
    save_strategy="steps",
    save_steps=EVAL_SAVE_STEPS,
    save_total_limit=1,

    # Restore the checkpoint with the lowest validation loss after training.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",

    # Mixed precision only when a CUDA device exists; an unconditional
    # fp16=True fails on a CPU-only runtime before training even starts.
    fp16=torch.cuda.is_available(),
    report_to="none",
    seed=RANDOM_SEED
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    # `processing_class` replaces the deprecated `tokenizer` argument in the
    # transformers version pinned by this notebook (4.48.0).
    processing_class=tokenizer,
    data_collator=data_collator,
    # Stop when eval_loss has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)
print("‚úÖ Trainer Configured.")

### Step 7: Model Training

This cell initiates the model training process using the configured Trainer. After training completes, it saves the fine-tuned model and tokenizer to the specified directory for future use.

In [None]:
print(" STARTING 1 MILLION ROW TRAINING LOOP...")
print(">> WARNING: This will take ~4 hours. Keep the tab active!")

trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side in one
# directory (model first, then tokenizer).
save_path = "/kaggle/working/saved_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)
print(f" Enterprise Model Saved to {save_path}")

### Step 8: Model Evaluation

This cell generates predictions on the held-out test set using the fine-tuned model. It calculates key performance metrics such as Accuracy, F1 Score, Precision, Recall, and Matthews Correlation Coefficient (MCC) to evaluate the model's effectiveness.

In [None]:
print(">> Running Inference on Held-out Test Set...")
# trainer.predict returns an output object; its `.predictions` field holds the
# per-class scores, and argmax over the last axis gives the predicted label.
preds_logits = trainer.predict(test_tokenized)
preds = np.argmax(preds_logits.predictions, axis=-1)

acc = accuracy_score(test_df['sentiment'], preds)
precision, recall, f1, _ = precision_recall_fscore_support(test_df['sentiment'], preds, average='binary')
mcc = matthews_corrcoef(test_df['sentiment'], preds)

print(f"\nüèÜ FINAL RESULTS (1 MILLION DATASET)")
print(f"{'='*30}")
print(f"Baseline MCC:      {base_mcc:.4f}")
# The "+" format spec prints the real sign of the delta; the previous
# hard-coded "+" produced "+-0.0123" whenever DeBERTa scored below the baseline.
print(f"DeBERTa V3 MCC:    {mcc:.4f}  (Improvement: {mcc - base_mcc:+.4f})")
print(f"{'-'*30}")
print(f"Accuracy:          {acc:.4f}")
print(f"F1 Score:          {f1:.4f}")
print(f"Precision:         {precision:.4f}")
print(f"Recall:            {recall:.4f}")
print(f"{'='*30}")

# Store plain Python floats so the JSON output does not depend on how the
# metric functions' scalar return types are serialized.
metrics = {
    "accuracy": float(acc),
    "f1": float(f1),
    "mcc": float(mcc),
    "baseline_mcc": float(base_mcc),
}
with open("final_metrics.json", "w") as f:
    json.dump(metrics, f)

### Step 9: Visualization & Error Analysis

This cell visualizes the model's performance using a confusion matrix heatmap. It also performs a qualitative error analysis by displaying examples of false positives and false negatives to understand where the model misclassified samples.

In [None]:
# --- Confusion matrix heatmap (rows: true label, columns: predicted label) ---
cm = confusion_matrix(test_df['sentiment'], preds)
class_names = ['Negative', 'Positive']

plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Purples',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('DeBERTa V3 (1M) Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('confusion_matrix.png')
plt.show()

# --- Qualitative look at the misclassified test rows -------------------------
print(f"\nüîç QUALITATIVE ERROR ANALYSIS")
test_df['pred'] = preds
is_wrong = test_df['sentiment'] != test_df['pred']
errors = test_df[is_wrong]

print(f"Total Errors: {len(errors)}")

# Every row in `errors` is misclassified, so pred == 1 means the true label is 0.
print("\n[False Positives] Predicted Positive, Actually Negative:")
false_positive_texts = errors.loc[errors['pred'] == 1, 'text']
for txt in false_positive_texts.head(3):
    print(f" üî¥ {txt}")

# Likewise, pred == 0 among the errors means the true label is 1.
print("\n[False Negatives] Predicted Negative, Actually Positive:")
false_negative_texts = errors.loc[errors['pred'] == 0, 'text']
for txt in false_negative_texts.head(3):
    print(f" üîµ {txt}")

### Step 10: Deployment Dashboard

To run the interactive dashboard, execute the following command in your terminal:
`python app.py`

This will launch a local Gradio web interface where you can test the model with custom inputs.