In [3]:
# Install required packages
!pip install -q transformers scikit-learn pandas torch tqdm

import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm.auto import tqdm
import numpy as np
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# DATA LOADING AND PREPARATION
# =============================================================================

def load_and_prepare_data():
    """Load the data and prepare it for classification"""
    print("📂 Loading data...")

    # Load both datasets
    df_slang = pd.read_csv("/content/slang_OpenSub_filtered.tsv", sep="\t", dtype=str)
    df_nonslang = pd.read_csv("/content/slang_OpenSub_negatives_filtered.tsv", sep="\t", dtype=str)

    # Add labels and confidence
    df_slang["label"] = 1  # slang
    df_slang["confidence"] = df_slang["ANNOTATOR_CONFIDENCE"].astype(int)

    df_nonslang["label"] = 0  # non-slang
    df_nonslang["confidence"] = 0  # non-slang doesn't have confidence ratings

    # Combine datasets
    df_all = pd.concat([df_slang, df_nonslang], ignore_index=True)

    # Extract sentences and labels
    sentences = df_all["SENTENCE"].tolist()
    true_labels = df_all["label"].tolist()
    confidence_levels = df_all["confidence"].tolist()

    print(f"✅ Data loaded successfully!")
    print(f"📊 Total sentences: {len(sentences)}")
    print(f"📊 Slang sentences: {sum(true_labels)}")
    print(f"📊 Non-slang sentences: {len(true_labels) - sum(true_labels)}")

    # Print confidence distribution for slang sentences
    slang_confidence = df_slang["confidence"].value_counts().sort_index()
    print(f"📊 Slang confidence distribution:")
    for conf, count in slang_confidence.items():
        print(f"   Confidence {conf}: {count} sentences")

    return sentences, true_labels, confidence_levels

# =============================================================================
# ZERO-SHOT CLASSIFICATION FUNCTIONS
# =============================================================================

def build_classifier(model_name):
    """Build tokenizer and model for zero-shot classification"""
    print(f"🔧 Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()
    print(f"🔧 Model loaded on: {device}")
    return tokenizer, model, device

def zero_shot_classify_sentence(tokenizer, model, device, sentence):
    """Classify a single sentence using zero-shot NLI approach"""
    candidate_labels = ["slang", "non-slang"]
    hypotheses = [f"This example is {label}." for label in candidate_labels]

    # Tokenize
    encoded = tokenizer(
        [sentence] * len(hypotheses),
        hypotheses,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256
    ).to(device)

    with torch.no_grad():
        outputs = model(**encoded)
        logits = outputs.logits
        # For MNLI models, entailment is typically the last class (index 2)
        entail_logits = logits[:, 2]
        probs = F.softmax(entail_logits, dim=0)
        best_idx = torch.argmax(probs).item()

    # Return 1 if "slang" is predicted, 0 if "non-slang"
    return 1 if candidate_labels[best_idx] == "slang" else 0

def evaluate_model(model_name, sentences, true_labels, confidence_levels):
    """Evaluate a single model and return detailed metrics"""
    print(f"\n{'='*60}")
    print(f"🔍 Evaluating: {model_name}")
    print(f"{'='*60}")

    # Build classifier
    tokenizer, model, device = build_classifier(model_name)

    # Run predictions
    predictions = []
    print("🚀 Running predictions...")
    for sentence in tqdm(sentences, desc="Classifying"):
        pred = zero_shot_classify_sentence(tokenizer, model, device, sentence)
        predictions.append(pred)

    # Calculate overall metrics
    overall_accuracy = accuracy_score(true_labels, predictions)

    # Calculate per-class metrics
    slang_indices = [i for i, label in enumerate(true_labels) if label == 1]
    nonslang_indices = [i for i, label in enumerate(true_labels) if label == 0]

    slang_predictions = [predictions[i] for i in slang_indices]
    slang_true = [true_labels[i] for i in slang_indices]
    slang_accuracy = accuracy_score(slang_true, slang_predictions)

    nonslang_predictions = [predictions[i] for i in nonslang_indices]
    nonslang_true = [true_labels[i] for i in nonslang_indices]
    nonslang_accuracy = accuracy_score(nonslang_true, nonslang_predictions)

    # Calculate confidence-based metrics for slang sentences
    confidence_metrics = {}
    for conf_level in [1, 2, 3]:
        conf_indices = [i for i in slang_indices if confidence_levels[i] == conf_level]
        if conf_indices:
            conf_predictions = [predictions[i] for i in conf_indices]
            conf_true = [true_labels[i] for i in conf_indices]
            conf_accuracy = accuracy_score(conf_true, conf_predictions)
            confidence_metrics[conf_level] = {
                'accuracy': conf_accuracy,
                'total_sentences': len(conf_indices),
                'correct_predictions': sum(conf_predictions)
            }
        else:
            confidence_metrics[conf_level] = {
                'accuracy': 0.0,
                'total_sentences': 0,
                'correct_predictions': 0
            }

    # Generate confusion matrix
    cm = confusion_matrix(true_labels, predictions)

    # Create detailed classification report
    class_report = classification_report(
        true_labels,
        predictions,
        target_names=['Non-Slang', 'Slang'],
        output_dict=True
    )

    results = {
        'model_name': model_name,
        'overall_accuracy': overall_accuracy,
        'slang_accuracy': slang_accuracy,
        'nonslang_accuracy': nonslang_accuracy,
        'confidence_metrics': confidence_metrics,
        'confusion_matrix': cm,
        'classification_report': class_report,
        'predictions': predictions
    }

    return results

# =============================================================================
# RESULTS DISPLAY FUNCTIONS
# =============================================================================

def print_model_results(results):
    """Print detailed results for a single model"""
    print(f"\n📈 RESULTS FOR: {results['model_name']}")
    print("-" * 50)

    print(f"🎯 Overall Accuracy: {results['overall_accuracy']:.4f}")
    print(f"🔥 Slang Accuracy: {results['slang_accuracy']:.4f}")
    print(f"❄️  Non-Slang Accuracy: {results['nonslang_accuracy']:.4f}")

    print(f"\n📊 CONFIDENCE-BASED ACCURACY (Slang sentences only):")
    for conf_level in [1, 2, 3]:
        metrics = results['confidence_metrics'][conf_level]
        if metrics['total_sentences'] > 0:
            print(f"   Confidence {conf_level}: {metrics['accuracy']:.4f} "
                  f"({metrics['correct_predictions']}/{metrics['total_sentences']} sentences)")
        else:
            print(f"   Confidence {conf_level}: No sentences found")

    print(f"\n📋 CONFUSION MATRIX:")
    cm = results['confusion_matrix']
    print(f"                Predicted")
    print(f"              Non-Slang  Slang")
    print(f"Actual Non-Slang  {cm[0,0]:4d}    {cm[0,1]:4d}")
    print(f"       Slang      {cm[1,0]:4d}    {cm[1,1]:4d}")

    print(f"\n📋 DETAILED CLASSIFICATION REPORT:")
    report = results['classification_report']
    print(f"              Precision  Recall  F1-Score  Support")
    print(f"Non-Slang        {report['Non-Slang']['precision']:.3f}   {report['Non-Slang']['recall']:.3f}    {report['Non-Slang']['f1-score']:.3f}     {report['Non-Slang']['support']}")
    print(f"Slang            {report['Slang']['precision']:.3f}   {report['Slang']['recall']:.3f}    {report['Slang']['f1-score']:.3f}     {report['Slang']['support']}")
    print(f"Macro Avg        {report['macro avg']['precision']:.3f}   {report['macro avg']['recall']:.3f}    {report['macro avg']['f1-score']:.3f}     {report['macro avg']['support']}")
    print(f"Weighted Avg     {report['weighted avg']['precision']:.3f}   {report['weighted avg']['recall']:.3f}    {report['weighted avg']['f1-score']:.3f}     {report['weighted avg']['support']}")

def create_summary_table(all_results):
    """Create a summary table comparing all models"""
    print(f"\n{'='*80}")
    print(f"📊 SUMMARY COMPARISON OF ALL MODELS")
    print(f"{'='*80}")

    print(f"{'Model':<25} {'Overall':<8} {'Slang':<8} {'Non-Slang':<10} {'Conf-1':<8} {'Conf-2':<8} {'Conf-3':<8}")
    print("-" * 80)

    for results in all_results:
        model_short = results['model_name'].split('/')[-1][:20]
        conf1_acc = results['confidence_metrics'][1]['accuracy']
        conf2_acc = results['confidence_metrics'][2]['accuracy']
        conf3_acc = results['confidence_metrics'][3]['accuracy']

        print(f"{model_short:<25} {results['overall_accuracy']:<8.4f} {results['slang_accuracy']:<8.4f} "
              f"{results['nonslang_accuracy']:<10.4f} {conf1_acc:<8.4f} {conf2_acc:<8.4f} {conf3_acc:<8.4f}")

def create_detailed_summary(all_results):
    """Create a detailed summary with additional insights"""
    print(f"\n{'='*80}")
    print(f"🔍 DETAILED ANALYSIS")
    print(f"{'='*80}")

    # Find best performing models
    best_overall = max(all_results, key=lambda x: x['overall_accuracy'])
    best_slang = max(all_results, key=lambda x: x['slang_accuracy'])
    best_nonslang = max(all_results, key=lambda x: x['nonslang_accuracy'])

    print(f"🏆 Best Overall Accuracy: {best_overall['model_name']} ({best_overall['overall_accuracy']:.4f})")
    print(f"🔥 Best Slang Detection: {best_slang['model_name']} ({best_slang['slang_accuracy']:.4f})")
    print(f"❄️  Best Non-Slang Detection: {best_nonslang['model_name']} ({best_nonslang['nonslang_accuracy']:.4f})")

    # Confidence analysis
    print(f"\n📊 CONFIDENCE LEVEL ANALYSIS:")
    for conf_level in [1, 2, 3]:
        accuracies = [r['confidence_metrics'][conf_level]['accuracy']
                     for r in all_results
                     if r['confidence_metrics'][conf_level]['total_sentences'] > 0]
        if accuracies:
            avg_acc = np.mean(accuracies)
            best_conf_model = max(all_results,
                                key=lambda x: x['confidence_metrics'][conf_level]['accuracy'])
            print(f"   Confidence {conf_level}: Avg={avg_acc:.4f}, "
                  f"Best={best_conf_model['model_name']} ({best_conf_model['confidence_metrics'][conf_level]['accuracy']:.4f})")

# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    """Main execution function"""
    print("🚀 Starting Slang Classification Analysis")
    print("=" * 60)

    # Load data
    sentences, true_labels, confidence_levels = load_and_prepare_data()

    # Define models to evaluate
    models = {
        "BART-large-MNLI": "facebook/bart-large-mnli",
        "BERT-base-MNLI": "textattack/bert-base-uncased-MNLI",
        "RoBERTa-large-MNLI": "roberta-large-mnli",
        "DeBERTa-v3-large-MNLI": "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
    }

    # Evaluate all models
    all_results = []
    for model_name, model_id in models.items():
        try:
            results = evaluate_model(model_id, sentences, true_labels, confidence_levels)
            results['model_name'] = model_name  # Use friendly name
            all_results.append(results)
            print_model_results(results)
        except Exception as e:
            print(f"❌ Error evaluating {model_name}: {str(e)}")
            continue

    # Create summary comparisons
    if all_results:
        create_summary_table(all_results)
        create_detailed_summary(all_results)

        # Save results to CSV for further analysis
        print(f"\n💾 Saving detailed results...")

        # Create a comprehensive results DataFrame
        results_data = []
        for results in all_results:
            base_data = {
                'model': results['model_name'],
                'overall_accuracy': results['overall_accuracy'],
                'slang_accuracy': results['slang_accuracy'],
                'nonslang_accuracy': results['nonslang_accuracy'],
            }

            # Add confidence-based metrics
            for conf_level in [1, 2, 3]:
                conf_metrics = results['confidence_metrics'][conf_level]
                base_data[f'confidence_{conf_level}_accuracy'] = conf_metrics['accuracy']
                base_data[f'confidence_{conf_level}_total'] = conf_metrics['total_sentences']
                base_data[f'confidence_{conf_level}_correct'] = conf_metrics['correct_predictions']

            # Add classification metrics
            base_data['slang_precision'] = results['classification_report']['Slang']['precision']
            base_data['slang_recall'] = results['classification_report']['Slang']['recall']
            base_data['slang_f1'] = results['classification_report']['Slang']['f1-score']
            base_data['nonslang_precision'] = results['classification_report']['Non-Slang']['precision']
            base_data['nonslang_recall'] = results['classification_report']['Non-Slang']['recall']
            base_data['nonslang_f1'] = results['classification_report']['Non-Slang']['f1-score']

            results_data.append(base_data)

        results_df = pd.DataFrame(results_data)
        results_df.to_csv('/content/slang_classification_results.csv', index=False)
        print(f"✅ Results saved to '/content/slang_classification_results.csv'")

        print(f"\n🎉 Analysis completed successfully!")
        print(f"📊 Evaluated {len(all_results)} models on {len(sentences)} sentences")

    else:
        print("❌ No models were successfully evaluated")

# Run the analysis
if __name__ == "__main__":
    main()

🚀 Starting Slang Classification Analysis
📂 Loading data...
✅ Data loaded successfully!
📊 Total sentences: 3617
📊 Slang sentences: 1424
📊 Non-slang sentences: 2193
📊 Slang confidence distribution:
   Confidence 1: 989 sentences
   Confidence 2: 361 sentences
   Confidence 3: 74 sentences

🔍 Evaluating: facebook/bart-large-mnli
🔧 Loading model: facebook/bart-large-mnli


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

🔧 Model loaded on: cuda
🚀 Running predictions...


Classifying:   0%|          | 0/3617 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.



📈 RESULTS FOR: BART-large-MNLI
--------------------------------------------------
🎯 Overall Accuracy: 0.6536
🔥 Slang Accuracy: 0.2247
❄️  Non-Slang Accuracy: 0.9321

📊 CONFIDENCE-BASED ACCURACY (Slang sentences only):
   Confidence 1: 0.1982 (196/989 sentences)
   Confidence 2: 0.2576 (93/361 sentences)
   Confidence 3: 0.4189 (31/74 sentences)

📋 CONFUSION MATRIX:
                Predicted
              Non-Slang  Slang
Actual Non-Slang  2044     149
       Slang      1104     320

📋 DETAILED CLASSIFICATION REPORT:
              Precision  Recall  F1-Score  Support
Non-Slang        0.649   0.932    0.765     2193.0
Slang            0.682   0.225    0.338     1424.0
Macro Avg        0.666   0.578    0.552     3617.0
Weighted Avg     0.662   0.654    0.597     3617.0

🔍 Evaluating: textattack/bert-base-uncased-MNLI
🔧 Loading model: textattack/bert-base-uncased-MNLI


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

🔧 Model loaded on: cuda
🚀 Running predictions...


Classifying:   0%|          | 0/3617 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]


📈 RESULTS FOR: BERT-base-MNLI
--------------------------------------------------
🎯 Overall Accuracy: 0.4075
🔥 Slang Accuracy: 0.9867
❄️  Non-Slang Accuracy: 0.0315

📊 CONFIDENCE-BASED ACCURACY (Slang sentences only):
   Confidence 1: 0.9869 (976/989 sentences)
   Confidence 2: 0.9861 (356/361 sentences)
   Confidence 3: 0.9865 (73/74 sentences)

📋 CONFUSION MATRIX:
                Predicted
              Non-Slang  Slang
Actual Non-Slang    69    2124
       Slang        19    1405

📋 DETAILED CLASSIFICATION REPORT:
              Precision  Recall  F1-Score  Support
Non-Slang        0.784   0.031    0.060     2193.0
Slang            0.398   0.987    0.567     1424.0
Macro Avg        0.591   0.509    0.314     3617.0
Weighted Avg     0.632   0.408    0.260     3617.0

🔍 Evaluating: roberta-large-mnli
🔧 Loading model: roberta-large-mnli


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


🔧 Model loaded on: cuda
🚀 Running predictions...


Classifying:   0%|          | 0/3617 [00:00<?, ?it/s]


📈 RESULTS FOR: RoBERTa-large-MNLI
--------------------------------------------------
🎯 Overall Accuracy: 0.4487
🔥 Slang Accuracy: 0.9221
❄️  Non-Slang Accuracy: 0.1414

📊 CONFIDENCE-BASED ACCURACY (Slang sentences only):
   Confidence 1: 0.9070 (897/989 sentences)
   Confidence 2: 0.9529 (344/361 sentences)
   Confidence 3: 0.9730 (72/74 sentences)

📋 CONFUSION MATRIX:
                Predicted
              Non-Slang  Slang
Actual Non-Slang   310    1883
       Slang       111    1313

📋 DETAILED CLASSIFICATION REPORT:
              Precision  Recall  F1-Score  Support
Non-Slang        0.736   0.141    0.237     2193.0
Slang            0.411   0.922    0.568     1424.0
Macro Avg        0.574   0.532    0.403     3617.0
Weighted Avg     0.608   0.449    0.368     3617.0

🔍 Evaluating: MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli
🔧 Loading model: MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli


tokenizer_config.json:   0%|          | 0.00/395 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/870M [00:00<?, ?B/s]

🔧 Model loaded on: cuda
🚀 Running predictions...


Classifying:   0%|          | 0/3617 [00:00<?, ?it/s]


📈 RESULTS FOR: DeBERTa-v3-large-MNLI
--------------------------------------------------
🎯 Overall Accuracy: 0.5939
🔥 Slang Accuracy: 0.0105
❄️  Non-Slang Accuracy: 0.9726

📊 CONFIDENCE-BASED ACCURACY (Slang sentences only):
   Confidence 1: 0.0131 (13/989 sentences)
   Confidence 2: 0.0055 (2/361 sentences)
   Confidence 3: 0.0000 (0/74 sentences)

📋 CONFUSION MATRIX:
                Predicted
              Non-Slang  Slang
Actual Non-Slang  2133      60
       Slang      1409      15

📋 DETAILED CLASSIFICATION REPORT:
              Precision  Recall  F1-Score  Support
Non-Slang        0.602   0.973    0.744     2193.0
Slang            0.200   0.011    0.020     1424.0
Macro Avg        0.401   0.492    0.382     3617.0
Weighted Avg     0.444   0.594    0.459     3617.0

📊 SUMMARY COMPARISON OF ALL MODELS
Model                     Overall  Slang    Non-Slang  Conf-1   Conf-2   Conf-3  
--------------------------------------------------------------------------------
BART-large-MNLI     