<a href="https://colab.research.google.com/github/SShresth7272/Python-Code/blob/main/Machine_Learning_Multi_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Multiclass Classification


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, load_iris, load_wine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_curve, auc, roc_auc_score,
    classification_report, confusion_matrix,
    precision_recall_curve, average_precision_score
)
from sklearn.preprocessing import LabelBinarizer, label_binarize
from sklearn.multiclass import OneVsRestClassifier




from sklearn.svm import SVC




import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")



In [12]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
print("One-vs-One (OvO) and One-vs-Rest (OvR) Algorithms Demo")
print("=" * 55)

def create_health_dataset():
    """Create a synthetic health diagnosis dataset"""
    print("\n1. HEALTH DOMAIN: Disease Diagnosis Prediction")
    print("-" * 45)

    # Simulate patient data: age, blood_pressure, cholesterol, blood_sugar, bmi
    X, y = make_classification(
        n_samples=1000, n_features=5, n_informative=5, n_redundant=0,
        n_classes=3, n_clusters_per_class=1, random_state=42
    )

    # Transform features to realistic ranges
    X[:, 0] = X[:, 0] * 30 + 40  # Age: 40-70
    X[:, 1] = X[:, 1] * 40 + 80  # BP: 80-160
    X[:, 2] = X[:, 2] * 100 + 150  # Cholesterol: 150-250
    X[:, 3] = X[:, 3] * 50 + 80   # Blood sugar: 80-180
    X[:, 4] = X[:, 4] * 10 + 25   # BMI: 25-35

    class_names = ['Healthy', 'Hypertension', 'Diabetes']
    feature_names = ['Age', 'Blood Pressure', 'Cholesterol', 'Blood Sugar', 'BMI']

    return X, y, class_names, feature_names, "Health Diagnosis"

def evaluate_strategies(X, y, class_names, feature_names, domain_name):
    """Evaluates OvO and OvR strategies on a given dataset"""
    print(f"\nEvaluating on {domain_name} dataset...")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # OvO Strategy
    print("Training OvO Classifier...")
    ovo_classifier = OneVsOneClassifier(LogisticRegression(solver='liblinear'))
    ovo_classifier.fit(X_train, y_train)
    ovo_score = ovo_classifier.score(X_test, y_test)
    print(f"OvO Accuracy: {ovo_score:.4f}")

    # OvR Strategy
    print("Training OvR Classifier...")
    ovr_classifier = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
    ovr_classifier.fit(X_train, y_train)
    ovr_score = ovr_classifier.score(X_test, y_test)
    print(f"OvR Accuracy: {ovr_score:.4f}")

    return ovo_score, ovr_score


# Main execution
if __name__ == "__main__":
    # Create datasets from different domains
    datasets = [
        create_health_dataset(),

    ]

    results = []

    # Evaluate each dataset
    for dataset in datasets:
        X, y, class_names, feature_names, domain_name = dataset
        ovo_score, ovr_score = evaluate_strategies(X, y, class_names, feature_names, domain_name)
        results.append({
            'Domain': domain_name,
            'OvO_Accuracy': ovo_score,
            'OvR_Accuracy': ovr_score,
            'Best_Strategy': 'OvO' if ovo_score >= ovr_score else 'OvR'
        })

    # Summary comparison
    print("\n" + "="*60)
    print("SUMMARY COMPARISON ACROSS DOMAINS")
    print("="*60)

    results_df = pd.DataFrame(results)
    print(results_df.to_string(index=False))

    # Overall analysis
    ovo_wins = sum(1 for r in results if r['Best_Strategy'] == 'OvO')
    ovr_wins = sum(1 for r in results if r['Best_Strategy'] == 'OvR')

    print(f"\nOverall Performance:")
    print(f"OvO was better in {ovo_wins} out of {len(datasets)} domains")
    print(f"OvR was better in {ovr_wins} out of {len(datasets)} domains")

One-vs-One (OvO) and One-vs-Rest (OvR) Algorithms Demo

1. HEALTH DOMAIN: Disease Diagnosis Prediction
---------------------------------------------

Evaluating on Health Diagnosis dataset...
Training OvO Classifier...
OvO Accuracy: 0.9400
Training OvR Classifier...
OvR Accuracy: 0.9167

SUMMARY COMPARISON ACROSS DOMAINS
          Domain  OvO_Accuracy  OvR_Accuracy Best_Strategy
Health Diagnosis          0.94      0.916667           OvO

Overall Performance:
OvO was better in 1 out of 1 domains
OvR was better in 0 out of 1 domains


In [7]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
print("One-vs-One (OvO) and One-vs-Rest (OvR) Algorithms Demo")
print("=" * 55)

def create_finance_dataset():
    """Create a synthetic financial risk assessment dataset"""
    print("\n2. FINANCE DOMAIN: Credit Risk Assessment")
    print("-" * 45)

    # Simulate financial data: income, credit_score, debt_to_income, employment_years, savings
    X, y = make_classification(
        n_samples=800, n_features=5, n_informative=5, n_redundant=0,
        n_classes=4, n_clusters_per_class=1, random_state=42
    )

    # Transform to realistic financial ranges
    X[:, 0] = X[:, 0] * 40000 + 50000  # Income: 50k-90k
    X[:, 1] = X[:, 1] * 200 + 600      # Credit score: 600-800
    X[:, 2] = (X[:, 2] * 0.3 + 0.2) * 100  # Debt-to-income: 20%-50%
    X[:, 3] = X[:, 3] * 10 + 5         # Employment years: 5-15
    X[:, 4] = X[:, 4] * 50000 + 20000  # Savings: 20k-70k

    class_names = ['Low Risk', 'Medium Risk', 'High Risk', 'Very High Risk']
    feature_names = ['Income', 'Credit Score', 'Debt-to-Income Ratio', 'Employment Years', 'Savings']

    return X, y, class_names, feature_names, "Credit Risk"

    # Main execution
if __name__ == "__main__":
    # Create datasets from different domains
    datasets = [
        create_finance_dataset(),

    ]

    results = []

    # Evaluate each dataset
    for dataset in datasets:
        X, y, class_names, feature_names, domain_name = dataset
        ovo_score, ovr_score = evaluate_strategies(X, y, class_names, feature_names, domain_name)
        results.append({
            'Domain': domain_name,
            'OvO_Accuracy': ovo_score,
            'OvR_Accuracy': ovr_score,
            'Best_Strategy': 'OvO' if ovo_score >= ovr_score else 'OvR'
        })

    # Summary comparison
    print("\n" + "="*60)
    print("SUMMARY COMPARISON ACROSS DOMAINS")
    print("="*60)

    results_df = pd.DataFrame(results)
    print(results_df.to_string(index=False))

    # Overall analysis
    ovo_wins = sum(1 for r in results if r['Best_Strategy'] == 'OvO')
    ovr_wins = sum(1 for r in results if r['Best_Strategy'] == 'OvR')

    print(f"\nOverall Performance:")
    print(f"OvO was better in {ovo_wins} out of {len(datasets)} domains")
    print(f"OvR was better in {ovr_wins} out of {len(datasets)} domains")

One-vs-One (OvO) and One-vs-Rest (OvR) Algorithms Demo

2. FINANCE DOMAIN: Credit Risk Assessment
---------------------------------------------

Evaluating on Credit Risk dataset...
Training OvO Classifier...
OvO Accuracy: 0.7542
Training OvR Classifier...
OvR Accuracy: 0.7083

SUMMARY COMPARISON ACROSS DOMAINS
     Domain  OvO_Accuracy  OvR_Accuracy Best_Strategy
Credit Risk      0.754167      0.708333           OvO

Overall Performance:
OvO was better in 1 out of 1 domains
OvR was better in 0 out of 1 domains


In [8]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
print("One-vs-One (OvO) and One-vs-Rest (OvR) Algorithms Demo")
print("=" * 55)



def create_marketing_dataset():
    """Create a synthetic customer segmentation dataset"""
    print("\n3. MARKETING DOMAIN: Customer Segmentation")
    print("-" * 45)

    # Simulate customer data: age, income, spending_score, loyalty_years, online_activity
    X, y = make_classification(
        n_samples=1200, n_features=5, n_informative=5, n_redundant=0,
        n_classes=3, n_clusters_per_class=1, random_state=42
    )

    # Transform to realistic marketing ranges
    X[:, 0] = X[:, 0] * 30 + 25       # Age: 25-55
    X[:, 1] = X[:, 1] * 50000 + 50000 # Income: 50k-100k
    X[:, 2] = X[:, 2] * 50 + 50       # Spending score: 50-100
    X[:, 3] = X[:, 3] * 5 + 1         # Loyalty years: 1-6
    X[:, 4] = X[:, 4] * 20 + 10       # Online activity: 10-30 hours/month

    class_names = ['Budget Shopper', 'Quality Seeker', 'Premium Customer']
    feature_names = ['Age', 'Annual Income', 'Spending Score', 'Loyalty Years', 'Online Activity']

    return X, y, class_names, feature_names, "Customer Segmentation"

    # Evaluate each dataset
    for dataset in datasets:
        X, y, class_names, feature_names, domain_name = dataset
        ovo_score, ovr_score = evaluate_strategies(X, y, class_names, feature_names, domain_name)
        results.append({
            'Domain': domain_name,
            'OvO_Accuracy': ovo_score,
            'OvR_Accuracy': ovr_score,
            'Best_Strategy': 'OvO' if ovo_score >= ovr_score else 'OvR'
        })

        # Main execution
if __name__ == "__main__":
    # Create datasets from different domains
    datasets = [
        create_marketing_dataset(),

    ]

    # Summary comparison
    print("\n" + "="*60)
    print("SUMMARY COMPARISON ACROSS DOMAINS")
    print("="*60)

    results_df = pd.DataFrame(results)
    print(results_df.to_string(index=False))

    # Overall analysis
    ovo_wins = sum(1 for r in results if r['Best_Strategy'] == 'OvO')
    ovr_wins = sum(1 for r in results if r['Best_Strategy'] == 'OvR')

    print(f"\nOverall Performance:")
    print(f"OvO was better in {ovo_wins} out of {len(datasets)} domains")
    print(f"OvR was better in {ovr_wins} out of {len(datasets)} domains")



One-vs-One (OvO) and One-vs-Rest (OvR) Algorithms Demo

3. MARKETING DOMAIN: Customer Segmentation
---------------------------------------------

SUMMARY COMPARISON ACROSS DOMAINS
     Domain  OvO_Accuracy  OvR_Accuracy Best_Strategy
Credit Risk      0.754167      0.708333           OvO

Overall Performance:
OvO was better in 1 out of 1 domains
OvR was better in 0 out of 1 domains


In [10]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
print("One-vs-One (OvO) and One-vs-Rest (OvR) Algorithms Demo")
print("=" * 55)

def create_education_dataset():
    """Create a synthetic student performance dataset"""
    print("\n4. EDUCATION DOMAIN: Student Performance Prediction")
    print("-" * 45)

    # Simulate student data: study_hours, attendance, previous_grades, extracurricular, parental_support
    X, y = make_classification(
        n_samples=900, n_features=5, n_informative=5, n_redundant=0,
        n_classes=3, n_clusters_per_class=1, random_state=42
    )

    # Transform to realistic education ranges
    X[:, 0] = X[:, 0] * 15 + 10       # Study hours: 10-25 hrs/week
    X[:, 1] = X[:, 1] * 30 + 70       # Attendance: 70-100%
    X[:, 2] = X[:, 2] * 20 + 70       # Previous grades: 70-90%
    X[:, 3] = X[:, 3] * 10 + 5        # Extracurricular: 5-15 hrs/month
    X[:, 4] = X[:, 4] * 4 + 3         # Parental support: 3-7 (scale 1-7)

    class_names = ['Low Performance', 'Average Performance', 'High Performance']
    feature_names = ['Study Hours', 'Attendance %', 'Previous Grades', 'Extracurricular Hours', 'Parental Support']

    return X, y, class_names, feature_names, "Student Performance"

def evaluate_strategies(X, y, class_names, feature_names, domain_name):
    """Evaluate OvO vs OvR strategies for a given dataset"""

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print(f"\nDomain: {domain_name}")
    print(f"Classes: {class_names}")
    print(f"Features: {feature_names}")
    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")

    # Create base classifier
    base_clf = LogisticRegression(random_state=42, max_iter=1000)

    # Apply both strategies
    ovo_clf = OneVsOneClassifier(base_clf)
    ovr_clf = OneVsRestClassifier(base_clf)

    # Train and evaluate
    ovo_clf.fit(X_train, y_train)
    ovr_clf.fit(X_train, y_train)

    # Predictions
    y_pred_ovo = ovo_clf.predict(X_test)
    y_pred_ovr = ovr_clf.predict(X_test)

    # Scores
    from sklearn.metrics import accuracy_score
    ovo_score = accuracy_score(y_test, y_pred_ovo)
    ovr_score = accuracy_score(y_test, y_pred_ovr)

    print(f"\nResults:")
    print(f"One-vs-One (OvO) Accuracy: {ovo_score:.4f}")
    print(f"One-vs-Rest (OvR) Accuracy: {ovr_score:.4f}")

    # Show detailed classification report for the better performing strategy
    best_strategy = "OvO" if ovo_score >= ovr_score else "OvR"
    best_clf = ovo_clf if ovo_score >= ovr_score else ovr_clf
    best_pred = y_pred_ovo if ovo_score >= ovr_score else y_pred_ovr

    from sklearn.metrics import classification_report
    print(f"\nBest Strategy: {best_strategy}")
    print("Detailed Classification Report:")
    print(classification_report(y_test, best_pred, target_names=class_names))

    return ovo_score, ovr_score

# Main execution
if __name__ == "__main__":
    # Create datasets from different domains
    datasets = [
        create_education_dataset()
    ]

    results = []

    # Evaluate each dataset
    for dataset in datasets:
        X, y, class_names, feature_names, domain_name = dataset
        ovo_score, ovr_score = evaluate_strategies(X, y, class_names, feature_names, domain_name)
        results.append({
            'Domain': domain_name,
            'OvO_Accuracy': ovo_score,
            'OvR_Accuracy': ovr_score,
            'Best_Strategy': 'OvO' if ovo_score >= ovr_score else 'OvR'
        })

    # Summary comparison
    print("\n" + "="*60)
    print("SUMMARY COMPARISON ACROSS DOMAINS")
    print("="*60)

    results_df = pd.DataFrame(results)
    print(results_df.to_string(index=False))

    # Overall analysis
    ovo_wins = sum(1 for r in results if r['Best_Strategy'] == 'OvO')
    ovr_wins = sum(1 for r in results if r['Best_Strategy'] == 'OvR')

    print(f"\nOverall Performance:")
    print(f"OvO was better in {ovo_wins} out of {len(datasets)} domains")
    print(f"OvR was better in {ovr_wins} out of {len(datasets)} domains")

One-vs-One (OvO) and One-vs-Rest (OvR) Algorithms Demo

4. EDUCATION DOMAIN: Student Performance Prediction
---------------------------------------------

Domain: Student Performance
Classes: ['Low Performance', 'Average Performance', 'High Performance']
Features: ['Study Hours', 'Attendance %', 'Previous Grades', 'Extracurricular Hours', 'Parental Support']
Training samples: 630
Test samples: 270

Results:
One-vs-One (OvO) Accuracy: 0.8630
One-vs-Rest (OvR) Accuracy: 0.8481

Best Strategy: OvO
Detailed Classification Report:
                     precision    recall  f1-score   support

    Low Performance       0.80      0.79      0.80        91
Average Performance       0.86      0.80      0.83        90
   High Performance       0.93      1.00      0.96        89

           accuracy                           0.86       270
          macro avg       0.86      0.86      0.86       270
       weighted avg       0.86      0.86      0.86       270


SUMMARY COMPARISON ACROSS DOMAINS
   