<a href="https://colab.research.google.com/github/UznetDev/Data-science-home-work/blob/main/25_Okt_2024_home_work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as ana
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, brier_score_loss
from sklearn.calibration import calibration_curve, CalibrationDisplay, CalibratedClassifierCV
import warnings
warnings.filterwarnings('ignore')

## Load data

In [None]:
# Read the training table from the working directory, then preview only its
# first row so the column layout is visible without dumping the whole frame.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
train_df.iloc[:1]

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55.0,170.0,80.0,92.0,1.2,0.8,1.0,1.0,129.0,74.0,82.0,175.0,58.0,49.0,114.0,15.4,1.0,0.9,20.0,23.0,13.0,0.0,0.0


## Split data into X/y and train/test

In [None]:
# The data preview shows two bookkeeping columns next to the measurements:
# 'Unnamed: 0' (presumably the index that was written into the CSV) and 'id'.
# They appear to be row identifiers rather than measurements, so they are
# removed together with the target; otherwise the tree models can split on
# them. errors='ignore' keeps the cell working if either column is absent.
X = train_df.drop(columns=['smoking', 'Unnamed: 0', 'id'], errors='ignore')
y = train_df['smoking']

# Stratify on the target so train and test keep the same class balance;
# the fixed seed makes the split repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

## Functions

In [44]:
def plot_calibration_curve(scores, y_true=None, n_bins=10):
    """Show a reliability diagram with one curve per model.

    Parameters
    ----------
    scores : dict of lists
        Parallel lists keyed by 'Model', 'ROC AUC Score', 'Brier Score' and
        'y_prob'; position i in every list refers to the same model.
        Each 'y_prob' entry is passed to ``calibration_curve`` as the
        predicted probabilities. Extra keys (e.g. 'score') are ignored.
    y_true : array-like, optional
        True labels aligned with every 'y_prob' entry. When omitted, the
        notebook-level ``y_test`` is used, which is what existing calls
        rely on.
    n_bins : int, default 10
        Number of bins forwarded to ``calibration_curve``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    if y_true is None:
        # Backward-compatible fallback for calls that pass only `scores`.
        y_true = y_test

    fig = go.Figure()

    model_rows = zip(
        scores['Model'],
        scores['ROC AUC Score'],
        scores['Brier Score'],
        scores['y_prob'],
    )
    for name, roc_score, brier_score, y_prob in model_rows:
        fraction_of_positives, mean_predicted_value = calibration_curve(
            y_true, y_prob, n_bins=n_bins
        )

        # The legend carries both metrics so the curves can be compared
        # without looking up a separate table.
        fig.add_trace(go.Scatter(
            x=mean_predicted_value,
            y=fraction_of_positives,
            mode='lines+markers',
            name=f"{name} (ROC AUC: {roc_score:.2f}, Brier Score: {brier_score:.2f})"
        ))

    # Diagonal reference: predicted probability equals observed frequency.
    fig.add_trace(go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode='lines',
        line=dict(dash='dash', color='black'),
        name='Perfectly Calibrated'
    ))

    fig.update_layout(
        title="Brier Line Plot (Reliability Diagram) with ROC AUC and Brier Scores",
        xaxis_title="Mean Predicted Probability",
        yaxis_title="Fraction of Positives",
        legend_title="Models",
        showlegend=True
    )
    fig.show()

## Models

### All models

#### Without calibration

In [45]:
# Baseline: the three classifiers with default settings and no calibration.
# The tree-based estimators are randomised, so they get the same seed as the
# train/test split to keep the scores stable across Restart & Run All.
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=1),
    'RandomForestClassifier': RandomForestClassifier(random_state=1)
}

# Parallel lists: index i of every list belongs to the same model.
scores = {
    'Model': [],
    'ROC AUC Score': [],
    'Brier Score': [],
    'score': [],
    'y_prob': []
}

for name, model in models.items():
    model.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the positive class.
    y_prob = model.predict_proba(X_test)[:, 1]

    roc_score = roc_auc_score(y_test, y_prob)
    brier_score = brier_score_loss(y_test, y_prob)

    scores['Model'].append(name)
    scores['ROC AUC Score'].append(roc_score)
    scores['Brier Score'].append(brier_score)
    scores['score'].append(model.score(X_test, y_test))
    scores['y_prob'].append(y_prob)

In [46]:
# Reliability diagram for the uncalibrated models scored in the previous cell.
plot_calibration_curve(scores)

#### Calibrated

##### isotonic

In [50]:
# Fresh, unfitted estimators for the isotonic-calibration run. The tree-based
# ones are seeded so the comparison is repeatable across kernel restarts.
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=1),
    'RandomForestClassifier': RandomForestClassifier(random_state=1)
}

# Parallel lists: index i of every list belongs to the same model.
scores = {
    'Model': [],
    'ROC AUC Score': [],
    'Brier Score': [],
    'score': [],
    'y_prob': []
}

stf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

for name, model in models.items():

    calibrated_model = CalibratedClassifierCV(model, method='isotonic', cv=stf)
    # Fit exactly once: a second identical fit only repeats the 5-fold
    # training and overwrites the first result.
    calibrated_model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    y_prob = calibrated_model.predict_proba(X_test)[:, 1]

    roc_score = roc_auc_score(y_test, y_prob)
    brier_score = brier_score_loss(y_test, y_prob)

    scores['Model'].append(name)
    scores['ROC AUC Score'].append(roc_score)
    scores['Brier Score'].append(brier_score)
    scores['score'].append(calibrated_model.score(X_test, y_test))
    scores['y_prob'].append(y_prob)

In [51]:
# Reliability diagram for the isotonic-calibrated models scored in the previous cell.
plot_calibration_curve(scores)

##### sigmoid

In [52]:
# Fresh, unfitted estimators for the sigmoid (Platt) calibration run. The
# tree-based ones are seeded so the comparison is repeatable across restarts.
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=1),
    'RandomForestClassifier': RandomForestClassifier(random_state=1)
}

# Parallel lists: index i of every list belongs to the same model.
scores = {
    'Model': [],
    'ROC AUC Score': [],
    'Brier Score': [],
    'score': [],
    'y_prob': []
}

stf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

for name, model in models.items():

    calibrated_model = CalibratedClassifierCV(model, method='sigmoid', cv=stf)
    # Fit exactly once: a second identical fit only repeats the 5-fold
    # training and overwrites the first result.
    calibrated_model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    y_prob = calibrated_model.predict_proba(X_test)[:, 1]

    roc_score = roc_auc_score(y_test, y_prob)
    brier_score = brier_score_loss(y_test, y_prob)

    scores['Model'].append(name)
    scores['ROC AUC Score'].append(roc_score)
    scores['Brier Score'].append(brier_score)
    scores['score'].append(calibrated_model.score(X_test, y_test))
    scores['y_prob'].append(y_prob)

In [53]:
# Reliability diagram for the sigmoid-calibrated models scored in the previous cell.
plot_calibration_curve(scores)

#### Conclusion: calibration didn't help us this time, and the best model is `RandomForestClassifier`, with the best *ROC AUC* and *Brier Score*.