In [1]:
import pandas as pd

# Source of the Pima Indians Diabetes dataset (CSV hosted on GitHub).
DATA_URL = 'https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv'

# Read the CSV straight from the URL into a DataFrame.
data = pd.read_csv(DATA_URL)

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Target: the 'Outcome' column.
y = data['Outcome']
# Predictors: every remaining column.
X = data.drop(columns=['Outcome'])


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)


In [4]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier; the iteration cap is raised to 1000.
lr_params = {'max_iter': 1000}
model = LogisticRegression(**lr_params)

# Fit on the training portion only.
model.fit(X_train, y_train)


In [5]:
# predict_proba returns one column per class; column 1 is kept as the score
# used for the ROC analysis.
class_probabilities = model.predict_proba(X_test)
y_scores = class_probabilities[:, 1]


In [6]:
# Display the test-set scores (column 1 of predict_proba) computed in the previous cell.
y_scores

array([0.04957932, 0.17463484, 0.09367763, 0.25510784, 0.63546833,
       0.11680904, 0.06569988, 0.42189687, 0.0487055 , 0.57569143,
       0.33867418, 0.41317289, 0.69847511, 0.19970575, 0.02002726,
       0.82458296, 0.86655291, 0.03103208, 0.25530347, 0.89486657,
       0.9523918 , 0.83469859, 0.11751157, 0.44676658, 0.08928539,
       0.06890459, 0.65130532, 0.41190652, 0.17870247, 0.28701003,
       0.24548669, 0.43902171, 0.00943896, 0.24267043, 0.35150976,
       0.9616    , 0.3437475 , 0.80929238, 0.29380048, 0.05088382,
       0.18193177, 0.08676785, 0.42239476, 0.18745764, 0.03012399,
       0.0395244 , 0.2474972 , 0.42776404, 0.10534941, 0.37963509,
       0.99352847, 0.10660251, 0.39093034, 0.76873514, 0.36857954,
       0.45769798, 0.95032025, 0.41999076, 0.39131141, 0.04379634,
       0.37279276, 0.90128847, 0.88174336, 0.88840373, 0.33088018,
       0.06928764, 0.95265535, 0.20053913, 0.26050497, 0.35454728,
       0.10359023, 0.08170279, 0.45052518, 0.0922266 , 0.08654

In [7]:
from sklearn.metrics import roc_curve

# roc_curve returns a (fpr, tpr, thresholds) triple for the test-set scores.
roc_result = roc_curve(y_test, y_scores)
fpr, tpr, thresholds = roc_result

In [8]:
# Display the thresholds returned by roc_curve. Note that the first value in
# the output is greater than 1, i.e. above every predicted probability.
thresholds

array([1.99352847e+00, 9.93528470e-01, 9.52655353e-01, 9.52391802e-01,
       8.24582961e-01, 7.68735136e-01, 7.61710686e-01, 6.83824505e-01,
       6.75345243e-01, 6.74059920e-01, 6.51305321e-01, 6.50528225e-01,
       6.35468326e-01, 5.86335863e-01, 5.75691429e-01, 5.69682440e-01,
       5.25144330e-01, 4.70188931e-01, 4.61170583e-01, 4.50525177e-01,
       4.39021711e-01, 4.38728574e-01, 4.22394762e-01, 4.19990764e-01,
       4.13172889e-01, 4.11906517e-01, 4.04992543e-01, 3.79635094e-01,
       3.68579543e-01, 3.10300744e-01, 3.08816002e-01, 2.77882255e-01,
       2.72173439e-01, 2.55303468e-01, 2.55107844e-01, 2.47497199e-01,
       2.45486693e-01, 2.36819332e-01, 2.28797971e-01, 1.81931772e-01,
       1.78702467e-01, 1.74634842e-01, 1.73288391e-01, 1.46518300e-01,
       1.40797877e-01, 1.16809044e-01, 1.14939679e-01, 1.10067250e-01,
       1.06602513e-01, 8.92853857e-02, 8.67678530e-02, 2.00272606e-02,
       1.93209074e-02, 1.61018630e-03])

In [9]:
import plotly.graph_objects as go
import numpy as np


# Plot the ROC curve computed by roc_curve (fpr, tpr, thresholds) and annotate
# a sparse subset of its points with the threshold that produced them.

# Trace for the ROC curve itself
trace0 = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name='ROC curve'
)

# Only label every nth point to avoid cluttering
n = 10
indices = np.arange(len(thresholds)) % n == 0  # Choose indices where index mod n is 0
# thresholds[0] is the artificial "predict nothing" entry that roc_curve
# prepends (greater than every score), so labelling it would show a value that
# is not a usable probability cut-off. Leave it out of the annotations.
indices[0] = False

trace1 = go.Scatter(
    x=fpr[indices],
    y=tpr[indices],
    mode='markers+text',
    name='Threshold points',
    text=[f"Thr={thr:.2f}" for thr in thresholds[indices]],
    textposition='top center'
)


# Diagonal reference line: the expected curve of a random classifier
trace2 = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random (Area = 0.5)',
    line=dict(dash='dash')
)

# Use a dedicated name for the traces: rebinding `data` here would overwrite
# the DataFrame loaded at the top of the notebook and break re-running the
# earlier cells that read from it.
roc_traces = [trace0, trace1, trace2]

# Define layout with equal width and height so the plot area is square
layout = go.Layout(
    title='Receiver Operating Characteristic',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    autosize=False,
    width=800,
    height=800,
    showlegend=False
)

# Define figure and add the traces
fig = go.Figure(data=roc_traces, layout=layout)

# Show figure
fig.show()


In [10]:
# Pick the cut-off that maximises Youden's J statistic (TPR - FPR).
# Relies on the fpr, tpr and thresholds arrays returned by roc_curve.
youden_j = tpr - fpr
optimal_idx = youden_j.argmax()
optimal_threshold = thresholds[optimal_idx]
print("Optimal threshold is:", optimal_threshold)


Optimal threshold is: 0.36857954341437343


In [11]:
import plotly.graph_objects as go
import numpy as np
from sklearn.metrics import roc_auc_score

# Plot the ROC curve again, this time with the AUC shown in the legend.

# Recompute the ROC points here so this cell does not depend on which of the
# earlier plotting cells has been run.
fpr, tpr, thresholds = roc_curve(y_test, y_scores)

# Calculate the AUC (Area Under the Curve)
roc_auc = roc_auc_score(y_test, y_scores)

# Trace for the ROC curve, with the AUC in its legend entry
trace0 = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name=f'ROC curve (Area = {roc_auc:.2f})'
)

# Only label every nth point to avoid cluttering
n = 10
indices = np.arange(len(thresholds)) % n == 0  # Choose indices where index mod n is 0
# thresholds[0] is the artificial "predict nothing" entry that roc_curve
# prepends (greater than every score), so labelling it would show a value that
# is not a usable probability cut-off. Leave it out of the annotations.
indices[0] = False

trace1 = go.Scatter(
    x=fpr[indices],
    y=tpr[indices],
    mode='markers+text',
    name='Threshold points',
    text=[f"Thr={thr:.2f}" for thr in thresholds[indices]],
    textposition='top center'
)

# Diagonal reference line: the expected curve of a random classifier
trace2 = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random (Area = 0.5)',
    line=dict(dash='dash')
)

# Use a dedicated name for the traces: rebinding `data` here would overwrite
# the DataFrame loaded at the top of the notebook and break re-running the
# earlier cells that read from it.
roc_traces = [trace0, trace1, trace2]

# Define layout with equal width and height so the plot area is square
layout = go.Layout(
    title='Receiver Operating Characteristic',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    autosize=False,
    width=800,
    height=800,
    showlegend=True
)

# Define figure and add the traces
fig = go.Figure(data=roc_traces, layout=layout)

# Show figure
fig.show()


In [None]:
import numpy as np
import plotly.graph_objects as go
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Compare logistic regression and an SVM on the same train/test split by
# overlaying their ROC curves.
# Requires X_train, X_test, y_train, y_test from the train_test_split cell.

# SVM requires feature scaling for better performance.
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression model (trained on the unscaled features)
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_scores = lr_model.predict_proba(X_test)[:,1]

# SVM model (trained on the scaled features).
# probability=True fits an internal cross-validated calibration that shuffles
# the data; without a fixed random_state the probabilities, and therefore the
# ROC curve and AUC, can change from run to run. The seed matches the one
# used for the train/test split.
svm_model = SVC(probability=True, random_state=2)
svm_model.fit(X_train_scaled, y_train)
svm_scores = svm_model.predict_proba(X_test_scaled)[:,1]

# Generate ROC curve data for logistic regression model
lr_fpr, lr_tpr, lr_thresholds = roc_curve(y_test, lr_scores)
lr_auc = roc_auc_score(y_test, lr_scores)

# Generate ROC curve data for SVM model
svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test, svm_scores)
svm_auc = roc_auc_score(y_test, svm_scores)

# Generate a trace for the Logistic Regression ROC curve
trace0 = go.Scatter(
    x=lr_fpr,
    y=lr_tpr,
    mode='lines',
    name=f'Logistic Regression (Area = {lr_auc:.2f})'
)

# Generate a trace for the SVM ROC curve
trace1 = go.Scatter(
    x=svm_fpr,
    y=svm_tpr,
    mode='lines',
    name=f'SVM (Area = {svm_auc:.2f})'
)

# Diagonal reference line: the expected curve of a random classifier
trace2 = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random (Area = 0.5)',
    line=dict(dash='dash')
)

# Use a dedicated name for the traces: rebinding `data` here would overwrite
# the DataFrame loaded at the top of the notebook and break re-running the
# earlier cells that read from it.
roc_traces = [trace0, trace1, trace2]

# Define layout with equal width and height so the plot area is square
layout = go.Layout(
    title='Receiver Operating Characteristic',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    autosize=False,
    width=800,
    height=800,
    showlegend=True
)

# Define figure and add the traces
fig = go.Figure(data=roc_traces, layout=layout)

# Show figure
fig.show()
