In [1]:
import sklearn
import pandas as pd
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

# Use plotly's "notebook" renderer as the default for the fig.show() calls below.
pio.renderers.default = "notebook"


In [19]:
# Load the diabetes dataset from a CSV file located next to the notebook.
filename = "diabetes.csv"
df = pd.read_csv(filepath_or_buffer=filename)

# Bare last expression: rich display of the loaded frame.
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [26]:
# One box trace per selected feature, drawn on a shared figure, so that
# extreme values in each column can be spotted visually.
box_columns = ['BloodPressure', 'SkinThickness', 'BMI', 'Age', 'Insulin']
fig = go.Figure(data=[go.Box(y=df[column], name=column) for column in box_columns])
fig.update_layout(title='boxplot of certain features within the data')
fig.show()

As you can see, there are some outliers. I believe it is important to clarify some of the features in order to understand whether those particular points are genuine outliers or not.

BMI – Body Mass Index is a metric, based on height and weight, used to work out whether a person is underweight, normal, overweight, obese or extremely obese. It is calculated as $\mathrm{weight\,(kg)} / \mathrm{height\,(m)}^2$. A BMI of 0, as seen among the outliers, would therefore imply a weight of 0 kg, which is not possible given the ages of these individuals. There is therefore a case for removing these rows from the dataframe, as they are unrealistic.

Extremely low or zero blood pressure (diastolic blood pressure) is possible in some cases (see the source listed below). Therefore there is a case for keeping this information.





sources used above: https://www.theiaforum.org/article.asp?issn=2589-7934;year=2016;volume=17;issue=1;spage=32;epage=33;aulast=Choudhary#:~:text=Extremely%20low%20or%20zero%20DBP,arteriovenous%20malformation%2C%20and%20aortic%20dissection.

In [27]:
# Step 1: discard rows whose BMI is recorded as 0 (argued above to be unrealistic).
removal_outlier_bmi = df.loc[df['BMI'].ne(0)]

# Step 2: from what remains, also discard rows whose SkinThickness is recorded as 0.
# NOTE(review): the notebook text only justifies the BMI filter; presumably a zero
# skin thickness is likewise treated as a missing measurement - confirm.
removal_outlier_skin_thickness = removal_outlier_bmi.loc[
    removal_outlier_bmi['SkinThickness'].ne(0)
]

# Work on an independent copy so later changes do not touch df.
data = removal_outlier_skin_thickness.copy()
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
6,3,78,50,32,88,31.0,0.248,26,1
...,...,...,...,...,...,...,...,...,...
761,9,170,74,31,0,44.0,0.403,43,1
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0


In [28]:
# Class balance of the target column after the zero-value rows were removed.
data['Outcome'].value_counts()

0    359
1    180
Name: Outcome, dtype: int64

In [30]:
# Oversample with SMOTE so that both Outcome classes have the same number of rows.
from imblearn.over_sampling import SMOTE

# Predictors are every column up to and including 'Age'; the target is 'Outcome'.
X = data.loc[:, :'Age']
y = data['Outcome']

# Fixed random_state keeps the synthetic rows reproducible between runs.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

# Put the resampled predictors and target back side by side in a single frame.
resampled_data = pd.concat(
    [pd.DataFrame(X_res), pd.DataFrame(y_res)],
    axis='columns',
)
resampled_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.600000,0.627000,50,1
1,1,85,66,29,0,26.600000,0.351000,31,0
2,1,89,66,23,94,28.100000,0.167000,21,0
3,0,137,40,35,168,43.100000,2.288000,33,1
4,3,78,50,32,88,31.000000,0.248000,26,1
...,...,...,...,...,...,...,...,...,...
713,3,78,52,31,86,31.261101,0.333184,26,1
714,2,129,77,23,78,28.442516,0.325250,33,1
715,0,175,93,27,0,34.739526,0.730110,47,1
716,2,127,84,36,110,35.450921,0.891479,37,1


In [35]:
import datacompy

# Compare the filtered original rows with the SMOTE output, matching rows on
# every column, to see which rows of the resampled frame are new.
# NOTE(review): when resampled_data is displayed after this cell its column names
# are lower-case, whereas they were capitalised before it - presumably Compare
# normalises the column names of the frames it is given in place. Later cells
# rely on the lower-case names; confirm against the datacompy documentation.
compare = datacompy.Compare(
    data,
    resampled_data,
    join_columns=list(df.columns),  # match on all columns
    abs_tol=0.0001,
    rel_tol=0,
    df1_name='original',
    df2_name='resampled',
)
print(compare.report())

DataComPy Comparison
--------------------

DataFrame Summary
-----------------

   DataFrame  Columns  Rows
0   original        9   539
1  resampled        9   718

Column Summary
--------------

Number of columns in common: 9
Number of columns in original but not in resampled: 0
Number of columns in resampled but not in original: 0

Row Summary
-----------

Matched on: pregnancies, glucose, bloodpressure, skinthickness, insulin, bmi, diabetespedigreefunction, age, outcome
Any duplicates on match values: No
Absolute Tolerance: 0.0001
Relative Tolerance: 0
Number of rows in common: 539
Number of rows in original but not in resampled: 0
Number of rows in resampled but not in original: 179

Number of rows with some compared columns unequal: 0
Number of rows with all compared columns equal: 539

Column Comparison
-----------------

Number of columns compared with some values unequal: 0
Number of columns compared with all values equal: 9
Total number of values which compare unequal: 0

Samp

In [41]:
resampled_data

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
0,6,148,72,35,0,33.600000,0.627000,50,1
1,1,85,66,29,0,26.600000,0.351000,31,0
2,1,89,66,23,94,28.100000,0.167000,21,0
3,0,137,40,35,168,43.100000,2.288000,33,1
4,3,78,50,32,88,31.000000,0.248000,26,1
...,...,...,...,...,...,...,...,...,...
713,3,78,52,31,86,31.261101,0.333184,26,1
714,2,129,77,23,78,28.442516,0.325250,33,1
715,0,175,93,27,0,34.739526,0.730110,47,1
716,2,127,84,36,110,35.450921,0.891479,37,1


In [43]:
# Scatter plot of BMI against glucose for the SMOTE-resampled rows.
#
# The column names are lower-cased on a local view first, so this cell works
# whether or not an earlier cell has already changed the casing of
# resampled_data's columns (the frame is created with capitalised names).
scatter_df = resampled_data.rename(columns=str.lower)

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=scatter_df['bmi'],
        y=scatter_df['glucose'],
        mode='markers',
        name='resampled samples',  # replaces the placeholder trace name 'hello'
    )
)
fig.update_layout(
    title='relationship between BMI and Glucose',
    xaxis_title='BMI',
    yaxis_title='Glucose',
    legend_title='markers',
)
fig.show()

In [46]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE  # re-imported so this cell is self-contained

# Build the modelling frame from the filtered ORIGINAL rows, not from the
# already-oversampled frame. Oversampling before the split lets synthetic rows
# that were interpolated from validation rows end up in the training fold,
# which inflates the validation metrics.
#
# Column names are lower-cased on a local copy so the selection below does not
# depend on whether an earlier cell has changed the casing of data's columns.
model_data = data.rename(columns=str.lower)

# Predictors are every column up to and including 'age'; the target is 'outcome'.
X = model_data.loc[:, :'age']
y = model_data.loc[:, 'outcome']

# Hold out 20% of the real rows for validation; fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Balance the classes in the training fold only; the validation fold keeps
# nothing but real, untouched observations.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [47]:
from catboost import CatBoostClassifier

In [50]:

# Positions of every feature column. Not passed to fit() below, so no column
# is treated as categorical.
cat_features = list(range(X.shape[1]))

# Small CatBoost model: 10 boosting iterations at a 0.01 learning rate,
# optimising cross-entropy while additionally tracking AUC and precision.
clf = CatBoostClassifier(
    loss_function='CrossEntropy',
    custom_loss=['AUC', 'Precision'],
    iterations=10,
    learning_rate=0.01,
)

# Fit on the training split and evaluate on the validation split.
# verbose=False silences the per-iteration log; plot=True shows CatBoost's
# metric visualiser widget as the cell output.
train = clf.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    plot=True,
    verbose=False,
)



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [48]:
# Report whether the classifier has been fitted and which parameters it holds.
# NOTE(review): the output saved under this cell comes from an earlier execution
# (its execution count is lower than the fitting cell's) and shows different
# parameters from the classifier configured above - re-run to refresh it.
print(f'CatBoost model is fitted: {clf.is_fitted()}')
print('CatBoost model parameters:')
print(clf.get_params())

CatBoost model is fitted: True
CatBoost model parameters:
{'iterations': 5, 'learning_rate': 0.1}


In [51]:
# Show the classifier's parameters as the cell's output.
clf.get_params()

{'iterations': 10,
 'learning_rate': 0.01,
 'loss_function': 'CrossEntropy',
 'custom_loss': ['AUC', 'Precision']}