In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif

In [None]:
# Read the dataset into a DataFrame; the path is resolved relative to the
# current working directory.
df = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [None]:
# Split the frame into a feature matrix and the target column. Working on a
# copy means `df` itself still contains "Outcome" afterwards.
X = df.copy()
y = X.pop("Outcome")

# Label-encode string-typed (object) columns: every distinct value is replaced
# by an integer code.
for colname in X.select_dtypes("object"):
    X[colname], _ = X[colname].factorize()

# Boolean mask, indexed by column name, flagging every integer-typed column as
# a discrete feature. `is_integer_dtype` is used instead of `X.dtypes == int`
# because that comparison only matches the platform's default integer width:
# it can miss int64 columns where the default is 32-bit, and it never matches
# narrower or unsigned integer columns.
# Double-check that the flagged columns really are discrete before using MI!
discrete_features = pd.Series(
    [pd.api.types.is_integer_dtype(dtype) for dtype in X.dtypes],
    index=X.columns,
    dtype=bool,
)

# Select the most influential features

In [None]:
def make_mi_scores(X, y, discrete_features, random_state=0):
    """Rank features by their estimated mutual information with the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Its column labels become the index of the result.
    y : array-like
        Class labels, one per row of ``X``.
    discrete_features : bool, "auto" or array-like
        Forwarded unchanged to ``mutual_info_classif`` to mark which features
        are discrete.
    random_state : int, RandomState instance or None, default=0
        Forwarded to ``mutual_info_classif``. The estimator is stochastic, so
        a fixed seed makes the ranking reproducible across runs; pass ``None``
        to get a fresh random estimate on every call.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by feature name and sorted from the
        highest score to the lowest.
    """
    raw_scores = mutual_info_classif(
        X, y, discrete_features=discrete_features, random_state=random_state
    )
    labelled = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return labelled.sort_values(ascending=False)

# Score every feature against the target. The bare name on the final line
# makes the notebook render the ranked Series.
mi_scores = make_mi_scores(X=X, y=y, discrete_features=discrete_features)
mi_scores

Glucose                     0.210856
Insulin                     0.192067
Age                         0.097693
BMI                         0.065320
SkinThickness               0.056605
Pregnancies                 0.042854
BloodPressure               0.041110
DiabetesPedigreeFunction    0.005106
Name: MI Scores, dtype: float64

# Train XGBoost

In [None]:
# Rebuild the model inputs as plain NumPy arrays. The target is selected by
# name, consistent with the `X.pop("Outcome")` split used for the MI analysis,
# so the split does not depend on "Outcome" being the last column of the CSV.
# Note: `X` and `y` are rebound here, replacing the DataFrame/Series objects
# created for the mutual-information step.
X = df.drop(columns="Outcome").values
y = df["Outcome"].values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. The scaler's statistics are learned from the
# training split only and then applied to both splits, so nothing from the
# held-out rows influences the preprocessing.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from xgboost import XGBClassifier
# Fit a gradient-boosted tree classifier, with the library's default
# hyperparameters, on the standardised training split.
classifier = XGBClassifier()
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Evaluate on the held-out split: print the confusion matrix, then leave the
# accuracy as the cell's final expression so the notebook displays it.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[107  23]
 [ 23  39]]


0.7604166666666666

# Save the model

In [None]:
import joblib
# Persist the fitted StandardScaler as well. The classifier was trained on
# standardised features, so anything that loads the model needs this same
# fitted scaler to transform raw inputs before calling `predict`.
joblib.dump(sc, "ml_scaler")
# Persist the trained classifier. The file name is unchanged so existing
# consumers keep working, and this stays the last expression so the cell
# still displays the written path.
joblib.dump(classifier, "ml_model76%")

['ml_model76%']