In [5]:
# url = 'https://raw.githubusercontent.com/HOGENT-Databases/DB3-Workshops/master/data/diabetes.csv'
# !wget -O diabetes.csv https://raw.githubusercontent.com/HOGENT-Databases/DB3-Workshops/master/data/diabetes.csv

In [33]:
import pandas as pd
import numpy as np
import sklearn
import autosklearn
from sklearn.model_selection import train_test_split
from autosklearn.classification import AutoSklearnClassifier

In [27]:
# Read the dataset from the notebook's working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [28]:
# Flag every row holding a 0 in BloodPressure, Glucose or BMI and drop it.
# NOTE(review): presumably a zero in these columns is a placeholder for a
# missing measurement rather than a real reading — confirm against the data source.
has_zero_observation = df[['BloodPressure', 'Glucose', 'BMI']].eq(0).any(axis='columns')
print(f"Number of rows that will be removed: {has_zero_observation.sum()}")

# Keep only the rows that were not flagged; `df` itself is left untouched.
diabetes = df.loc[~has_zero_observation]
num_rows, num_cols = diabetes.shape
print(f"num_rows: {num_rows}, num_cols: {num_cols}")

Number of rows that will be removed: 44
num_rows: 724, num_cols: 9


In [29]:
# Separate the feature matrix from the 'Outcome' target column.
X = diabetes.drop(columns=['Outcome'])
y = diabetes.loc[:, 'Outcome']


In [32]:
# Inspect the feature matrix ('Outcome' has been dropped); the rendered output elides the middle rows.
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [12]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.3,
  random_state=42,
)
print(f"Number of training examples: {len(X_train)}, number of features {X_train.shape[1]}")
print(f"Number of test examples: {len(X_test)}")

Number of training examples: 506, number of features 8
Number of test examples: 218


In [14]:
def get_and_fit_model(X_train, y_train, dataset_name, time_left_for_this_task=30, per_run_time_limit=10):
  """Create an AutoSklearnClassifier with the given time limits and fit it.

  Args:
    X_train: Training features, passed straight to ``fit``.
    y_train: Training labels, passed straight to ``fit``.
    dataset_name: Forwarded to ``fit`` as ``dataset_name``.
    time_left_for_this_task: Forwarded unchanged to the classifier constructor
      (presumably seconds — see the auto-sklearn documentation).
    per_run_time_limit: Forwarded unchanged to the classifier constructor
      (presumably seconds — see the auto-sklearn documentation).

  Returns:
    The classifier instance after ``fit`` has been called on it.
  """
  time_limits = {
    "time_left_for_this_task": time_left_for_this_task,
    "per_run_time_limit": per_run_time_limit,
  }
  automl = AutoSklearnClassifier(**time_limits)
  automl.fit(X_train, y_train, dataset_name=dataset_name)
  return automl

In [15]:
# Fit the AutoML classifier on the training split with an overall limit of 120
# and a per-run limit of 30 (presumably seconds — see the auto-sklearn docs).
# The dataset name now matches the data loaded from diabetes.csv; the previous
# "HeartFailure" label was a leftover that mislabelled this run.
model = get_and_fit_model(
  X_train,
  y_train,
  dataset_name="diabetes",
  time_left_for_this_task=120,
  per_run_time_limit=30,
)

In [17]:
def get_scores_and_predictions(model, X_train, y_train, X_test, y_test):
  """Score a model on the train and test splits and predict on the test split.

  Args:
    model: Object exposing ``score(X, y)``, ``predict(X)`` and ``predict_proba(X)``.
    X_train: Training features, used only for the training score.
    y_train: Training labels, used only for the training score.
    X_test: Test features, used for the test score and both prediction calls.
    y_test: Test labels, used only for the test score.

  Returns:
    A 4-tuple ``(training_score, testing_score, predictions, predictions_probas)``
    holding the results of ``score`` on each split followed by ``predict`` and
    ``predict_proba`` on ``X_test``.
  """
  # Score the training split first, then the test split.
  split_scores = tuple(
    model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
  )
  # Predictions are produced for the test split only.
  test_outputs = (model.predict(X_test), model.predict_proba(X_test))
  return split_scores + test_outputs

In [18]:
# Evaluate the fitted model on both splits and collect its test-set predictions.
training_score, testing_score, predictions, predictions_probas = get_scores_and_predictions(
  model, X_train, y_train, X_test, y_test
)

# Report each result on its own labelled line.
print(f"training_score: {training_score}")
print(f"testing_score: {testing_score}")
print(f"predictions: {predictions}")
print(f"predictions_probas: {predictions_probas}")

training_score: 0.8656126482213439
testing_score: 0.7614678899082569
predictions: [0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0
 0 1 0 0 0 1 0 1 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 0
 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0
 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
 0 0 1 1 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1]
predictions_probas: [[0.66753913 0.33246087]
 [0.85520008 0.14479992]
 [0.40676898 0.59323102]
 [0.43456634 0.56543366]
 [0.3063407  0.6936593 ]
 [0.53205178 0.46794822]
 [0.71487909 0.28512091]
 [0.71066266 0.28933734]
 [0.45082276 0.54917724]
 [0.54794773 0.45205227]
 [0.82797616 0.17202384]
 [0.82335085 0.17664915]
 [0.55444437 0.44555563]
 [0.87253008 0.12746992]
 [0.70958959 0.29041041]
 [0.7519974  0.2480026 ]
 [0.68103957 0.31896043]
 [0.7254651  0.2745349 ]
 [0.5

In [23]:
# Accuracy of the test-set predictions against the true labels.
accuracy_score = sklearn.metrics.accuracy_score(y_true=y_test, y_pred=predictions)
print(accuracy_score)

# Precision of the test-set predictions against the true labels.
precision_score = sklearn.metrics.precision_score(y_true=y_test, y_pred=predictions)
print(precision_score)

0.7614678899082569
0.6440677966101694
