## Question 1 - Preprocessing 

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

# Step 1: Load the dataset and have a look at it
df = pd.read_csv('../Data/diabetes.csv')
print(df.head(10))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   
5            5      116             74              0        0  25.6   
6            3       78             50             32       88  31.0   
7           10      115              0              0        0  35.3   
8            2      197             70             45      543  30.5   
9            8      125             96              0        0   0.0   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   2

In [5]:
# Step 2: Split the dataset into the explanatory variables (X) and the target variable (y)
X = df.drop('Outcome', axis=1) 
y = df['Outcome'] # y = diabetes.Outcome 
print(X.shape, y.shape)

(768, 8) (768,)


In [6]:
X.describe

<bound method NDFrame.describe of      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1  

We can consider that all our variables are continuous (even Pregnancies to some extent), therefore Euclidean distance is appropriate for our k-NN model.

In [7]:
# Step 3: Perform a train-test split using 60% for the training set and 40% for the test set
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    train_size=0.6, #test_size = 0.4
                                                    random_state=0)

In [8]:
# Step 4: Use MinMaxScaler to scale the data
minmaxscaler = MinMaxScaler()
minmaxscaler.fit(X_train) # NOT on the entirety 
X_train_mms = minmaxscaler.transform(X_train)
X_test_mms = minmaxscaler.transform(X_test)

In [9]:
minmaxscaler.data_min_

array([ 0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.084, 21.   ])

In [10]:
minmaxscaler.data_max_

array([ 15.  , 198.  , 122.  ,  63.  , 846.  ,  67.1 ,   2.42,  81.  ])

In [12]:
pd.DataFrame(X_test_mms).describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,308.0,308.0,308.0,308.0,308.0,308.0,308.0,308.0
mean,0.260823,0.5996,0.549846,0.313028,0.07479,0.470465,0.165978,0.200054
std,0.2237,0.156863,0.164876,0.262954,0.118638,0.116431,0.136106,0.194125
min,0.0,0.0,0.0,0.0,0.0,0.0,-0.002568,0.0
25%,0.066667,0.497475,0.508197,0.0,0.0,0.396423,0.070098,0.05
50%,0.2,0.570707,0.57377,0.333333,0.0,0.470939,0.123716,0.133333
75%,0.4,0.691919,0.639344,0.507937,0.118203,0.540984,0.234482,0.333333
max,1.133333,1.005051,0.934426,1.571429,0.803783,0.788376,0.774401,0.85


In [None]:
# Step 5: Train a k-NN model with k=7 and compute the score on the training and test sets
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train_scaled, y_train)

train_score = knn.score(X_train_scaled, y_train)
test_score = knn.score(X_test_scaled, y_test)

print(f'Training Score: {train_score}')
print(f'Test Score: {test_score}')