# K-Nearest Neighbor

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn.model_selection

from acquire import get_titanic_data
from prepare import prep_titanic

# Load the Titanic passengers through the project-local acquire helper and
# preview the first rows.
# NOTE(review): the preview below includes an 'Unnamed: 0' column, which looks
# like a saved index re-read from a cached CSV — confirm in acquire.py.
titanic = get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [2]:
def encode_sex(df):
    '''
    Return a new dataframe in which ``sex`` is 1 for rows equal to
    'female' and 0 for every other value. ``df`` itself is not modified.
    '''
    is_female = df['sex'].eq('female')
    return df.assign(sex=is_female.astype(int))

In [3]:
def get_splits(titanic):
    '''
    Build the modelling splits from the titanic dataframe.

    A copy of the frame is passed to ``prep_titanic`` (which hands back the
    train and test frames), both are narrowed to the modelling columns, and
    train is further divided 85/15 into train and validate.

    Returns X_train, y_train, X_validate, y_validate, X_test, y_test, where
    each y is the ``survived`` column and each X has had ``sex`` encoded by
    ``encode_sex``.
    '''
    target = 'survived'
    keep = [target, 'pclass', 'sex', 'age', 'alone']

    # work on a copy so the caller's dataframe is left alone
    passengers = titanic.copy()

    # silence warnings for the prep call only
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        _scaler, _encoder, train, test = prep_titanic(passengers)

    # restrict both frames to the target plus the features of interest
    train, test = train[keep], test[keep]

    # carve validate out of train with a fixed seed
    train, validate = sklearn.model_selection.train_test_split(
        train, train_size=.85, random_state=123
    )

    def to_xy(frame):
        # separate the target from the features, then encode sex on the features
        return encode_sex(frame.drop(columns=target)), frame[target]

    X_train, y_train = to_xy(train)
    X_validate, y_validate = to_xy(validate)
    X_test, y_test = to_xy(test)

    return X_train, y_train, X_validate, y_validate, X_test, y_test

In [4]:
# unpack the six frames/series returned by get_splits
X_train, y_train, X_validate, y_validate, X_test, y_test = get_splits(titanic)

# report how many rows landed in each split
print('   train: %d rows' % len(X_train))
print('validate: %d rows' % len(X_validate))
print('    test: %d rows' % len(X_test))

   train: 567 rows
validate: 101 rows
    test: 223 rows


In [5]:
# frame seeded with the validate-set actuals, intended to hold each model's
# predictions for later comparison
evaluation = pd.DataFrame(dict(actual=y_validate))

# Exercises

1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [6]:
# survived ~ pclass + age
X_train, y_train, X_validate, y_validate, X_test, y_test = get_splits(titanic)

# Drop the unused features from every split, test included, so each X has
# exactly the columns the model is fit on (predicting on a frame that still
# carries the extra columns would fail).
unused_features = ['alone', 'sex']
X_train = X_train.drop(columns=unused_features)
X_validate = X_validate.drop(columns=unused_features)
X_test = X_test.drop(columns=unused_features)

# weights can be 'uniform' or 'distance'
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn.fit(X_train, y_train)

# in-sample predictions and class probabilities
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [7]:
# mean accuracy of the fitted model on the training split
train_accuracy = knn.score(X_train, y_train)
print('Accuracy of KNN classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of KNN classifier on training set: 0.76


3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [8]:
# rows are the actual classes (0, 1); columns are the predicted classes
train_confusion = confusion_matrix(y_train, y_pred)
print(train_confusion)

[[319  34]
 [102 112]]


In [9]:
# per-class precision, recall, f1-score and support on the training split
train_report = classification_report(y_train, y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.76      0.90      0.82       353
           1       0.77      0.52      0.62       214

    accuracy                           0.76       567
   macro avg       0.76      0.71      0.72       567
weighted avg       0.76      0.76      0.75       567



4. Run through steps 2-4 setting k to 10

In [10]:
# survived ~ pclass + age + sex
# NOTE(review): this run adds `sex` relative to the k=5 model above, so any
# change in the metrics is not attributable to k alone.
X_train, y_train, X_validate, y_validate, X_test, y_test = get_splits(titanic)

# Drop the unused feature from every split, test included, so each X has
# exactly the columns the model is fit on.
unused_features = ['alone']
X_train = X_train.drop(columns=unused_features)
X_validate = X_validate.drop(columns=unused_features)
X_test = X_test.drop(columns=unused_features)

# weights can be 'uniform' or 'distance'
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')
knn.fit(X_train, y_train)

# in-sample predictions and class probabilities
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.81


In [11]:
# rows are the actual classes (0, 1); columns are the predicted classes
train_confusion = confusion_matrix(y_train, y_pred)
print(train_confusion)

[[335  18]
 [ 87 127]]


In [12]:
# per-class precision, recall, f1-score and support on the training split
train_report = classification_report(y_train, y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.79      0.95      0.86       353
           1       0.88      0.59      0.71       214

    accuracy                           0.81       567
   macro avg       0.83      0.77      0.79       567
weighted avg       0.82      0.81      0.81       567



5. Run through steps 2-4 setting k to 20

In [13]:
# survived ~ pclass + age + sex + alone (every feature from get_splits is kept)
X_train, y_train, X_validate, y_validate, X_test, y_test = get_splits(titanic)

# weights can be 'uniform' or 'distance'
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform').fit(X_train, y_train)

# in-sample predictions and class probabilities
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

# evaluate model
train_accuracy = knn.score(X_train, y_train)
print('Accuracy of KNN classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of KNN classifier on training set: 0.80


In [14]:
# rows are the actual classes (0, 1); columns are the predicted classes
train_confusion = confusion_matrix(y_train, y_pred)
print(train_confusion)

[[327  26]
 [ 88 126]]


In [15]:
# per-class precision, recall, f1-score and support on the training split
train_report = classification_report(y_train, y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.79      0.93      0.85       353
           1       0.83      0.59      0.69       214

    accuracy                           0.80       567
   macro avg       0.81      0.76      0.77       567
weighted avg       0.80      0.80      0.79       567



6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

# Test

For both the `iris` and the `titanic` data,

1. Determine which model (with hyperparameters) performs the best (try reducing the number of features to the top 4 features in terms of information gained for each feature individually).
   

2. Create a new dataframe with top 4 features.
    

3. Use the top-performing algorithm with the hyperparameters used in that model. Create the object, fit, transform on in-sample data, and evaluate the results with the training data. Compare your evaluation metrics with those from the original model (with all the features).
    

4. Run your final model on your out-of-sample dataframe (`test_df`). Evaluate the results.

# Feature Engineering

- Titanic Data
  - Create a feature named `who`, which should be either `man`, `woman`, or `child`. How does including this feature affect your model's performance?
  - Create a feature named adult_male that is either a 1 or a 0. How does this affect your model's predictions?

- Iris Data
  - Create features named petal_area and sepal_area.