# KNN

- Binary Classification
- Evaluation Methods

## 1. Load Data

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

In [2]:
# Read in csv file for ibm_telco that is stored in path:
url = 'https://raw.githubusercontent.com/RogerCui-GitHub/Data-Mining/master/datasets/ibm_telco_processed.csv'
df = pd.read_csv(url) #'./datasets/ibm_telco_processed.csv')

## 2. View Data

In [3]:
#Preview the dataset
df.head(3)

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,gender_le,Partner_le,Dependents_le,PhoneService_le,MultipleLines_le,InternetService_le,OnlineSecurity_le,OnlineBackup_le,DeviceProtection_le,TechSupport_le,StreamingTV_le,StreamingMovies_le,Contract_le,PaperlessBilling_le,PaymentMethod_le,CHURN
0,7590-VHVEG,1,29.85,29.85,No,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,0
1,5575-GNVDE,34,56.95,1889.5,No,1,0,0,1,0,0,2,0,2,0,0,0,1,0,3,0
2,3668-QPYBK,2,53.85,108.15,Yes,1,0,0,1,0,0,2,2,0,0,0,0,0,1,3,1


In [4]:
# We again see that the describe function only describes continuous variables
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tenure,7043.0,32.371149,24.559481,0.0,9.0,29.0,55.0,72.0
MonthlyCharges,7043.0,64.761692,30.090047,18.25,35.5,70.35,89.85,118.75
gender_le,7043.0,0.504756,0.500013,0.0,0.0,1.0,1.0,1.0
Partner_le,7043.0,0.483033,0.499748,0.0,0.0,0.0,1.0,1.0
Dependents_le,7043.0,0.299588,0.45811,0.0,0.0,0.0,1.0,1.0
PhoneService_le,7043.0,0.903166,0.295752,0.0,1.0,1.0,1.0,1.0
MultipleLines_le,7043.0,0.940508,0.948554,0.0,0.0,1.0,2.0,2.0
InternetService_le,7043.0,0.872923,0.737796,0.0,0.0,1.0,1.0,2.0
OnlineSecurity_le,7043.0,0.790004,0.859848,0.0,0.0,1.0,2.0,2.0
OnlineBackup_le,7043.0,0.906432,0.880162,0.0,0.0,1.0,2.0,2.0


In [5]:
#Check datatypes
df.dtypes

customerID              object
tenure                   int64
MonthlyCharges         float64
TotalCharges            object
Churn                   object
gender_le                int64
Partner_le               int64
Dependents_le            int64
PhoneService_le          int64
MultipleLines_le         int64
InternetService_le       int64
OnlineSecurity_le        int64
OnlineBackup_le          int64
DeviceProtection_le      int64
TechSupport_le           int64
StreamingTV_le           int64
StreamingMovies_le       int64
Contract_le              int64
PaperlessBilling_le      int64
PaymentMethod_le         int64
CHURN                    int64
dtype: object

## 3. Process Data

In [6]:
target = df[['CHURN', 'customerID']]

In [7]:
variables = df.drop(['Churn', 'CHURN', 'customerID'], axis=1)
variables.head(3)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_le,Partner_le,Dependents_le,PhoneService_le,MultipleLines_le,InternetService_le,OnlineSecurity_le,OnlineBackup_le,DeviceProtection_le,TechSupport_le,StreamingTV_le,StreamingMovies_le,Contract_le,PaperlessBilling_le,PaymentMethod_le
0,1,29.85,29.85,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2
1,34,56.95,1889.5,1,0,0,1,0,0,2,0,2,0,0,0,1,0,3
2,2,53.85,108.15,1,0,0,1,0,0,2,2,0,0,0,0,0,1,3


In [8]:
# We are going to convert the TotalCharges to numeric.  However, there are blank spaces in the column, which will
# cause errors as we are trying to convert it.  We can coerce these errors for now, which will make these black observations
# null, but we will have to either impute values for the logistic regression, or, drop the records.

#For simplicity, we will impute
variables['TotalCharges'] = variables['TotalCharges'].apply(pd.to_numeric, errors = 'coerce')

from sklearn.impute import SimpleImputer
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

variables['TotalCharges'] = variables['TotalCharges'].fillna(variables['TotalCharges'].mean())

In [9]:
variables.dtypes

tenure                   int64
MonthlyCharges         float64
TotalCharges           float64
gender_le                int64
Partner_le               int64
Dependents_le            int64
PhoneService_le          int64
MultipleLines_le         int64
InternetService_le       int64
OnlineSecurity_le        int64
OnlineBackup_le          int64
DeviceProtection_le      int64
TechSupport_le           int64
StreamingTV_le           int64
StreamingMovies_le       int64
Contract_le              int64
PaperlessBilling_le      int64
PaymentMethod_le         int64
dtype: object

In [10]:
# Again, we split the data into training and testing
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(variables, 
                                                    target, 
                                                    test_size=0.2, 
                                                    random_state=1)

## 4. Model Creation

In [11]:
# It is important to know that scikit-learn requires the variables to be numpy arrays for k-nearest neighbors.  Let's check that:
x_train_vars_only = x_train.apply(pd.to_numeric)
type(x_train)

pandas.core.frame.DataFrame

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_vars_only_s = scaler.fit_transform(x_train_vars_only)

scaler = StandardScaler()
x_test_vars_only_s = scaler.fit_transform(x_test)

type(x_train_vars_only_s)

numpy.ndarray

In [13]:
# Now, let's train the learner:
# Remember, the variables are already scaled at this point!  No need to repeat this.
from sklearn import neighbors
knn_clf = neighbors.KNeighborsClassifier(50, weights = 'uniform')

#### For weights we can use:
- ‘uniform’ : uniform weights. All points in each neighborhood are weighted equally.

- ‘distance’ : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.

In [14]:
knn_model = knn_clf.fit(x_train_vars_only_s, y_train['CHURN'])

In [15]:
knn_model.score(x_train_vars_only_s, y_train['CHURN'])

0.7921547745828896

In [16]:
# 80% accuracy on train data!  What about test?
knn_model.score(x_test_vars_only_s, y_test['CHURN'])

0.8048261178140526

In [17]:
# We can find the probabilities produced by this model:
knn_probs_train = knn_model.predict_proba(x_train_vars_only_s)
knn_probs_test = knn_model.predict_proba(x_test_vars_only_s)
knn_probs_test

array([[0.84, 0.16],
       [0.76, 0.24],
       [0.46, 0.54],
       ...,
       [0.98, 0.02],
       [0.84, 0.16],
       [0.44, 0.56]])

In [18]:
# Let's produce these preditions in the dataframe:
x_train.loc[:,'KNN_PROBS'] = knn_probs_train[:, 1]
x_test.loc[:,'KNN_PROBS'] = knn_probs_test[:, 1]
x_test.loc[:,'KNN_PREDS'] = knn_model.predict(x_test_vars_only_s)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

In [19]:
x_test.head(3)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_le,Partner_le,Dependents_le,PhoneService_le,MultipleLines_le,InternetService_le,OnlineSecurity_le,OnlineBackup_le,DeviceProtection_le,TechSupport_le,StreamingTV_le,StreamingMovies_le,Contract_le,PaperlessBilling_le,PaymentMethod_le,KNN_PROBS,KNN_PREDS
3381,41,79.85,3320.75,0,0,0,1,0,0,2,0,2,2,2,2,1,1,0,0.16,0
6180,66,102.4,6471.85,0,0,0,1,2,1,2,0,0,0,2,2,2,1,0,0.24,0
4829,12,45.0,524.35,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0.54,1


In [20]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_train['CHURN'], x_train['KNN_PROBS'])
metrics.auc(fpr, tpr)

0.8331547011903855

In [21]:
fpr, tpr, thresholds = metrics.roc_curve(y_test['CHURN'], x_test['KNN_PROBS'])
metrics.auc(fpr, tpr)

0.8417414172272959

In [22]:
# We can also find the best value of K for our model with cross-validation:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
standardizer = StandardScaler()

In [23]:
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean', n_jobs=-1).fit(x_train_vars_only_s, y_train['CHURN'])

In [24]:
# Create a pipeline
pipe = Pipeline([('standardizer', standardizer), ('knn', knn)])

# Create space of candidate values
search_space = [{'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}]

In [25]:
# Create grid search 
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0).fit(x_train_vars_only_s, y_train['CHURN'])

In [26]:
# Best neighborhood size (k)
clf.best_estimator_.get_params()['knn__n_neighbors']

10