# Supervised Learning with scikit-learn
Run the hidden code cell below to import the data used in this course.

In [10]:
# Importing pandas
import pandas as pd
import numpy as np

# Importing the course datasets 
diabetes = pd.read_csv('datasets/diabetes_clean.csv')
music = pd.read_csv('datasets/music_clean.csv')
advertising = pd.read_csv('datasets/advertising_and_sales_clean.csv')
telecom = pd.read_csv("datasets/telecom_churn_clean.csv")

print(telecom)

      Unnamed: 0  account_length  ...  customer_service_calls  churn
0              0             128  ...                       1      0
1              1             107  ...                       1      0
2              2             137  ...                       0      0
3              3              84  ...                       2      0
4              4              75  ...                       3      0
...          ...             ...  ...                     ...    ...
3328        3328             192  ...                       2      0
3329        3329              68  ...                       3      0
3330        3330              28  ...                       2      0
3331        3331             184  ...                       2      0
3332        3332              74  ...                       0      0

[3333 rows x 20 columns]


## Classification

1. Build a Model
2. Model learns from the labeled data we pass to it
3. Pass unlabeled data to the model as input
4. Model predicts the labels of the unseen data

Note: `Labeled data = training data`

## k-Nearest Neighbors

* Predict the label of a data point by
  * Looking at the `k` closest labeled data points
  * Taking a majority vote

<img src="k-nn.png" 
     width="400" 
     height="500">

## KNN Intuition
<br></br>
<img src="k-nn-intuition.png" >


## k-Nearest Neighbors: Fit

In [7]:
# Import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier 

# Create arrays for the features and the target variable
y = telecom["churn"].values
X = telecom[["account_length", "customer_service_calls"]].values

# Create a KNN classifier with 6 neighbors
knn = KNeighborsClassifier(n_neighbors = 6)

# Fit the classifier to the data
knn.fit(X, y)

## k-Nearest Neighbors: Predict

In [12]:
# Set of new data points:
X_new = np.array([[30.0, 17.5],
                  [107.0, 24.1],
                  [213.0, 10.9]])

In [13]:
# Predict the labels for the X_new
y_pred = knn.predict(X_new)

# Print the predictions for X_new
print("Predictions: {}".format(y_pred)) 

Predictions: [0 1 0]


## Train/test split + computing accuracy
* In classification, accuracy is a commonly used metric
* <b>Accuracy:</b>
<img src="Train-Test-Split.png" 
     width="400" 
     height="500">

## Computing accuracy
<img src="Computing-accuracy.png">

## Model complexity
<img src="Model-complexity.png">


In [16]:
# Import the module
from sklearn.model_selection import train_test_split

X = telecom.drop("churn", axis=1).values
y = telecom["churn"].values

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the classifier to the training data
knn.fit(X_train, y_train)

# Print the accuracy
print(knn.score(X_test, y_test))

0.8545727136431784


## Overfitting and underfitting

In [19]:
# Create neighbors
neighbors = np.arange(1, 13)
train_accuracies = {}
test_accuracies = {}

for neighbor in neighbors:
  
	# Set up a KNN Classifier
	knn = KNeighborsClassifier(n_neighbors=neighbor)
  
	# Fit the model
	knn.fit(X_train, y_train)
  
	# Compute accuracy
	train_accuracies[neighbor] = knn.score(X_train, y_train)
	test_accuracies[neighbor] = knn.score(X_test, y_test)
print(neighbors, '\n', train_accuracies, '\n', test_accuracies)

[ 1  2  3  4  5  6  7  8  9 10 11 12] 
 {1: 1.0, 2: 0.8885971492873218, 3: 0.8994748687171793, 4: 0.8750937734433608, 5: 0.878469617404351, 6: 0.8660915228807202, 7: 0.8705926481620405, 8: 0.8615903975993998, 9: 0.86384096024006, 10: 0.858589647411853, 11: 0.8604651162790697, 12: 0.8574643660915229} 
 {1: 0.7856071964017991, 2: 0.8470764617691154, 3: 0.8320839580209896, 4: 0.856071964017991, 5: 0.8545727136431784, 6: 0.8590704647676162, 7: 0.8605697151424287, 8: 0.8620689655172413, 9: 0.863568215892054, 10: 0.8605697151424287, 11: 0.8605697151424287, 12: 0.8605697151424287}
