# Acquisition

#### imports

In [318]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from scipy import stats
from env import get_connection
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from prepare import encode_categorical_columns, train_val_test, theometrics

Acquiring the telco_churn dataset

In [319]:
db_url = get_connection('telco_churn')

query = '''
       SELECT 
    customers.customer_id,
    customers.gender,
    customers.senior_citizen,
    customers.partner,
    customers.dependents,
    customers.tenure,
    customers.phone_service,
    customers.multiple_lines,
    customers.online_security,
    customers.online_backup,
    customers.device_protection,
    customers.tech_support,
    customers.streaming_tv,
    customers.streaming_movies,
    customers.paperless_billing,
    customers.monthly_charges,
    customers.total_charges,
    customers.churn,
    payment_types.payment_type,
    contract_types.contract_type,
    internet_service_types.internet_service_type
FROM
    customers
        LEFT JOIN
    customer_details ON customer_details.customer_id = customers.customer_id
        LEFT JOIN
    customer_contracts ON customer_contracts.customer_id = customer_details.customer_id
        LEFT JOIN
    customer_payments ON customer_payments.customer_id = customer_contracts.customer_id
        LEFT JOIN
    customer_signups ON customer_signups.customer_id = customer_payments.customer_id
        LEFT JOIN
    customer_subscriptions ON customer_subscriptions.customer_id = customer_signups.customer_id
        LEFT JOIN
    customer_churn ON customer_churn.customer_id = customer_subscriptions.customer_id
        LEFT JOIN
    payment_types ON payment_types.payment_type_id = customers.payment_type_id
        LEFT JOIN
    contract_types ON contract_types.contract_type_id = customer_contracts.contract_type_id
        LEFT JOIN
    internet_service_types ON internet_service_types.internet_service_type_id = customers.internet_service_type_id;
        '''

In [320]:
#reading the sql query into a dataframe

telco_df = pd.read_sql(query, db_url)
telco_df.head().T

Unnamed: 0,0,1,2,3,4
customer_id,0002-ORFBO,0003-MKNFE,0004-TLHLJ,0011-IGKFF,0013-EXCHZ
gender,Female,Male,Male,Male,Female
senior_citizen,0,0,0,1,1
partner,Yes,No,No,Yes,Yes
dependents,Yes,No,No,No,No
tenure,9,9,4,13,3
phone_service,Yes,Yes,Yes,Yes,Yes
multiple_lines,No,Yes,No,No,No
online_security,No,No,No,No,No
online_backup,Yes,No,No,Yes,No


# preparation

cleaning up telco dataset

In [321]:
#dropping the customer_id column

telco_df = telco_df.drop(columns = ['customer_id'])
telco_df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,payment_type,contract_type,internet_service_type
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,Yes,Yes,No,Yes,65.6,593.3,No,Mailed check,One year,DSL
1,Male,0,No,No,9,Yes,Yes,No,No,No,No,No,Yes,No,59.9,542.4,No,Mailed check,Month-to-month,DSL
2,Male,0,No,No,4,Yes,No,No,No,Yes,No,No,No,Yes,73.9,280.85,Yes,Electronic check,Month-to-month,Fiber optic
3,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,No,Yes,Yes,Yes,98.0,1237.85,Yes,Electronic check,Month-to-month,Fiber optic
4,Female,1,Yes,No,3,Yes,No,No,No,No,Yes,Yes,No,Yes,83.9,267.4,Yes,Mailed check,Month-to-month,Fiber optic


In [322]:
#checking how many columns and rows ar in the dataframe

telco_df.shape

(7043, 20)

In [323]:
telco_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 7043 non-null   object 
 1   senior_citizen         7043 non-null   int64  
 2   partner                7043 non-null   object 
 3   dependents             7043 non-null   object 
 4   tenure                 7043 non-null   int64  
 5   phone_service          7043 non-null   object 
 6   multiple_lines         7043 non-null   object 
 7   online_security        7043 non-null   object 
 8   online_backup          7043 non-null   object 
 9   device_protection      7043 non-null   object 
 10  tech_support           7043 non-null   object 
 11  streaming_tv           7043 non-null   object 
 12  streaming_movies       7043 non-null   object 
 13  paperless_billing      7043 non-null   object 
 14  monthly_charges        7043 non-null   float64
 15  tota

In [324]:
#checking all rows with no values

(telco_df == ' ').sum()

gender                    0
senior_citizen            0
partner                   0
dependents                0
tenure                    0
phone_service             0
multiple_lines            0
online_security           0
online_backup             0
device_protection         0
tech_support              0
streaming_tv              0
streaming_movies          0
paperless_billing         0
monthly_charges           0
total_charges            11
churn                     0
payment_type              0
contract_type             0
internet_service_type     0
dtype: int64

In [325]:
#checking all rows in total_charges with no values

telco_df[(telco_df.total_charges == ' ')]

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,payment_type,contract_type,internet_service_type
945,Female,0,Yes,Yes,0,No,No phone service,Yes,Yes,Yes,Yes,Yes,No,No,56.05,,No,Credit card (automatic),Two year,DSL
1731,Female,0,Yes,Yes,0,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,20.0,,No,Mailed check,Two year,
1906,Male,0,No,Yes,0,Yes,Yes,Yes,Yes,No,Yes,No,No,Yes,61.9,,No,Bank transfer (automatic),Two year,DSL
2025,Male,0,Yes,Yes,0,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,19.7,,No,Mailed check,One year,
2176,Male,0,No,Yes,0,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,20.25,,No,Mailed check,Two year,
2250,Male,0,Yes,Yes,0,Yes,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,25.35,,No,Mailed check,Two year,
2855,Female,0,Yes,Yes,0,Yes,Yes,No,Yes,Yes,Yes,Yes,No,No,73.35,,No,Mailed check,Two year,DSL
3052,Male,0,Yes,Yes,0,Yes,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,25.75,,No,Mailed check,Two year,
3118,Female,0,Yes,Yes,0,No,No phone service,Yes,No,Yes,Yes,Yes,No,Yes,52.55,,No,Bank transfer (automatic),Two year,DSL
4054,Female,0,Yes,Yes,0,Yes,No,Yes,Yes,Yes,No,Yes,Yes,No,80.85,,No,Mailed check,Two year,DSL


In [326]:
#replacing all the total charges rows with no values with 0

telco_df['total_charges'] = telco_df['total_charges'].replace(' ', 0)

In [327]:
#verifying all the rows with 0 

telco_df[(telco_df.total_charges == 0)]

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,payment_type,contract_type,internet_service_type
945,Female,0,Yes,Yes,0,No,No phone service,Yes,Yes,Yes,Yes,Yes,No,No,56.05,0,No,Credit card (automatic),Two year,DSL
1731,Female,0,Yes,Yes,0,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,20.0,0,No,Mailed check,Two year,
1906,Male,0,No,Yes,0,Yes,Yes,Yes,Yes,No,Yes,No,No,Yes,61.9,0,No,Bank transfer (automatic),Two year,DSL
2025,Male,0,Yes,Yes,0,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,19.7,0,No,Mailed check,One year,
2176,Male,0,No,Yes,0,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,20.25,0,No,Mailed check,Two year,
2250,Male,0,Yes,Yes,0,Yes,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,25.35,0,No,Mailed check,Two year,
2855,Female,0,Yes,Yes,0,Yes,Yes,No,Yes,Yes,Yes,Yes,No,No,73.35,0,No,Mailed check,Two year,DSL
3052,Male,0,Yes,Yes,0,Yes,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,25.75,0,No,Mailed check,Two year,
3118,Female,0,Yes,Yes,0,No,No phone service,Yes,No,Yes,Yes,Yes,No,Yes,52.55,0,No,Bank transfer (automatic),Two year,DSL
4054,Female,0,Yes,Yes,0,Yes,No,Yes,Yes,Yes,No,Yes,Yes,No,80.85,0,No,Mailed check,Two year,DSL


In [328]:
#checking all object types in the dataset

telco_df.dtypes == 'int'

gender                   False
senior_citizen            True
partner                  False
dependents               False
tenure                    True
phone_service            False
multiple_lines           False
online_security          False
online_backup            False
device_protection        False
tech_support             False
streaming_tv             False
streaming_movies         False
paperless_billing        False
monthly_charges          False
total_charges            False
churn                    False
payment_type             False
contract_type            False
internet_service_type    False
dtype: bool

In [329]:
#checking all the null values in the dataset

telco_df.isna().sum()

gender                   0
senior_citizen           0
partner                  0
dependents               0
tenure                   0
phone_service            0
multiple_lines           0
online_security          0
online_backup            0
device_protection        0
tech_support             0
streaming_tv             0
streaming_movies         0
paperless_billing        0
monthly_charges          0
total_charges            0
churn                    0
payment_type             0
contract_type            0
internet_service_type    0
dtype: int64

In [330]:
#encoding the categorical columns

categorical_columns = ['gender', 'partner', 'dependents', 'phone_service', 'multiple_lines', 'online_security',
                       'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
                       'contract_type', 'internet_service_type', 'payment_type', 'churn', 'paperless_billing']

In [331]:
telco_df = encode_categorical_columns(telco_df, categorical_columns)
telco_df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,payment_type,contract_type,internet_service_type
0,0,0,1,1,9,1,0,0,2,0,2,2,0,Yes,65.6,593.3,0,3,1,0
1,1,0,0,0,9,1,2,0,0,0,0,0,2,No,59.9,542.4,0,3,0,0
2,1,0,0,0,4,1,0,0,0,2,0,0,0,Yes,73.9,280.85,1,2,0,1
3,1,1,1,0,13,1,0,0,2,2,0,2,2,Yes,98.0,1237.85,1,2,0,1
4,0,1,1,0,3,1,0,0,0,0,2,2,0,Yes,83.9,267.4,1,3,0,1


In [332]:
#encoding multi-categorical columns

telco_df = pd.get_dummies(telco_df, columns =  ['multiple_lines', 'online_security', 'online_backup',
                                     'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies', 
                                     'payment_type', 'contract_type', 'internet_service_type'])
telco_df = pd.get_dummies(telco_df, columns = ['paperless_billing'], drop_first = True)
telco_df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,monthly_charges,total_charges,churn,multiple_lines_0,...,payment_type_1,payment_type_2,payment_type_3,contract_type_0,contract_type_1,contract_type_2,internet_service_type_0,internet_service_type_1,internet_service_type_2,paperless_billing_Yes
0,0,0,1,1,9,1,65.6,593.3,0,True,...,False,False,True,False,True,False,True,False,False,True
1,1,0,0,0,9,1,59.9,542.4,0,False,...,False,False,True,True,False,False,True,False,False,False
2,1,0,0,0,4,1,73.9,280.85,1,True,...,False,True,False,True,False,False,False,True,False,True
3,1,1,1,0,13,1,98.0,1237.85,1,True,...,False,True,False,True,False,False,False,True,False,True
4,0,1,1,0,3,1,83.9,267.4,1,True,...,False,False,True,True,False,False,False,True,False,True


In [333]:
telco_df.shape

(7043, 41)

In [334]:
telco_df = telco_df.drop(columns = ['online_backup_1', 'tech_support_1', 'online_security_1', 
                         'device_protection_1', 'multiple_lines_1', 'phone_service', 
                         'streaming_tv_1', 'internet_service_type_2', 'streaming_movies_1'
                                   ]) 

In [335]:
telco_df.shape

(7043, 32)

# Exploration and preprocessing

### splitting the dataset into train, validate and test subsets

In [336]:
train, val, test = train_val_test(telco_df, strat = 'churn')
train.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,churn,multiple_lines_0,multiple_lines_2,...,payment_type_0,payment_type_1,payment_type_2,payment_type_3,contract_type_0,contract_type_1,contract_type_2,internet_service_type_0,internet_service_type_1,paperless_billing_Yes
5609,1,0,0,0,14,76.45,1117.55,0,True,False,...,False,False,True,False,True,False,False,False,True,False
2209,1,0,0,0,5,70.0,347.4,1,True,False,...,False,False,False,True,False,True,False,True,False,True
6919,1,0,1,0,35,75.2,2576.2,1,False,True,...,False,False,True,False,True,False,False,False,True,True
2284,1,0,1,0,58,86.1,4890.5,0,False,True,...,False,False,True,False,False,False,True,True,False,True
845,0,0,0,0,2,49.6,114.7,1,True,False,...,False,False,False,True,True,False,False,True,False,True


In [337]:
#checking the number of columns and rows in each subset

train.shape, val.shape, test.shape

((4930, 32), (1056, 32), (1057, 32))

In [338]:
# creating an X & y version of train, val and test, where y is a series with just the target variable and X are all the features.

X_train = train.drop(columns = 'churn')
y_train = train.churn

X_val = val.drop(columns = 'churn')
y_val = val.churn

X_test = test.drop(columns = 'churn')
y_test = test.churn

In [339]:
# creating a baseline of the most occuring number in the survived column

(y_train == 0).mean()

0.734685598377282

Baseline accuracy is 73%

In [340]:
#verifying that x and y train have the same amount of rows

X_train.shape[:1], y_train.shape[:1]

((4930,), (4930,))

In [341]:
#verifying that x and y val have the same amount of rows

X_val.shape[:1], y_val.shape[:1]

((1056,), (1056,))

In [342]:
#verifying that x and y test have the same amount of rows

X_test.shape[:1], y_test.shape[:1]

((1057,), (1057,))

In [343]:
train.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,churn,multiple_lines_0,multiple_lines_2,...,payment_type_0,payment_type_1,payment_type_2,payment_type_3,contract_type_0,contract_type_1,contract_type_2,internet_service_type_0,internet_service_type_1,paperless_billing_Yes
5609,1,0,0,0,14,76.45,1117.55,0,True,False,...,False,False,True,False,True,False,False,False,True,False
2209,1,0,0,0,5,70.0,347.4,1,True,False,...,False,False,False,True,False,True,False,True,False,True
6919,1,0,1,0,35,75.2,2576.2,1,False,True,...,False,False,True,False,True,False,False,False,True,True
2284,1,0,1,0,58,86.1,4890.5,0,False,True,...,False,False,True,False,False,False,True,True,False,True
845,0,0,0,0,2,49.6,114.7,1,True,False,...,False,False,False,True,True,False,False,True,False,True


In [344]:
#transforming age and fare in both the X_train and X_val subsets using MinMaxScaler()

mms = MinMaxScaler()

X_train[['tenure', 'monthly_charges', 'total_charges']] = mms.fit_transform(X_train[['tenure', 'monthly_charges', 'total_charges']])
X_val[['tenure', 'monthly_charges', 'total_charges']] = mms.transform(X_val[['tenure', 'monthly_charges', 'total_charges']])
X_test[['tenure', 'monthly_charges', 'total_charges']] = mms.transform(X_test[['tenure', 'monthly_charges', 'total_charges']])

X_train.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,monthly_charges,total_charges,multiple_lines_0,multiple_lines_2,online_security_0,...,payment_type_0,payment_type_1,payment_type_2,payment_type_3,contract_type_0,contract_type_1,contract_type_2,internet_service_type_0,internet_service_type_1,paperless_billing_Yes
5609,1,0,0,0,0.194444,0.578475,0.128679,True,False,True,...,False,False,True,False,True,False,False,False,True,False
2209,1,0,0,0,0.069444,0.5142,0.040001,True,False,True,...,False,False,False,True,False,True,False,True,False,True
6919,1,0,1,0,0.486111,0.566019,0.296633,False,True,True,...,False,False,True,False,True,False,False,False,True,True
2284,1,0,1,0,0.805556,0.674639,0.56311,False,True,False,...,False,False,True,False,False,False,True,True,False,True
845,0,0,0,0,0.027778,0.310912,0.013207,True,False,True,...,False,False,False,True,True,False,False,True,False,True


### modeling

#### model 1

In [345]:
#creating a model and fitting it to our train set

seed = 42

rf = RandomForestClassifier(max_depth = 8, min_samples_leaf = 7, random_state = seed)

rf.fit(X_train, y_train)

In [346]:
#train model score

rf.score(X_train, y_train)

0.8300202839756592

In [347]:
#validate model score

rf.score(X_val, y_val)

0.8039772727272727

In [348]:
t_pred = rf.predict(X_train)
t_pred[:5]

array([0, 0, 1, 0, 0])

In [349]:
v_pred = rf.predict(X_val)
v_pred[:5]

array([0, 0, 0, 0, 1])

In [350]:
#checking the importance of each feature in the dataset before making predictions

rf.feature_importances_

array([0.00770559, 0.00831973, 0.00795943, 0.00755928, 0.15948442,
       0.07830179, 0.10454558, 0.00644501, 0.0078437 , 0.06704834,
       0.01240007, 0.02350327, 0.0082417 , 0.01982384, 0.00418228,
       0.07070717, 0.01245629, 0.00674605, 0.00690064, 0.00537752,
       0.00669755, 0.00519544, 0.00493247, 0.05239282, 0.00425189,
       0.13529772, 0.02686298, 0.04213504, 0.01943576, 0.05850772,
       0.01873888])

In [351]:
#making sure all my features add up to 1

np.array([0.00770559, 0.00831973, 0.00795943, 0.00755928, 0.15948442,
       0.07830179, 0.10454558, 0.00644501, 0.0078437 , 0.06704834,
       0.01240007, 0.02350327, 0.0082417 , 0.01982384, 0.00418228,
       0.07070717, 0.01245629, 0.00674605, 0.00690064, 0.00537752,
       0.00669755, 0.00519544, 0.00493247, 0.05239282, 0.00425189,
       0.13529772, 0.02686298, 0.04213504, 0.01943576, 0.05850772,
       0.01873888]).sum()

0.9999999700000001

In [352]:
#visualizing the important features in the dataset from most to least important

fi = pd.DataFrame({'feature' : X_train.columns,
              'importance' : rf.feature_importances_})

fi.sort_values(by = 'importance', ascending = False)

Unnamed: 0,feature,importance
4,tenure,0.159484
25,contract_type_0,0.135298
6,total_charges,0.104546
5,monthly_charges,0.078302
15,tech_support_0,0.070707
9,online_security_0,0.067048
29,internet_service_type_1,0.058508
23,payment_type_2,0.052393
27,contract_type_2,0.042135
26,contract_type_1,0.026863


In [353]:
seed = 42
train_acc = []
val_acc = []
depth = []
min_samples_leaf = []

for i in (reversed(range(5, 10))):

    for n in range(5, 10):

        rf = RandomForestClassifier(max_depth = i, min_samples_leaf = n, random_state = seed)
    
        rf.fit(X_train, y_train)
    
        train_acc.append(rf.score(X_train, y_train))
    
        val_acc.append(rf.score(X_val, y_val))
    
        depth.append(i)

        min_samples_leaf.append(n)

In [354]:
trees = pd.DataFrame({'max_depth' : depth, 'min_sam_leaf' : min_samples_leaf, 'train_acc' : train_acc, 'val_acc' : val_acc})
trees.sort_values(by = 'val_acc', ascending = False).head(10)

Unnamed: 0,max_depth,min_sam_leaf,train_acc,val_acc
5,8,5,0.830629,0.807765
1,9,6,0.838945,0.805871
0,9,5,0.839351,0.804924
8,8,8,0.828398,0.804924
10,7,5,0.824949,0.804924
2,9,7,0.836105,0.803977
13,7,8,0.823124,0.803977
7,8,7,0.83002,0.803977
14,7,9,0.823327,0.80303
9,8,9,0.827383,0.80303


In [None]:
seed = 42
train_acc = []
val_acc = []
depth = []

for i in range(2, 10):

        rf = RandomForestClassifier(max_depth = i, min_samples_leaf = 7, random_state = seed)
    
        rf.fit(X_train, y_train)
    
        train_acc.append(rf.score(X_train, y_train))
    
        val_acc.append(rf.score(X_val, y_val))
    
        depth.append(i)

        min_samples_leaf.append(7)

In [None]:
trees = pd.DataFrame({'max_depth' : depth, 'train_acc' : train_acc, 'val_acc' : val_acc})
trees.sort_values(by = 'val_acc', ascending = False)

In [None]:
plt.plot(trees.max_depth, trees.train_acc, label = 'Train')
plt.plot(trees.max_depth, trees.val_acc, label = 'Validate')
plt.ylabel('Accuracy')
plt.xlabel('Max depth')
plt.title('Accuracy as max depth increases')
plt.legend()
plt.show()

### Model score

In [None]:
rf.score(X_train, y_train)

### Confusion matrix

In [None]:
confusion_matrix(y_train, t_pred)

### Classification report

In [None]:
print(classification_report(y_train, t_pred))

### Train metrics

In [None]:
theometrics(3341, 751, 557, 281)

### Validate metrics

In [None]:
confusion_matrix(y_val, v_pred)

In [None]:
theometrics(703, 154, 126, 73)

#### model 2

In [None]:
#defining my train and val subsets

X_train = train.drop(columns = ['churn']).iloc[:, :7]
y_train = train.churn

X_val = val.drop(columns = ['churn']).iloc[:, :7]
y_val = val.churn

In [None]:
#transforming age and fare in both the X_train and X_val subsets using MinMaxScaler()

mms = MinMaxScaler()

X_train[['tenure', 'monthly_charges', 'total_charges']] = mms.fit_transform(X_train[['tenure', 'monthly_charges', 'total_charges']])
X_val[['tenure', 'monthly_charges', 'total_charges']] = mms.transform(X_val[['tenure', 'monthly_charges', 'total_charges']])
X_test[['tenure', 'monthly_charges', 'total_charges']] = mms.transform(X_test[['tenure', 'monthly_charges', 'total_charges']])

X_train.head()

In [None]:
#defining the baseline

(y_train == 0).mean()

In [None]:

knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_train)
v_pred = knn.predict(X_val)

In [None]:
#train model score

knn.score(X_train, y_train)

In [None]:
#validate model score

knn.score(X_val, y_val)

### confusion matrix

In [None]:
confusion_matrix(y_train, y_pred)

### classification report

In [None]:
print(classification_report(y_train, y_pred))

### train metrics

In [None]:
theometrics(3339, 770, 538, 283)

### validate metrics

In [None]:
confusion_matrix(y_val, v_pred)

In [None]:
theometrics(675, 131, 149, 101)