In [33]:
import pandas as pd
import numpy as np
import sklearn.metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import VotingClassifier

In [34]:
df = pd.read_csv('./../clean_data.csv')

In [35]:
labels = df.columns[1:]
x=df.loc[:,labels]
y=df.loc[:,'churn']

xtrain, xtest, ytrain, ytest = train_test_split(x,y,train_size=0.8)

In [36]:
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(xtrain, ytrain)

# Display class distribution after oversampling
print("\nClass distribution after oversampling:")
print(pd.Series(y_resampled).value_counts())

# Apply RandomUnderSampler to address class imbalance
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(xtrain, ytrain)

# Display class distribution after undersampling
print("\nClass distribution after undersampling:")
print(pd.Series(y_resampled).value_counts())


Class distribution after oversampling:
churn
No     3277
Yes    3277
Name: count, dtype: int64

Class distribution after undersampling:
churn
No     513
Yes    513
Name: count, dtype: int64


In [42]:
results= [['name','accuracy','recall','prec','f1']]
def evaluate_model(model,name,X_test,y_test,dec_number):
    yhat = model.predict(X_test)

    acc = round(sklearn.metrics.accuracy_score(y_test,yhat),dec_number)
    recall = round(sklearn.metrics.recall_score(y_test,yhat,pos_label='Yes'),dec_number)
    prec = round(sklearn.metrics.precision_score(y_test,yhat,pos_label='Yes'),dec_number)
    f1 = round(sklearn.metrics.f1_score(y_test,yhat,pos_label='Yes'),dec_number)

    results.append([name,acc,recall,prec,f1])
    
    
def print_results(results):
    # Print each row
    for row in results:
        row_str = " | ".join(str(element) for element in row)
        print(row_str)

In [44]:
params = {
    'n_neighbors': [3,5,7,9,11,13],
    'weights': ['uniform', 'distance'],
    'algorithm' : ['ball_tree', 'kd_tree', 'brute'],
    'p' : [1,2,3,4,5]
    }


clf_accuracy = GridSearchCV(estimator=KNeighborsClassifier(),param_grid=params,cv=5,n_jobs=10,verbose=1,scoring='accuracy')
clf_accuracy.fit(xtrain,ytrain)

print(clf_accuracy.best_params_)
model_accuracy = KNeighborsClassifier(**clf_accuracy.best_params_) 

f1 = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average='micro')

clf_f1 = GridSearchCV(estimator=KNeighborsClassifier(),param_grid=params,cv=5,n_jobs=5,verbose=1,scoring=f1)
clf_f1.fit(xtrain,ytrain)

print(clf_f1.best_params_)
modelf1 = KNeighborsClassifier(n_neighbors=clf_f1.best_params_['n_neighbors'], weights=clf_f1.best_params_['weights'], algorithm=clf_f1.best_params_['algorithm'], p=clf_f1.best_params_['p']) 

combi_model = VotingClassifier(estimators=[('acc', model_accuracy), ('f1', modelf1)], voting='soft')


Fitting 5 folds for each of 180 candidates, totalling 900 fits
{'algorithm': 'ball_tree', 'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
Fitting 5 folds for each of 180 candidates, totalling 900 fits
{'algorithm': 'ball_tree', 'n_neighbors': 7, 'p': 1, 'weights': 'distance'}


In [39]:
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(model_accuracy, x, y, cv=cv_strategy, scoring='accuracy')

print("Cross-Validation Scores:", cross_val_scores)
print("Mean Accuracy:", cross_val_scores.mean())
print("Standard Deviation:", cross_val_scores.std())

Cross-Validation Scores: [0.89556962 0.90400844 0.90506329 0.89545935 0.90813094]
Mean Accuracy: 0.9016463270643694
Standard Deviation: 0.005186723954450055


In [40]:
model_accuracy.fit(X_resampled,y_resampled)

yhat = model_accuracy.predict(xtest)

acc = sklearn.metrics.accuracy_score(ytest,yhat)
recall = sklearn.metrics.recall_score(ytest,yhat,pos_label='Yes')
prec = sklearn.metrics.precision_score(ytest,yhat,pos_label='Yes')
f1 = sklearn.metrics.f1_score(ytest,yhat,pos_label='Yes')
conf_matrix =  sklearn.metrics.confusion_matrix(ytest,yhat)
print('balanced model')
print(f"accuracy: {acc}")
print(f"recall: {recall}")
print(f"precision: {prec}")
print(f"f1: {f1}")
print(f"confusion matrix:\n{conf_matrix}\n")

modelf1.fit(X_resampled,y_resampled)

yhat = modelf1.predict(xtest)

acc = sklearn.metrics.accuracy_score(ytest,yhat)
recall = sklearn.metrics.recall_score(ytest,yhat,pos_label='Yes')
prec = sklearn.metrics.precision_score(ytest,yhat,pos_label='Yes')
f1 = sklearn.metrics.f1_score(ytest,yhat,pos_label='Yes')
conf_matrix =  sklearn.metrics.confusion_matrix(ytest,yhat)
print('f1 model')
print(f"accuracy: {acc}")
print(f"recall: {recall}")
print(f"precision: {prec}")
print(f"f1: {f1}")
print(f"confusion matrix:\n{conf_matrix}\n")

combi_model.fit(X_resampled,y_resampled)

yhat = combi_model.predict(xtest)

acc = sklearn.metrics.accuracy_score(ytest,yhat)
recall = sklearn.metrics.recall_score(ytest,yhat,pos_label='Yes')
prec = sklearn.metrics.precision_score(ytest,yhat,pos_label='Yes')
f1 = sklearn.metrics.f1_score(ytest,yhat,pos_label='Yes')
conf_matrix =  sklearn.metrics.confusion_matrix(ytest,yhat)
print('combination model')
print(f"accuracy: {acc}")
print(f"recall: {recall}")
print(f"precision: {prec}")
print(f"f1: {f1}")
print(f"confusion matrix:\n{conf_matrix}\n")

balanced model
accuracy: 0.8185654008438819
recall: 0.7819548872180451
precision: 0.42105263157894735
f1: 0.5473684210526316
confusion matrix:
[[672 143]
 [ 29 104]]

f1 model
accuracy: 0.8185654008438819
recall: 0.7819548872180451
precision: 0.42105263157894735
f1: 0.5473684210526316
confusion matrix:
[[672 143]
 [ 29 104]]

combination model
accuracy: 0.8185654008438819
recall: 0.7819548872180451
precision: 0.42105263157894735
f1: 0.5473684210526316
confusion matrix:
[[672 143]
 [ 29 104]]



In [43]:
evaluate_model(model_accuracy,'model',xtest,ytest,3)
print_results(results)

name | accuracy | recall | prec | f1
model | 0.819 | 0.782 | 0.421 | 0.547
