In [38]:
import pandas as pd
import sklearn.metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import VotingClassifier

In [39]:
df = pd.read_csv('./../clean_data.csv')

In [40]:
labels = df.columns[1:]
x=df.loc[:,labels]
y=df.loc[:,'churn']

xtrain, xtest, ytrain, ytest = train_test_split(x,y,train_size=0.8)

In [46]:
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(xtrain, ytrain)

# Display class distribution after oversampling
print("\nClass distribution after oversampling:")
print(pd.Series(y_resampled).value_counts())

# Apply RandomUnderSampler to address class imbalance
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(xtrain, ytrain)

# Display class distribution after undersampling
print("\nClass distribution after undersampling:")
print(pd.Series(y_resampled).value_counts())


Class distribution after oversampling:
No     3282
Yes    3282
Name: churn, dtype: int64

Class distribution after undersampling:
No     3282
Yes    3282
Name: churn, dtype: int64


In [42]:


model = KNeighborsClassifier()
params = {
    'n_neighbors': [3,5,7,9,11,13],
    'weights': ['uniform', 'distance'],
    'algorithm' : ['ball_tree', 'kd_tree', 'brute'],
    'p' : [1,2,3,4,5]
    }


clf = GridSearchCV(estimator=model,param_grid=params,cv=5,n_jobs=5,verbose=1)
clf.fit(xtrain,ytrain)

print(clf.best_params_)
model = KNeighborsClassifier(n_neighbors=clf.best_params_['n_neighbors'], weights=clf.best_params_['weights'], algorithm=clf.best_params_['algorithm'], p=clf.best_params_['p']) 

f1 = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average='micro')

clf = GridSearchCV(estimator=model,param_grid=params,cv=5,n_jobs=5,verbose=1,scoring=f1)
clf.fit(xtrain,ytrain)

print(clf.best_params_)
modelf1 = KNeighborsClassifier(n_neighbors=clf.best_params_['n_neighbors'], weights=clf.best_params_['weights'], algorithm=clf.best_params_['algorithm'], p=clf.best_params_['p']) 

combi_model = VotingClassifier(estimators=[('acc', model), ('f1', modelf1)], voting='soft')


Fitting 5 folds for each of 180 candidates, totalling 900 fits
{'algorithm': 'ball_tree', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
Fitting 5 folds for each of 180 candidates, totalling 900 fits
{'algorithm': 'ball_tree', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}


In [43]:
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(model, x, y, cv=cv_strategy, scoring='accuracy')

print("Cross-Validation Scores:", cross_val_scores)
print("Mean Accuracy:", cross_val_scores.mean())
print("Standard Deviation:", cross_val_scores.std())

Cross-Validation Scores: [0.89978903 0.9092827  0.907173   0.88806758 0.89757128]
Mean Accuracy: 0.9003767170589783
Standard Deviation: 0.007552967900650159


In [47]:
model.fit(X_resampled,y_resampled)

yhat = model.predict(xtest)

acc = sklearn.metrics.accuracy_score(ytest,yhat)
recall = sklearn.metrics.recall_score(ytest,yhat,pos_label='Yes')
prec = sklearn.metrics.precision_score(ytest,yhat,pos_label='Yes')
f1 = sklearn.metrics.f1_score(ytest,yhat,pos_label='Yes')
conf_matrix =  sklearn.metrics.confusion_matrix(ytest,yhat)
print('balanced model')
print(f"accuracy: {acc}")
print(f"recall: {recall}")
print(f"precision: {prec}")
print(f"f1: {f1}")
print(f"confusion matrix:\n{conf_matrix}\n")

modelf1.fit(X_resampled,y_resampled)

yhat = modelf1.predict(xtest)

acc = sklearn.metrics.accuracy_score(ytest,yhat)
recall = sklearn.metrics.recall_score(ytest,yhat,pos_label='Yes')
prec = sklearn.metrics.precision_score(ytest,yhat,pos_label='Yes')
f1 = sklearn.metrics.f1_score(ytest,yhat,pos_label='Yes')
conf_matrix =  sklearn.metrics.confusion_matrix(ytest,yhat)
print('f1 model')
print(f"accuracy: {acc}")
print(f"recall: {recall}")
print(f"precision: {prec}")
print(f"f1: {f1}")
print(f"confusion matrix:\n{conf_matrix}\n")

combi_model.fit(X_resampled,y_resampled)

yhat = modelf1.predict(xtest)

acc = sklearn.metrics.accuracy_score(ytest,yhat)
recall = sklearn.metrics.recall_score(ytest,yhat,pos_label='Yes')
prec = sklearn.metrics.precision_score(ytest,yhat,pos_label='Yes')
f1 = sklearn.metrics.f1_score(ytest,yhat,pos_label='Yes')
conf_matrix =  sklearn.metrics.confusion_matrix(ytest,yhat)
print('combination model')
print(f"accuracy: {acc}")
print(f"recall: {recall}")
print(f"precision: {prec}")
print(f"f1: {f1}")
print(f"confusion matrix:\n{conf_matrix}\n")

balanced model
accuracy: 0.8354430379746836
recall: 0.7246376811594203
precision: 0.45871559633027525
f1: 0.5617977528089888
confusion matrix:
[[692 118]
 [ 38 100]]

f1 model
accuracy: 0.8354430379746836
recall: 0.7246376811594203
precision: 0.45871559633027525
f1: 0.5617977528089888
confusion matrix:
[[692 118]
 [ 38 100]]

combination model
accuracy: 0.8354430379746836
recall: 0.7246376811594203
precision: 0.45871559633027525
f1: 0.5617977528089888
confusion matrix:
[[692 118]
 [ 38 100]]

