In [21]:
import pandas as pd
import sklearn.metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [22]:
df = pd.read_csv('./../clean_data.csv')

In [23]:
labels = df.columns[1:]
x=df.loc[:,labels]
y=df.loc[:,'churn']

xtrain, xtest, ytrain, ytest = train_test_split(x,y,train_size=0.8)

In [24]:
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(xtrain, ytrain)

# Display class distribution after oversampling
print("\nClass distribution after oversampling:")
print(pd.Series(y_resampled).value_counts())

# Apply RandomUnderSampler to address class imbalance
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(xtrain, ytrain)

# Display class distribution after undersampling
print("\nClass distribution after undersampling:")
print(pd.Series(y_resampled).value_counts())


Class distribution after oversampling:
No     3276
Yes    3276
Name: churn, dtype: int64

Class distribution after undersampling:
No     514
Yes    514
Name: churn, dtype: int64


In [25]:


model = KNeighborsClassifier()
params = {
    'n_neighbors': [3,5,7,9,11,13],
    'weights': ['uniform', 'distance'],
    'algorithm' : ['ball_tree', 'kd_tree', 'brute'],
    'p' : [1,2,3,4,5]
    }

f1 = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average='micro')

clf = GridSearchCV(estimator=model,param_grid=params,cv=5,n_jobs=5,verbose=1,scoring=f1)
clf.fit(xtrain,ytrain)


Fitting 5 folds for each of 180 candidates, totalling 900 fits


In [26]:
print(clf.best_params_)
model1 = KNeighborsClassifier(n_neighbors=clf.best_params_['n_neighbors'], weights=clf.best_params_['weights'], algorithm=clf.best_params_['algorithm'], p=clf.best_params_['p']) 

{'algorithm': 'ball_tree', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}


In [30]:
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(model, x, y, cv=cv_strategy, scoring='f1_micro')

print("Cross-Validation Scores:", cross_val_scores)
print("Mean Accuracy:", cross_val_scores.mean())
print("Standard Deviation:", cross_val_scores.std())

Cross-Validation Scores: [0.89240506 0.89767932 0.89873418 0.88067582 0.89968321]
Mean Accuracy: 0.8938355187823863
Standard Deviation: 0.007045439827389475


In [28]:
model1.fit(X_resampled,y_resampled)

yhat = model1.predict(xtest)

acc = sklearn.metrics.accuracy_score(ytest,yhat)
recall = sklearn.metrics.recall_score(ytest,yhat,pos_label='Yes')
prec = sklearn.metrics.precision_score(ytest,yhat,pos_label='Yes')
f1 = sklearn.metrics.f1_score(ytest,yhat,pos_label='Yes')
conf_matrix =  sklearn.metrics.confusion_matrix(ytest,yhat)
print(acc)
print(recall)
print(prec)
print(f1)
print(conf_matrix)

0.8333333333333334
0.7727272727272727
0.4434782608695652
0.56353591160221
[[688 128]
 [ 30 102]]
