In [1]:
import pandas as pd
import sklearn.metrics
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import VotingClassifier

In [2]:
df = pd.read_csv('./../clean_data.csv')

In [3]:
labels = df.columns[1:]
x=df.loc[:,labels]
y=df.loc[:,'churn']

xtrain, xtest, ytrain, ytest = train_test_split(x,y,train_size=0.8)

In [4]:
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(xtrain, ytrain)

# Display class distribution after oversampling
print("\nClass distribution after oversampling:")
print(pd.Series(y_resampled).value_counts())

# Apply RandomUnderSampler to address class imbalance
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(xtrain, ytrain)

# Display class distribution after undersampling
print("\nClass distribution after undersampling:")
print(pd.Series(y_resampled).value_counts())


Class distribution after oversampling:
churn
No     3281
Yes    3281
Name: count, dtype: int64

Class distribution after undersampling:
churn
No     509
Yes    509
Name: count, dtype: int64


In [5]:
params = {
    'hidden_layer_sizes': [(100,), (50, 50), (25, 25, 25)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd', 'lbfgs'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.0001, 0.001, 0.01],
    'batch_size': [16, 32, 64],
    'max_iter': [9999999999]
    }

clf_accuracy = GridSearchCV(estimator=MLPClassifier(),param_grid=params,cv=5,n_jobs=5,verbose=1,scoring='accuracy')
clf_accuracy.fit(xtrain,ytrain)

print(clf_accuracy.best_params_)
model_accuracy = MLPClassifier(**clf_accuracy.best_params_) 

f1 = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average='micro')

clf_f1 = GridSearchCV(estimator=MLPClassifier(),param_grid=params,cv=5,n_jobs=5,verbose=1,scoring=f1)
clf_f1.fit(xtrain,ytrain)

print(clf_f1.best_params_)
modelf1 = MLPClassifier(**clf_f1.best_params_) 

combi_model = VotingClassifier(estimators=[('acc', model_accuracy), ('f1', modelf1)], voting='soft')

Fitting 5 folds for each of 486 candidates, totalling 2430 fits
{'activation': 'relu', 'alpha': 0.0001, 'batch_size': 16, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'max_iter': 9999999999, 'solver': 'adam'}
Fitting 5 folds for each of 486 candidates, totalling 2430 fits
{'activation': 'relu', 'alpha': 0.0001, 'batch_size': 16, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'max_iter': 9999999999, 'solver': 'adam'}


In [6]:
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(model_accuracy, x, y, cv=cv_strategy, scoring='accuracy')

print("Cross-Validation Scores:", cross_val_scores)
print("Mean Accuracy:", cross_val_scores.mean())
print("Standard Deviation:", cross_val_scores.std())

Cross-Validation Scores: [0.95253165 0.96202532 0.95991561 0.94931362 0.94825766]
Mean Accuracy: 0.9544087703117551
Standard Deviation: 0.005579559530460228


In [7]:
model_accuracy.fit(X_resampled,y_resampled)

yhat = model_accuracy.predict(xtest)

acc = sklearn.metrics.accuracy_score(ytest,yhat)
recall = sklearn.metrics.recall_score(ytest,yhat,pos_label='Yes')
prec = sklearn.metrics.precision_score(ytest,yhat,pos_label='Yes')
f1 = sklearn.metrics.f1_score(ytest,yhat,pos_label='Yes')
conf_matrix =  sklearn.metrics.confusion_matrix(ytest,yhat)
print('balanced model')
print(f"accuracy: {acc}")
print(f"recall: {recall}")
print(f"precision: {prec}")
print(f"f1: {f1}")
print(f"confusion matrix:\n{conf_matrix}\n")

modelf1.fit(X_resampled,y_resampled)

yhat = modelf1.predict(xtest)

acc = sklearn.metrics.accuracy_score(ytest,yhat)
recall = sklearn.metrics.recall_score(ytest,yhat,pos_label='Yes')
prec = sklearn.metrics.precision_score(ytest,yhat,pos_label='Yes')
f1 = sklearn.metrics.f1_score(ytest,yhat,pos_label='Yes')
conf_matrix =  sklearn.metrics.confusion_matrix(ytest,yhat)
print('f1 model')
print(f"accuracy: {acc}")
print(f"recall: {recall}")
print(f"precision: {prec}")
print(f"f1: {f1}")
print(f"confusion matrix:\n{conf_matrix}\n")

combi_model.fit(X_resampled,y_resampled)

yhat = combi_model.predict(xtest)

acc = sklearn.metrics.accuracy_score(ytest,yhat)
recall = sklearn.metrics.recall_score(ytest,yhat,pos_label='Yes')
prec = sklearn.metrics.precision_score(ytest,yhat,pos_label='Yes')
f1 = sklearn.metrics.f1_score(ytest,yhat,pos_label='Yes')
conf_matrix =  sklearn.metrics.confusion_matrix(ytest,yhat)
print('combination model')
print(f"accuracy: {acc}")
print(f"recall: {recall}")
print(f"precision: {prec}")
print(f"f1: {f1}")
print(f"confusion matrix:\n{conf_matrix}\n")

balanced model
accuracy: 0.8829113924050633
recall: 0.781021897810219
precision: 0.5691489361702128
f1: 0.6584615384615385
confusion matrix:
[[730  81]
 [ 30 107]]

f1 model
accuracy: 0.8248945147679325
recall: 0.8248175182481752
precision: 0.44313725490196076
f1: 0.576530612244898
confusion matrix:
[[669 142]
 [ 24 113]]

combination model
accuracy: 0.8649789029535865
recall: 0.8029197080291971
precision: 0.5213270142180095
f1: 0.6321839080459771
confusion matrix:
[[710 101]
 [ 27 110]]

