In [13]:
import pandas as pd
import sklearn.metrics
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

In [14]:
df = pd.read_csv('../clean_data.csv')

numerical_columns = df.select_dtypes(include=['float64','int']).columns

x = df[numerical_columns]
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

ros = RandomOverSampler(random_state=42)
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

# Display class distribution after oversampling
print("\nClass distribution after oversampling:")
print(pd.Series(y_oversampled).value_counts())

# Apply RandomUnderSampler to address class imbalance
rus = RandomUnderSampler(random_state=42)
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)

# Display class distribution after undersampling
print("\nClass distribution after undersampling:")
print(pd.Series(y_undersampled).value_counts())


Class distribution after oversampling:
churn
No     3281
Yes    3281
Name: count, dtype: int64

Class distribution after undersampling:
churn
No     509
Yes    509
Name: count, dtype: int64


In [15]:
svm = SVC(kernel='rbf', C=10, gamma=1, degree=2, probability=True)
nn_accuracy = MLPClassifier(activation= 'relu', alpha= 0.001, batch_size= 32, hidden_layer_sizes= (50, 50), learning_rate= 'adaptive', max_iter= 9999999999, solver= 'adam') 
rf_model = RandomForestClassifier(bootstrap=False, max_depth=30, max_features=1, min_samples_split=5, n_estimators=200)

models = [('svm',svm),('nn',nn_accuracy),('rf',rf_model)]

In [16]:
hard_voting_clf = VotingClassifier(models, voting='hard')

soft_voting_clf = VotingClassifier(models, voting='soft')

In [17]:
hard_voting_clf.fit(X_oversampled,y_oversampled)

yhat = hard_voting_clf.predict(X_test)

acc = sklearn.metrics.accuracy_score(y_test,yhat)
recall = sklearn.metrics.recall_score(y_test,yhat,pos_label='Yes')
prec = sklearn.metrics.precision_score(y_test,yhat,pos_label='Yes')
f1 = sklearn.metrics.f1_score(y_test,yhat,pos_label='Yes')
conf_matrix =  sklearn.metrics.confusion_matrix(y_test,yhat)
print('hard')
print(f"accuracy: {acc}")
print(f"recall: {recall}")
print(f"precision: {prec}")
print(f"f1: {f1}")
print(f"confusion matrix:\n{conf_matrix}\n")

hard
accuracy: 0.9430379746835443
recall: 0.7153284671532847
precision: 0.8672566371681416
f1: 0.7839999999999999
confusion matrix:
[[796  15]
 [ 39  98]]



In [18]:
soft_voting_clf.fit(X_oversampled,y_oversampled)

yhat = soft_voting_clf.predict(X_test)

acc = sklearn.metrics.accuracy_score(y_test,yhat)
recall = sklearn.metrics.recall_score(y_test,yhat,pos_label='Yes')
prec = sklearn.metrics.precision_score(y_test,yhat,pos_label='Yes')
f1 = sklearn.metrics.f1_score(y_test,yhat,pos_label='Yes')
conf_matrix =  sklearn.metrics.confusion_matrix(y_test,yhat)
print('soft')
print(f"accuracy: {acc}")
print(f"recall: {recall}")
print(f"precision: {prec}")
print(f"f1: {f1}")
print(f"confusion matrix:\n{conf_matrix}\n")

soft
accuracy: 0.939873417721519
recall: 0.7299270072992701
precision: 0.8333333333333334
f1: 0.7782101167315175
confusion matrix:
[[791  20]
 [ 37 100]]

