In [20]:
import pandas as pd
import sklearn.metrics
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

In [21]:
results= [['name','accuracy','recall','prec','f1','mean']]
def evaluate_model(model,name,X_test,y_test,dec_number):
    yhat = model.predict(X_test)

    acc = round(sklearn.metrics.accuracy_score(y_test,yhat),dec_number)
    recall = round(sklearn.metrics.recall_score(y_test,yhat,pos_label='Yes'),dec_number)
    prec = round(sklearn.metrics.precision_score(y_test,yhat,pos_label='Yes'),dec_number)
    f1 = round(sklearn.metrics.f1_score(y_test,yhat,pos_label='Yes'),dec_number)
    mean = round((acc+recall+prec+f1)/4,dec_number)

    results.append([name,acc,recall,prec,f1,mean])
    
    
def print_results(results):
    # Print each row
    for row in results:
        row_str = " | ".join(str(element) for element in row)
        print(row_str)

In [22]:
df = pd.read_csv('../clean_data.csv')

numerical_columns = df.select_dtypes(include=['float64','int']).columns

x = df[numerical_columns]
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

ros = RandomOverSampler(random_state=42)
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

# Display class distribution after oversampling
print("\nClass distribution after oversampling:")
print(pd.Series(y_oversampled).value_counts())

# Apply RandomUnderSampler to address class imbalance
rus = RandomUnderSampler(random_state=42)
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)

# Display class distribution after undersampling
print("\nClass distribution after undersampling:")
print(pd.Series(y_undersampled).value_counts())


Class distribution after oversampling:
churn
No     3281
Yes    3281
Name: count, dtype: int64

Class distribution after undersampling:
churn
No     509
Yes    509
Name: count, dtype: int64


In [23]:
svm = SVC(kernel='rbf', C=10, gamma=1, degree=2, probability=True)
nn_accuracy = MLPClassifier(activation= 'relu', alpha= 0.001, batch_size= 32, hidden_layer_sizes= (50, 50), learning_rate= 'adaptive', max_iter= 9999999999, solver= 'adam') 
rf_model = RandomForestClassifier(bootstrap=False, max_depth=30, max_features=1, min_samples_split=5, n_estimators=200)

# Model combination

Now we will analyze if combining the results of models can give us any improvement in performence

In [24]:
models = [('svm',svm),('nn',nn_accuracy),('rf',rf_model)]

hard_voting_clf = VotingClassifier(models, voting='hard')

soft_voting_clf = VotingClassifier(models, voting='soft')

In [25]:
hard_voting_clf.fit(X_oversampled,y_oversampled)

evaluate_model(hard_voting_clf,'hard_voting_clf',X_test,y_test,3)


In [26]:
soft_voting_clf.fit(X_oversampled,y_oversampled)
evaluate_model(soft_voting_clf,'soft_voting_clf',X_test,y_test,3)



In [27]:
print_results(results)

name | accuracy | recall | prec | f1 | mean
hard_voting_clf | 0.944 | 0.723 | 0.868 | 0.789 | 0.831
soft_voting_clf | 0.941 | 0.73 | 0.84 | 0.781 | 0.823


Analyzing the results we can see that combining the results of our best performing models does improve the our results

|name | accuracy | recall | prec | f1 | mean|
|----|-----|----|----|----|----|
|svm | 0.926 | 0.583 | 0.813 | 0.679 | 0.75|
|nn_accuracy | 0.925 | 0.685 | 0.737 | 0.71 | 0.764|
|rf_model | 0.912 | 0.354 | 0.978 | 0.52 | 0.691|
|hard_voting_clf | 0.944 | 0.723 | 0.868 | 0.789 | 0.831|
|soft_voting_clf | 0.941 | 0.73 | 0.84 | 0.781 | 0.823|


As we can see there is an improvement of over 6% from our previously best model and every metric improved