In [10]:
#Importing all the libraries that will be used
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import sklearn as sk
import warnings
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, confusion_matrix, ConfusionMatrixDisplay, auc
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from scipy import stats

warnings.filterwarnings('ignore')
sns.set_palette("husl")

In [3]:
#Reading the dataset
data = pd.read_csv(r'C:\Users\Samu\Documents\Ironhack\lab-imbalanced-data\files_for_lab\customer_churn.csv')

In [4]:
#Creating the target and independent variables
X = data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
#Training the model
classification = LogisticRegression(random_state=42, max_iter=10000)
classification.fit(X_train, y_train)

In [6]:
#Predicting the model
predictions = classification.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.82      0.90      0.86      1035
         Yes       0.63      0.45      0.53       374

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.69      1409
weighted avg       0.77      0.78      0.77      1409



In [19]:
y.value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

We can see that the model is biased towards No, as we have more No than Yes, as we can see in the value_counts just above.

In [18]:
#Applying SMOTE to the dataset
X = data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

smote = SMOTE()
X_train, y_train = smote.fit_resample(X_train, y_train)
classification = LogisticRegression(random_state=42, max_iter=100000)
classification.fit(X_train, y_train)

predictions = classification.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.88      0.72      0.79      1035
         Yes       0.48      0.72      0.58       374

    accuracy                           0.72      1409
   macro avg       0.68      0.72      0.68      1409
weighted avg       0.77      0.72      0.73      1409



As SMOTE evened the number of Yes and No, the recall is similar, but we have lost precision in the Yes section and a little bit of accuracy. 

In [21]:
#Applying TomekLinks to the independent variables

X = data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
tomek = TomekLinks()
X_train, y_train = tomek.fit_resample(X, y)

classification = LogisticRegression(random_state=42, max_iter=100000)
classification.fit(X_train, y_train)

predictions = classification.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.83      0.87      0.85      1035
         Yes       0.58      0.51      0.54       374

    accuracy                           0.77      1409
   macro avg       0.70      0.69      0.70      1409
weighted avg       0.76      0.77      0.77      1409



With TomekLinks, we see again a better result in the recall of Yes, but not so important as in SMOTE, for the rest, it remained almost the same as the first time that we ran the model.