Let us import all the libraries required for this project

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score

Matplotlib is building the font cache; this may take a moment.


Let us get the dataset. This time I will load it directly from a URL instead of saving a local copy.

In [None]:
# Load the Telco Customer Churn dataset straight from its raw GitHub URL
# (requires network access when the cell is run).
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/telco-customer-churn/telco_customer_churn.csv'
df = pd.read_csv(url)

# Display the first rows of the data (head() shows five rows by default)
df.head()

In the next step we will check the missing values


In [None]:
# Count of missing (NaN/None) entries in each column
df.isna().sum()

If there are any missing values, this line of code will remove the rows with the missing values

In [None]:
# Drop every row that has at least one missing value. Rebinding the name
# (rather than mutating with inplace=True) keeps the cell idempotent and
# chain-friendly.
df = df.dropna()

Now we convert the categorical variables to numerical values using one-hot encoding

In [None]:
# NOTE(review): the column names below assume the standard Telco churn schema
# ('customerID', 'TotalCharges') - confirm against df.columns.

# A per-customer identifier carries no predictive signal, and one-hot encoding
# it would create roughly one indicator column per row. Drop it if present.
df = df.drop(columns=['customerID'], errors='ignore')

# If 'TotalCharges' was read as text (e.g. because of blank entries),
# get_dummies would treat every distinct amount as its own category.
# Convert it to numbers; values that cannot be parsed become NaN and those
# rows are removed so the models downstream never see missing values.
if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df = df.dropna(subset=['TotalCharges'])

# One-hot encode the remaining categorical columns. drop_first=True drops one
# level per feature to avoid perfectly collinear indicators; the binary target
# 'Churn' therefore becomes the single column 'Churn_Yes'.
df = pd.get_dummies(df, drop_first=True)

Let's split the dataset into training and testing sets

In [None]:
# Target: the one-hot encoded churn indicator; features: everything else.
y = df['Churn_Yes']
X = df.drop(columns=['Churn_Yes'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Let's get a basic feel for the data using exploratory data analysis

In [None]:
# Class balance of the target. After one-hot encoding the original 'Churn'
# column no longer exists; the target is the 'Churn_Yes' indicator.
sns.countplot(x='Churn_Yes', data=df)
plt.title('Churn class balance')
plt.show()

# Distribution of numerical features
numeric_df = df.select_dtypes(include='number')
numeric_df.hist(bins=30, figsize=(12, 8))
plt.show()

# Correlation between the numerical features and the target. Restricting the
# heatmap to these columns keeps the annotated cells readable.
corr = numeric_df.assign(Churn_Yes=df['Churn_Yes'].astype(int)).corr()
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation matrix')
plt.show()

Here we build the Logistic Regression model

In [None]:
# Baseline classifier: logistic regression trained on the training split only.
# max_iter=1000 gives the solver extra iterations to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)

We will use the model to make predictions and evaluate performance

In [None]:
# Hard class labels for the held-out test rows
y_pred = model.predict(X_test)

# predict_proba returns one column per class; the second column is kept as the
# score for the positive (churn) class.
test_probabilities = model.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

We evaluate the model using metrics and visualization methods

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Classification report: per-class precision, recall and F1
cr = classification_report(y_test, y_pred)
print("Classification Report:\n", cr)

# ROC curve, computed from the predicted probabilities (not the hard labels)
# so that every decision threshold is represented.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, marker='.')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

# AUC score
auc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score:{auc}")  # a higher AUC score indicates better model performance

Potential Improvements


# Feature engineering: bucket tenure into 12-unit bands.
# pd.cut intervals are left-open by default, so without include_lowest=True a
# tenure of exactly 0 would fall outside the first bin and become NaN.
# Values above 72 are still outside every bin and map to NaN.
# NOTE(review): X / X_train were built before this column exists; rebuild the
# feature matrix and the split if the bucket should be used by a model.
df['tenure_bucket'] = pd.cut(
    df['tenure'],
    bins=[0, 12, 24, 36, 48, 60, 72],
    labels=False,
    include_lowest=True,
)

In [None]:
# Alternative classifier: a random forest trained on the same training split,
# then used to label the held-out test rows.
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)



In [18]:
# Hyperparameter tuning with an exhaustive grid search
from sklearn.model_selection import GridSearchCV

# Candidate values: C is the inverse regularisation strength; both solvers
# work with LogisticRegression's default penalty.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear'],
}

# 5-fold cross-validation on the training split, ranking candidates by ROC AUC
grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid,
    cv=5,
    scoring='roc_auc',
)
grid_search.fit(X_train, y_train)

# Estimator refit on the whole training split with the best parameter combination
best_model = grid_search.best_estimator_

We will handle any class imbalance using SMOTE (Synthetic Minority Over-sampling Technique)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class on the training split only, so the test split
# keeps the original class distribution.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Refit the logistic regression on the balanced training data.
# Note: this replaces the previously fitted state of `model`.
model.fit(X_resampled, y_resampled)

We will use cross-validation to ensure that the model's performance is consistent across different subsets of the data

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated ROC AUC of the model over the full feature matrix
cv_scores = cross_val_score(estimator=model, X=X, y=y, scoring='roc_auc', cv=5)
print(f'Cross-validated AUC scores:{cv_scores}')
print(f"Mean cross-validated AUC score: {cv_scores.mean()}")