In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE



In [6]:
# Load dataset
data = pd.read_csv("/Users/mac/Desktop/Africa Data School/None/WA_Fn-UseC_-Telco-Customer-Churn.csv")



In [7]:
# Preprocessing
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'].fillna(data['TotalCharges'].mean(), inplace=True)

for column in data.select_dtypes(include=['object']):
    if column != 'customerID':
        data[column] = LabelEncoder().fit_transform(data[column])



In [8]:
# Splitting dataset
X = data.drop(['customerID', 'Churn'], axis=1)
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



In [9]:
# Scaling features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [10]:
# Training and evaluating models
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg_preds = logreg.predict(X_test)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)



In [11]:
# Metrics
logreg_accuracy = accuracy_score(y_test, logreg_preds)
logreg_precision = precision_score(y_test, logreg_preds)
logreg_recall = recall_score(y_test, logreg_preds)
logreg_f1 = f1_score(y_test, logreg_preds)

rf_accuracy = accuracy_score(y_test, rf_preds)
rf_precision = precision_score(y_test, rf_preds)
rf_recall = recall_score(y_test, rf_preds)
rf_f1 = f1_score(y_test, rf_preds)

print(f"Logistic Regression - Accuracy: {logreg_accuracy}, Precision: {logreg_precision}, Recall: {logreg_recall}, F1: {logreg_f1}")
print(f"Random Forest - Accuracy: {rf_accuracy}, Precision: {rf_precision}, Recall: {rf_recall}, F1: {rf_f1}")



Logistic Regression - Accuracy: 0.8106956933270232, Precision: 0.68125, Recall: 0.5696864111498258, F1: 0.6204933586337761
Random Forest - Accuracy: 0.792238523426408, Precision: 0.6726342710997443, Recall: 0.45818815331010454, F1: 0.5450777202072539


In [12]:
# Hyperparameter tuning
param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_



In [13]:
# Feature selection
rfe = RFE(RandomForestClassifier(**best_params), n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

rf_optimized = RandomForestClassifier(**best_params)
rf_optimized.fit(X_train_rfe, y_train)
rf_optimized_preds = rf_optimized.predict(X_test_rfe)

rf_optimized_accuracy = accuracy_score(y_test, rf_optimized_preds)
rf_optimized_precision = precision_score(y_test, rf_optimized_preds)
rf_optimized_recall = recall_score(y_test, rf_optimized_preds)
rf_optimized_f1 = f1_score(y_test, rf_optimized_preds)

print(f"Optimized Random Forest - Accuracy: {rf_optimized_accuracy}, Precision: {rf_optimized_precision}, Recall: {rf_optimized_recall}, F1: {rf_optimized_f1}")




Optimized Random Forest - Accuracy: 0.7941315664931378, Precision: 0.6534216335540839, Recall: 0.5156794425087108, F1: 0.5764362220058422


In [14]:
#Identifying important features
important_features = pd.Series(rf_optimized.feature_importances_, index=X.columns[rfe.support_])
important_features = important_features.sort_values(ascending=False)

print("\nImportant Features:")
print(important_features)





Important Features:
MonthlyCharges      0.198672
TotalCharges        0.197832
tenure              0.176419
Contract            0.149017
OnlineSecurity      0.082434
TechSupport         0.050443
PaymentMethod       0.049571
InternetService     0.039852
OnlineBackup        0.029317
PaperlessBilling    0.026445
dtype: float64


In [16]:
#Conclusion
print("\nBased on the evaluation metrics, the Optimized Random Forest model is the best-performing model.")
print("The top features contributing to customer churn prediction are:")
print(important_features.head(5))



Based on the evaluation metrics, the Optimized Random Forest model is the best-performing model.
The top features contributing to customer churn prediction are:
MonthlyCharges    0.198672
TotalCharges      0.197832
tenure            0.176419
Contract          0.149017
OnlineSecurity    0.082434
dtype: float64
