In [18]:
#Load and explore the dataset: Analyze the distribution of features, check for missing values, and visualize relationships between features and the target variable (churn).
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Load dataset
data = pd.read_csv('Telco-Customer-Churn.csv')

In [19]:
#Preprocess the data: Handle missing values, convert categorical variables to numeric, and normalize/standardize the features if necessary
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'].fillna(data['TotalCharges'].mean(), inplace=True)

categorical_columns = data.select_dtypes(include=['object']).columns
categorical_columns = categorical_columns[categorical_columns != 'customerID']

label_encoder = LabelEncoder()

for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])


In [20]:
# Split the dataset: Divide the dataset into training and testing sets.
# Split the dataset into features (X) and target (y)
X = data.drop(['customerID', 'Churn'], axis=1)
y = data['Churn']

# Split the data into training and testing sets
test_size = 0.3
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)


In [9]:
# Scaling features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [10]:
#Train classification models: Train various classification models (e.g., logistic regression, KNN, SVM, decision tree, random forest, etc.) on the training dataset.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg_preds = logreg.predict(X_test)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)


In [11]:
# Metrics
logreg_accuracy = accuracy_score(y_test, logreg_preds)
logreg_precision = precision_score(y_test, logreg_preds)
logreg_recall = recall_score(y_test, logreg_preds)
logreg_f1 = f1_score(y_test, logreg_preds)

rf_accuracy = accuracy_score(y_test, rf_preds)
rf_precision = precision_score(y_test, rf_preds)
rf_recall = recall_score(y_test, rf_preds)
rf_f1 = f1_score(y_test, rf_preds)

print(f"Logistic Regression - Accuracy: {logreg_accuracy}, Precision: {logreg_precision}, Recall: {logreg_recall}, F1: {logreg_f1}")
print(f"Random Forest - Accuracy: {rf_accuracy}, Precision: {rf_precision}, Recall: {rf_recall}, F1: {rf_f1}")


Logistic Regression - Accuracy: 0.8106956933270232, Precision: 0.68125, Recall: 0.5696864111498258, F1: 0.6204933586337761
Random Forest - Accuracy: 0.7998106956933271, Precision: 0.6882793017456359, Recall: 0.4808362369337979, F1: 0.5661538461538461


In [12]:
# Hyperparameter tuning
param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

In [23]:
# Feature selection using Recursive Feature Elimination (RFE)
rfe = RFE(estimator=RandomForestClassifier(**best_params), n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

# Train an optimized Random Forest classifier on the selected features
rf_optimized = RandomForestClassifier(**best_params)
rf_optimized.fit(X_train_rfe, y_train)
rf_optimized_preds = rf_optimized.predict(X_test_rfe)

# Evaluate the performance of the optimized Random Forest classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

rf_optimized_accuracy = accuracy_score(y_test, rf_optimized_preds)
rf_optimized_precision = precision_score(y_test, rf_optimized_preds)
rf_optimized_recall = recall_score(y_test, rf_optimized_preds)
rf_optimized_f1 = f1_score(y_test, rf_optimized_preds)

print(f"Optimized Random Forest Performance:\nAccuracy: {rf_optimized_accuracy}\nPrecision: {rf_optimized_precision}\nRecall: {rf_optimized_recall}\nF1: {rf_optimized_f1}")


Optimized Random Forest Performance:
Accuracy: 0.7988641741599621
Precision: 0.6651884700665188
Recall: 0.5226480836236934
F1: 0.5853658536585366


In [14]:
#Identifying important features
important_features = pd.Series(rf_optimized.feature_importances_, index=X.columns[rfe.support_])
important_features = important_features.sort_values(ascending=False)

print("\nImportant Features:")
print(important_features)


Important Features:
TotalCharges        0.199083
MonthlyCharges      0.195954
tenure              0.172331
Contract            0.156924
OnlineSecurity      0.079895
TechSupport         0.052843
PaymentMethod       0.047449
InternetService     0.040363
OnlineBackup        0.029561
PaperlessBilling    0.025597
dtype: float64


In [24]:
#Conclusion
print("\nAccording to the evaluation metrics, the Optimized Random Forest model emerges as the most proficient performer.")
print("The primary factors influencing customer churn prediction, as per the analysis, include:")
print(important_features.head(5))


According to the evaluation metrics, the Optimized Random Forest model emerges as the most proficient performer.
The primary factors influencing customer churn prediction, as per the analysis, include:
TotalCharges      0.199083
MonthlyCharges    0.195954
tenure            0.172331
Contract          0.156924
OnlineSecurity    0.079895
dtype: float64
