<a href="https://colab.research.google.com/github/Srinivas44444/shadx/blob/main/Customer_Churn_Prediciton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# ML Prediction Libraries
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.combine import SMOTETomek

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Ask Colab for a file upload and keep whatever files.upload() returns in
# `uploaded`. Presumably this is how the Telco CSV read in the next cell gets
# into the working directory -- confirm when running outside Colab.
from google.colab import files
uploaded = files.upload()

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Load the Telco customer-churn CSV from the working directory.
file_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
telco_data = pd.read_csv(file_path)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
telco_data.info()

In [None]:
# Survey the text columns: how many distinct values each holds and what they are
object_columns = telco_data.select_dtypes(include=['object'])

for col, series in object_columns.items():
    print(f"{col} - {series.nunique()} Unique Values")
    print(series.unique())
    print()

# Yes/No text columns become real booleans
yes_no_to_bool = {'Yes': True, 'No': False}
cols_to_convert = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']

for col in cols_to_convert:
    mapped = telco_data[col].map(yes_no_to_bool)
    telco_data[col] = mapped.astype(bool)

# SeniorCitizen is stored as 1/0; give it the same boolean treatment
one_zero_to_bool = {1: True, 0: False}
telco_data['SeniorCitizen'] = telco_data['SeniorCitizen'].map(one_zero_to_bool).astype(bool)

# Move the target column (Churn) to the front of the dataframe for readability
churn_column = telco_data.pop('Churn')
telco_data.insert(0, 'Churn', churn_column)

# Report which text columns are still left to deal with
object_columns = telco_data.select_dtypes(include=['object'])
print("\nRemaining object columns after boolean conversions:")
print(object_columns.columns)

In [None]:
# TotalCharges arrives as text; entries that are a single space are replaced
# with '0' so the whole column can be cast to float
total_charges_text = telco_data['TotalCharges'].replace(' ', '0')
telco_data['TotalCharges'] = total_charges_text.astype(float)

print(telco_data['TotalCharges'].describe())

In [None]:
# Every remaining text column except the customer identifier gets one-hot encoded
categorical_cols = telco_data.select_dtypes(include=["object"]).columns.drop("customerID")
object_columns = telco_data[categorical_cols]
encoded_object_columns = pd.get_dummies(object_columns)

# Replace the raw text columns with their dummy columns; the customer ID is
# dropped as well since it is not wanted in the encoded dataframe
non_text_columns = telco_data.drop(columns=[*categorical_cols, "customerID"])
encoded_telco_data = pd.concat([non_text_columns, encoded_object_columns], axis=1)

In [None]:
# Pairwise correlations between all encoded columns
corr_mat = encoded_telco_data.corr()

# Draw the matrix as a square heatmap on a dedicated 10x10 figure
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data=corr_mat, cmap='coolwarm', square=True, ax=ax)
plt.show()
plt.clf()

In [None]:
# Isolate features
features_df = encoded_telco_data.drop(columns='Churn')

# Correlation of each feature with the target. The target's own entry is
# removed by label instead of by position ([1:]), so this stays correct even
# if 'Churn' is not the first column of the correlation matrix, and the
# feature names are taken from the same Series as the values they label.
churn_corr = corr_mat['Churn'].drop(labels='Churn')
feature_target_corr = pd.DataFrame({'feature': churn_corr.index, 'correlation': churn_corr})
feature_target_corr = feature_target_corr.sort_values(by='correlation')

# Bars are sorted by correlation, so the coolwarm palette is spread over them
# in rank order (most negative at the blue end, most positive at the red end)
f, ax = plt.subplots(figsize=(12, 6))
sns.set_style("whitegrid")
sns.barplot(data=feature_target_corr,
            x='feature',
            y='correlation',
            palette='coolwarm')

plt.xticks(rotation=90)
plt.xlabel('Feature')
plt.ylabel('Correlation Strength')
plt.title('Feature Correlation with Churn')
plt.show()
plt.clf()

In [None]:

# Two strongly related column pairs, printed for reference. The values are
# interpolated with !s so the output matches str(value) exactly.
streaming_corr = corr_mat["StreamingTV_No"]["StreamingMovies_No"]
billing_corr = corr_mat["PaperlessBilling"]["MonthlyCharges"]
print(f"Correlation between StreamingTV_No and StreamingMovies_No: {streaming_corr!s}")
print(f"Correlation between PaperlessBilling and MonthlyCharges: {billing_corr!s}")

# Remove every dummy column whose name contains "No internet service"
cols_to_drop = [col for col in encoded_telco_data if "No internet service" in col]
encoded_telco_data = encoded_telco_data.drop(columns=cols_to_drop)

In [None]:
# info() prints the summary itself and returns None; print() around it only
# added a trailing "None" line.
telco_data.info()

In [None]:
# Non-numeric columns only, without the customer identifier. drop() returns a
# new frame, so nothing is modified in place on a derived selection (the
# earlier full copy of telco_data was overwritten immediately and is gone).
telco_countplots = telco_data.select_dtypes(exclude=[int, float]).drop(columns=["customerID"])

# 4 x 5 grid of axes, flattened so it can be indexed with a single counter
f, ax = plt.subplots(4, 5, figsize=(20, 16))
ax = ax.flatten()

for i, col in enumerate(telco_countplots.columns):
    sns.countplot(data=telco_countplots, x=col, ax=ax[i])
    ax[i].set_title(f'Count of {col} for All Customers')
    ax[i].set_xlabel(col)
    ax[i].set_ylabel('Count')

    # Set and rotate x-axis labels
    ax[i].tick_params(axis='x', rotation=30)

# Hide the grid cells that did not receive a plot. A separate loop variable
# keeps `ax` bound to the axes array instead of rebinding it to a single Axes.
for unused_ax in ax[len(telco_countplots.columns):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()
plt.clf()

In [None]:
# Non-numeric columns only, without the customer identifier
telco_categorical = telco_data.select_dtypes(exclude=[int, float]).drop(columns=["customerID"])

# Split the customers by the boolean Churn flag
churn_mask = telco_categorical["Churn"]
churn_categorical = telco_categorical[churn_mask]
no_churn_categorical = telco_categorical[~churn_mask]

f, ax = plt.subplots(4, 4, figsize=(16, 16))
ax = ax.flatten()

for i, col in enumerate(telco_categorical.drop(columns="Churn").columns):
    # Percent distribution of each category of `col` among non-churned customers.
    # rename_axis/reset_index(name=...) name the two output columns explicitly
    # instead of relying on value_counts() calling its result "count", which
    # is only the case from pandas 2.0 onwards.
    no_churn_pct = 100 * no_churn_categorical[col].value_counts() / no_churn_categorical.shape[0]
    no_churn_pct = no_churn_pct.rename_axis(col).reset_index(name="Non-Churned")

    # Same for the customers that have churned
    churn_pct = 100 * churn_categorical[col].value_counts() / churn_categorical.shape[0]
    churn_pct = churn_pct.rename_axis(col).reset_index(name="Churned")

    # Side-by-side table: one row per category, one column per group
    telco_pct = pd.merge(no_churn_pct, churn_pct, on=col)

    # Plot
    telco_pct.plot.bar(x=col, ax=ax[i])
    ax[i].set_title(f'Percent Distribution of {col}')
    ax[i].set_ylabel('Percentage')

    # Set and rotate x-axis labels
    ax[i].tick_params(axis='x', rotation=30)

plt.tight_layout()
plt.show()
plt.clf()

In [None]:
# Print the summary statistics table produced by DataFrame.describe()
print(telco_data.describe())

In [None]:
# Numeric columns plus the Churn flag (appended as the last column) so the
# pairplot can colour points by churn status
telco_nums = telco_data.select_dtypes(include=[int, float]).assign(Churn=telco_data["Churn"])

scatter_style = {'alpha': 0.3}
sns.pairplot(data=telco_nums, hue="Churn", plot_kws=scatter_style)
plt.show()
plt.clf()


In [None]:
# Same pairplot, restricted to customers whose monthly charge is below 35
low_charge_mask = telco_nums["MonthlyCharges"].lt(35)
telco_less_than_35 = telco_nums.loc[low_charge_mask]

sns.pairplot(data=telco_less_than_35, hue="Churn")
plt.show()
plt.clf()

In [None]:
# describe() for the numeric columns, first for the customers who stayed and
# then for those who churned
summary_sections = (
    ("Summary Statistics of Non-Churned Customers", False),
    ("\nSummary Statistics of Churned Customers", True),
)
for heading, churned in summary_sections:
    print(heading)
    print(telco_nums[telco_nums["Churn"] == churned].describe())

In [None]:
# One horizontal boxplot per numeric column, split by churn status
f, ax = plt.subplots(1, 3, figsize=(12, 4))
ax = ax.flatten()

numeric_cols = telco_nums.drop(columns="Churn").columns
# Constant empty-string y value: every row lands in one unlabeled category,
# so only the hue separates the boxes
blank_category = np.array([""] * len(telco_nums))

for i, col in enumerate(numeric_cols):
    sns.boxplot(data=telco_nums, x=col, y=blank_category, hue="Churn", ax=ax[i])

plt.tight_layout()
plt.show()
plt.clf()

In [None]:
pip install optuna

In [None]:
import optuna

In [None]:
# Using a robust scaler to better handle outliers
# NOTE(review): the scaler is fitted on the full dataset before any
# cross-validation or train/test split, so held-out rows influence the scaling
# statistics.
scaler = RobustScaler()
numerical_features = encoded_telco_data.select_dtypes(include=[int, float])
scaled_data = scaler.fit_transform(numerical_features)

# Convert the scaled data back to a DataFrame. The original index is carried
# over explicitly: join() below aligns on index labels, and a fresh RangeIndex
# would misalign rows (filling with NaN) whenever the source index is not 0..n-1.
scaled_df = pd.DataFrame(scaled_data,
                         columns=numerical_features.columns,
                         index=numerical_features.index)

# Adding it back to the encoded features
categorical_features = encoded_telco_data.select_dtypes(exclude=[int, float])
encoded_df = categorical_features.join(scaled_df)

# Split features and target column
X_data = encoded_df.drop(columns="Churn")
y_data = encoded_df["Churn"]

# Candidate model families the Optuna search chooses between
classifier_names = ["LogisticRegression",
                    "RandomForestClassifier",
                    "KNeighborsClassifier",
                    "GradientBoostingClassifier",
                    "SVC",
                    "AdaBoostClassifier"]

# Optuna Hyperparameter Tuning

# Define an objective function to be maximized
def objective(trial):
    """Score one Optuna trial.

    The trial picks a classifier family from ``classifier_names`` together
    with that family's hyperparameters.

    Returns:
        Mean F1 over 5-fold cross-validation on ``X_data`` / ``y_data``.
    """
    # Suggest values for the hyperparameters using a trial object
    classifier_name = trial.suggest_categorical("classifier", classifier_names)

    if classifier_name == "LogisticRegression":
        lr_c = trial.suggest_float("lr_c", 1e-2, 10, log=True)
        classifier_obj = LogisticRegression(C=lr_c, max_iter=250)

    elif classifier_name == "RandomForestClassifier":
        rfc_max_depth = trial.suggest_int("rfc_max_depth", 2, 32, log=True)
        rfc_max_samples = trial.suggest_float("rfc_max_samples", 0.2, 1)
        classifier_obj = RandomForestClassifier(max_depth=rfc_max_depth, max_samples=rfc_max_samples, n_estimators=50)

    elif classifier_name == "KNeighborsClassifier":
        knc_n_neighbors = trial.suggest_int("knc_n_neighbors", 3, 10)
        classifier_obj = KNeighborsClassifier(n_neighbors=knc_n_neighbors)

    elif classifier_name == "GradientBoostingClassifier":
        gbc_learning_rate = trial.suggest_float("gbc_learning_rate", 1e-4, 1, log=True)
        classifier_obj = GradientBoostingClassifier(learning_rate=gbc_learning_rate)

    elif classifier_name == "SVC":
        svc_c = trial.suggest_float("svc_c", 1e-2, 10, log=True)
        classifier_obj = SVC(C=svc_c, gamma="auto")

    else:
        abc_learning_rate = trial.suggest_float("abc_learning_rate", 1e-4, 1, log=True)
        classifier_obj = AdaBoostClassifier(learning_rate=abc_learning_rate)

    # Named mean_f1 so the imported sklearn f1_score is not shadowed locally
    mean_f1 = cross_val_score(classifier_obj, X_data, y_data, cv=5, scoring="f1", n_jobs=-1).mean()

    return mean_f1


# Silence per-trial logging. This call used to sit after the `return` inside
# objective(), where it could never execute.
optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)
best_params = study.best_params
best_f1 = study.best_value

print("Best Hyperparameters:", best_params)
print("Best F1 Score:", best_f1)

In [None]:
def metric_report(y_true, y_pred):
    """Print the confusion matrix and accuracy, precision, recall and F1.

    Each score is rounded to three decimals. Nothing is returned.
    """
    confusion_mat = confusion_matrix(y_true, y_pred)

    # Label -> metric function, in the order the scores are reported
    scorers = {
        "Accuracy": accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
        "F1": f1_score,
    }
    scores = {label: round(scorer(y_true, y_pred), 3) for label, scorer in scorers.items()}

    print("Confusion Matrix")
    print(confusion_mat)
    print("\n" + "\n".join(f"{label}: {value}" for label, value in scores.items()))

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

# best_params only holds "lr_c" when the winning trial happened to be a
# LogisticRegression, so indexing it directly raises KeyError otherwise.
# Take C from the best-scoring LogisticRegression trial of the study instead.
lr_trials = [
    trial for trial in study.trials
    if trial.value is not None and trial.params.get("classifier") == "LogisticRegression"
]
if lr_trials:
    lr_c = max(lr_trials, key=lambda trial: trial.value).params["lr_c"]
else:
    # No LogisticRegression trial finished: fall back to scikit-learn's default C
    lr_c = 1.0

lr = LogisticRegression(C=lr_c, max_iter=250)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

metric_report(y_test, y_pred)

In [None]:
# Resampling
# SMOTETomek is applied to the complete feature/target set, producing
# X_res / y_res.
# NOTE(review): every row of X_data takes part in the resampling, so X_res is
# not independent of X_data -- keep that in mind wherever a model trained on
# X_res is scored on X_data.
resample = SMOTETomek(random_state=42)
X_res, y_res = resample.fit_resample(X_data, y_data)

# Class counts after resampling, with the count written on the bars of the
# first bar container
ax = sns.countplot(x=y_res)
ax.bar_label(ax.containers[0])
plt.show()
plt.clf()

In [None]:
# imblearn's Pipeline (unlike sklearn's) accepts samplers and applies them
# only while fitting, never when predicting/scoring
from imblearn.pipeline import Pipeline as ImbPipeline

classifier_names = ["LogisticRegression",
                    "RandomForestClassifier",
                    "KNeighborsClassifier",
                    "GradientBoostingClassifier",
                    "SVC",
                    "AdaBoostClassifier"]

# Optuna Hyperparameter Tuning

# Define an objective function to be maximized
def objective(trial):
    """Score one Optuna trial with SMOTETomek resampling.

    The trial picks a classifier family from ``classifier_names`` and its
    hyperparameters. The model is evaluated with 5-fold cross-validation in
    which SMOTETomek is fitted on the training part of each fold only.

    Previously the model was fitted on ``X_res`` (built from all of
    ``X_data``) and then scored on ``X_data`` itself, i.e. on rows it had
    already seen, which rewards memorisation rather than generalisation.

    Returns:
        Mean cross-validated F1 on the untouched validation folds.
    """
    # Suggest values for the hyperparameters using a trial object
    classifier_name = trial.suggest_categorical("classifier", classifier_names)

    if classifier_name == "LogisticRegression":
        lr_c = trial.suggest_float("lr_c", 1e-2, 10, log=True)
        model = LogisticRegression(C=lr_c, max_iter=500)

    elif classifier_name == "RandomForestClassifier":
        rfc_max_depth = trial.suggest_int("rfc_max_depth", 2, 32, log=True)
        rfc_max_samples = trial.suggest_float("rfc_max_samples", 0.2, 1)
        model = RandomForestClassifier(max_depth=rfc_max_depth, max_samples=rfc_max_samples, n_estimators=50)

    elif classifier_name == "KNeighborsClassifier":
        knc_n_neighbors = trial.suggest_int("knc_n_neighbors", 3, 10)
        model = KNeighborsClassifier(n_neighbors=knc_n_neighbors)

    elif classifier_name == "GradientBoostingClassifier":
        gbc_learning_rate = trial.suggest_float("gbc_learning_rate", 1e-4, 1, log=True)
        model = GradientBoostingClassifier(learning_rate=gbc_learning_rate)

    elif classifier_name == "SVC":
        svc_c = trial.suggest_float("svc_c", 1e-2, 10, log=True)
        model = SVC(C=svc_c, gamma="auto")

    else:
        abc_learning_rate = trial.suggest_float("abc_learning_rate", 1e-4, 1, log=True)
        model = AdaBoostClassifier(learning_rate=abc_learning_rate)

    # Resample inside the pipeline so each validation fold stays untouched
    resampled_model = ImbPipeline([
        ("resample", SMOTETomek(random_state=42)),
        ("model", model),
    ])
    f1 = cross_val_score(resampled_model, X_data, y_data, cv=5, scoring="f1", n_jobs=-1).mean()

    return f1

optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)
best_params = study.best_params
best_f1 = study.best_value

print("Best Hyperparameters:", best_params)
print("Best F1 Score:", best_f1)

In [None]:
# best_params only holds the "rfc_*" keys when the winning trial was a
# RandomForestClassifier; indexing it directly raises KeyError otherwise.
# Use the best-scoring RandomForestClassifier trial of the study instead.
rfc_trials = [
    trial for trial in study.trials
    if trial.value is not None and trial.params.get("classifier") == "RandomForestClassifier"
]
if rfc_trials:
    rfc_params = max(rfc_trials, key=lambda trial: trial.value).params
    # n_estimators=50 matches the forest size the hyperparameters were tuned with
    rfc = RandomForestClassifier(max_depth=rfc_params["rfc_max_depth"],
                                 max_samples=rfc_params["rfc_max_samples"],
                                 n_estimators=50)
else:
    # No RandomForestClassifier trial finished: keep the remaining defaults
    rfc = RandomForestClassifier(n_estimators=50)

# Training on resampled dataset
rfc.fit(X_res, y_res)
# Checking performance on original dataset
# NOTE(review): X_res was built from X_data, so these metrics are measured on
# rows the model has effectively seen and will look optimistic.
y_pred = rfc.predict(X_data)

metric_report(y_data, y_pred)

In [None]:
# Confusion matrix of the latest predictions against the full target column
conf_mat = confusion_matrix(y_data, y_pred)

# Annotated heatmap: rows are the actual classes, columns the predicted ones
class_labels = ['Non-Churned', 'Churned']
fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_mat,
            annot=True,
            fmt='d',
            cmap='Blues',
            cbar=False,
            xticklabels=class_labels,
            yticklabels=class_labels,
            ax=heat_ax)
heat_ax.set_xlabel('Predicted')
heat_ax.set_ylabel('Actual')
heat_ax.set_title('Confusion Matrix')
plt.show()