In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv("/content/Telco Customer Churn.zip")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


#PRE-PROCESSING

In [3]:
df['TotalCharges'].isna().sum()

0

In [6]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(0, inplace=True)

In [9]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [8]:
df['Churn'] = df['Churn'].replace({'No': 0, 'Yes': 1})

In [10]:
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
               'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

features = categorical + numerical

#FEATURE ENGINEERING

In [11]:
X = df[features]
y = df['Churn']

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [17]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
scaler = StandardScaler()

# Fit and transform the numerical features for training set
X_train[numerical] = scaler.fit_transform(X_train[numerical])

# Transform the numerical features for test set
X_test[numerical] = scaler.transform(X_test[numerical])

X_train = pd.DataFrame(X_train, columns=features)
X_test = pd.DataFrame(X_test, columns=features)


In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical),
        ('cat', OneHotEncoder(sparse_output=False), categorical)])

In [19]:
X_train_transformed = preprocessor.fit_transform(X_train)

# Transform the test data
X_test_transformed = preprocessor.transform(X_test)

# Get feature names after transformation
num_features = preprocessor.named_transformers_['num'].get_feature_names_out(numerical)
cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical)

# Combine numerical and categorical feature names
feature_names = list(num_features) + list(cat_features)

# Convert the transformed data back to DataFrame
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=feature_names)

In [22]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Initialize models with random_state=1
rf_model = RandomForestClassifier(random_state=1)
et_model = ExtraTreesClassifier(random_state=1)
xgb_model = XGBClassifier(random_state=1)
lgbm_model = LGBMClassifier(random_state=1)

# Train the models
rf_model.fit(X_train_transformed_df, y_train)
et_model.fit(X_train_transformed_df, y_train)
xgb_model.fit(X_train_transformed_df, y_train)
lgbm_model.fit(X_train_transformed_df, y_train)

# Make predictions on the test set
rf_pred = rf_model.predict(X_test_transformed_df)
et_pred = et_model.predict(X_test_transformed_df)
xgb_pred = xgb_model.predict(X_test_transformed_df)
lgbm_pred = lgbm_model.predict(X_test_transformed_df)

# Evaluate the models
rf_accuracy = accuracy_score(y_test, rf_pred)
et_accuracy = accuracy_score(y_test, et_pred)
xgb_accuracy = accuracy_score(y_test, xgb_pred)
lgbm_accuracy = accuracy_score(y_test, lgbm_pred)

# Print the accuracies
print("Random Forest Accuracy:", rf_accuracy)
print("Extra Trees Accuracy:", et_accuracy)
print("XGBoost Accuracy:", xgb_accuracy)
print("LightGBM Accuracy:", lgbm_accuracy)



[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001935 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
Random Forest Accuracy: 0.7913413768630234
Extra Trees Accuracy: 0.7672107877927609
XGBoost Accuracy: 0.7934705464868701
LightGBM Accuracy: 0.8034066713981547


In [23]:
rf_pred = rf_model.predict(X_test_transformed_df)

# Calculate accuracy
rf_accuracy = accuracy_score(y_test, rf_pred)

# Print the accuracy
print("Random Forest Accuracy:", rf_accuracy)

Random Forest Accuracy: 0.7913413768630234


In [24]:
from sklearn.metrics import accuracy_score

# Make predictions on the test set using the XGBoost model
xgb_pred = xgb_model.predict(X_test_transformed_df)

# Calculate accuracy
xgb_accuracy = accuracy_score(y_test, xgb_pred)

# Print the accuracy
print("XGBoost Accuracy:", xgb_accuracy)
In

XGBoost Accuracy: 0.7934705464868701


['',
 'import pandas as pd\nimport numpy as np',
 'df = pd.read_csv("/content/Telco Customer Churn.zip")\ndf.head()',
 "df['TotalCharges'].isna().sum()",
 "df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')",
 'df.head()',
 "df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')\ndf['TotalCharges'].fillna(0, inplace=True)",
 'df.head()',
 "df['Churn'] = df['Churn'].replace({'No': 0, 'Yes': 1})",
 'df.head()',
 "categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',\n               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',\n               'Contract', 'PaperlessBilling', 'PaymentMethod']\nnumerical = ['tenure', 'MonthlyCharges', 'TotalCharges']\n\nfeatures = categorical + numerical",
 "X = df[features]\ny = df['Churn']",
 'from sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(

In [25]:
from sklearn.metrics import accuracy_score

# Make predictions on the test set using the LightGBM model
lgbm_pred = lgbm_model.predict(X_test_transformed_df)

# Calculate accuracy
lgbm_accuracy = accuracy_score(y_test, lgbm_pred)

# Print the accuracy
print("LightGBM Accuracy:", lgbm_accuracy)


LightGBM Accuracy: 0.8034066713981547


In [26]:
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV

# Define the hyperparameter grid
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

# Initialize the Extra Trees Classifier
et_model = ExtraTreesClassifier(random_state=1)

# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=et_model,
    param_distributions=hyperparameter_grid,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=1
)

# Fit the RandomizedSearchCV
random_search.fit(X_train_transformed_df, y_train)

# Get the best hyperparameters
best_hyperparameters = random_search.best_params_
best_score = random_search.best_score_

# Print the best hyperparameters and the best score
print("Best Hyperparameters:", best_hyperparameters)
print("Best Accuracy Score from CV:", best_score)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}
Best Accuracy Score from CV: 0.7925092316639375


In [27]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

# Retrieve the best hyperparameters from RandomizedSearchCV
best_hyperparameters = random_search.best_params_
print("Best Hyperparameters:", best_hyperparameters)

# Initialize a new ExtraTreesClassifier with the best hyperparameters and random_state=1
optimal_et_model = ExtraTreesClassifier(
    n_estimators=best_hyperparameters['n_estimators'],
    min_samples_leaf=best_hyperparameters['min_samples_leaf'],
    min_samples_split=best_hyperparameters['min_samples_split'],
    max_features=best_hyperparameters['max_features'],
    random_state=1
)

# Train the model with the optimal hyperparameters
optimal_et_model.fit(X_train_transformed_df, y_train)

# Make predictions on the test set with the new model
optimal_et_pred = optimal_et_model.predict(X_test_transformed_df)

# Calculate the accuracy of the new model
optimal_et_accuracy = accuracy_score(y_test, optimal_et_pred)
print("Optimal Extra Trees Accuracy:", optimal_et_accuracy)

# Initial Extra Trees Classifier for comparison (assuming it was trained before)
initial_et_model = ExtraTreesClassifier(random_state=1)
initial_et_model.fit(X_train_transformed_df, y_train)
initial_et_pred = initial_et_model.predict(X_test_transformed_df)
initial_et_accuracy = accuracy_score(y_test, initial_et_pred)
print("Initial Extra Trees Accuracy:", initial_et_accuracy)

# Compare the accuracies
if optimal_et_accuracy > initial_et_accuracy:
    print("The new optimal model has higher accuracy.")
elif optimal_et_accuracy < initial_et_accuracy:
    print("The new optimal model has lower accuracy.")
else:
    print("The new optimal model has the same accuracy as the initial model.")


Best Hyperparameters: {'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}
Optimal Extra Trees Accuracy: 0.8041163946061036
Initial Extra Trees Accuracy: 0.7672107877927609
The new optimal model has higher accuracy.
