<a href="https://colab.research.google.com/github/PETEROA/Customer_Churn/blob/main/Churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd


In [None]:
# Load the churn dataset from a CSV file located in the working directory
df = pd.read_csv(filepath_or_buffer='Telco-Customer-Churn.csv')

In [None]:
# Preview the first rows. `head` has to be *called*; a bare `df.head` only
# shows the bound-method repr instead of the data.
df.head()


In [None]:
# Column targeted by the numeric conversion in the following cell
column_name = "TotalCharges"

In [None]:
# Convert the selected column to numbers; values that cannot be parsed
# become NaN because of errors='coerce'
raw_values = df[column_name]
numeric_values = pd.to_numeric(raw_values, errors='coerce')
df[column_name] = numeric_values


In [None]:
# Display the frame as the cell output to inspect it after the numeric conversion
df

In [None]:
# Remove the 'customerID' and 'PhoneService' columns from the frame.
# NOTE: re-running this cell raises KeyError because the columns are already gone.
columns_to_drop = ['customerID', 'PhoneService']
df = df.drop(labels=columns_to_drop, axis=1)

In [None]:
# Display the frame as the cell output to confirm the two columns were removed
df

In [None]:
# Column whose missing values are filled
column_name = 'TotalCharges'

# Replace missing values (such as the NaNs produced by errors='coerce') with 0.
# The result is assigned back rather than calling fillna(..., inplace=True) on
# df[column_name]: that form operates on an intermediate Series, emits a
# chained-assignment warning on pandas 2.x and leaves df unchanged under
# Copy-on-Write.
df[column_name] = df[column_name].fillna(0)

In [None]:
# The 'Churn' column (labelled "Yes" / "No") is the one converted to binary values next
column_name = "Churn"

In [None]:
# Lookup table translating the text labels into integers: "No" -> 0, "Yes" -> 1
mapping = dict(No=0, Yes=1)


In [None]:
# Translate every label through `mapping`; labels without an entry become NaN.
# NOTE: running this cell a second time maps the already-converted 0/1 values
# to NaN, so execute it only once per kernel session.
label_series = df[column_name]
df[column_name] = label_series.map(mapping)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Build X by removing the categorical columns listed below; every other column
# of df (e.g. tenure, MonthlyCharges, TotalCharges, Churn) is kept.
categorical_to_exclude = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod',
]
X = df.drop(columns=categorical_to_exclude)

In [None]:
# Three numeric target series taken straight from df.
# NOTE(review): the X built in the previous cell still contains these same
# columns, so each target is also present as a feature -- confirm this is intended.
y1, y2, y3 = (df[target] for target in ('tenure', 'MonthlyCharges', 'TotalCharges'))

# One call splits the features and all three targets consistently
# (80% train / 20% test, random_state=1 for reproducibility)
split_parts = train_test_split(X, y1, y2, y3, test_size=0.2, random_state=1)
(
    X_train, X_test,
    y1_train, y1_test,
    y2_train, y2_test,
    y3_train, y3_test,
) = split_parts




In [None]:
# dropna() returns a new frame, so the result must be assigned back; a bare
# `df.dropna()` only displays the filtered copy and leaves df untouched.
# The index is reset so frames rebuilt from plain arrays in later cells
# (which start at 0 with a fresh RangeIndex) still line up row for row.
df = df.dropna().reset_index(drop=True)

# Keep showing the cleaned frame as the cell output
df

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Numeric features that are going to be standardised
numerical_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Scaler instance; it is fitted in the next cell
scaler = StandardScaler()

# Sub-frame holding only the numeric features
numerical_data = df.loc[:, numerical_columns]

In [None]:
# Fit the scaler on the numeric sub-frame and standardise it in one step.
# NOTE(review): the scaler is fitted on the full dataset before the Churn
# train/test split, so test-set statistics leak into the features -- consider
# fitting on the training split only.
scaled_data = scaler.fit_transform(numerical_data)

# Wrap the array in a DataFrame that reuses the source index. Without
# `index=` the frame gets a fresh 0..n-1 RangeIndex and the label-aligned
# assignment below would write NaNs whenever df's index is not 0..n-1
# (for example after rows were filtered out).
scaled_df = pd.DataFrame(scaled_data, columns=numerical_columns, index=numerical_data.index)

# Overwrite the original numeric columns with their scaled values
df[numerical_columns] = scaled_df

In [None]:
pip install -U scikit-learn




In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


In [None]:
# Categorical features that will be one-hot encoded
categorical_columns = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]


In [None]:
# Sub-frame holding only the categorical columns to encode
categorical_data = df[categorical_columns]

# Use the encoder's default (sparse) output and densify it explicitly.
# The `sparse=False` keyword was removed in scikit-learn 1.4 (renamed to
# `sparse_output` in 1.2), so passing it fails after the upgrade performed
# earlier in this notebook; `.toarray()` works on old and new versions alike.
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(categorical_data).toarray()

# Names of the generated indicator columns, e.g. '<column>_<category>'
feature_names = encoder.get_feature_names_out(input_features=categorical_columns)

# Wrap the dense array in a DataFrame that shares df's index so the
# concatenation below aligns row for row even if df's index is not 0..n-1
encoded_df = pd.DataFrame(encoded_data, columns=feature_names, index=df.index)

# Append the indicator columns and remove the original categorical columns;
# `data` is built without in-place mutation so the cell can be re-run safely
data = pd.concat([df, encoded_df], axis=1).drop(columns=categorical_columns)






In [None]:
# Target column for the churn classification task
target_column = 'Churn'

# Separate the target variable from the features
X = data.drop(target_column, axis=1)
y = data[target_column]

# Split the data into training and testing sets (80% train, 20% test) with a random state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Numerical feature columns
numerical_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']

# The original categorical columns were removed from `data` when it was
# one-hot encoded, so the categorical features are every remaining column of X.
# (Indexing X_train with the `categorical_data` DataFrame, as this cell used to
# do, raises because a DataFrame key must be a boolean mask.)
encoded_columns = [col for col in X.columns if col not in numerical_columns]

# Combine the numerical and encoded categorical features for train and test sets
combined_columns = numerical_columns + encoded_columns
X_train_combined = X_train[combined_columns]
X_test_combined = X_test[combined_columns]


In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Flag each column of df that holds at least one missing value, then report the names
has_missing = df.isna().any()
columns_with_missing_data = has_missing.index[has_missing].tolist()

print("Columns with missing data:", columns_with_missing_data)

Columns with missing data: []


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb


# Both boosters are trained on the one-hot encoded Churn split
# (X_train / y_train) created by the train/test split cell.
# This cell previously also rebuilt `X` from df's numeric columns (which
# include the Churn target) and created an XGBClassifier with
# enable_categorical=True; neither object was ever used for fitting, and the
# rebinding of `X` / `categorical_columns` only added hidden state, so that
# dead code has been removed. The fitted models and scores are unchanged.

# Train XGBoost model
xgb_model = XGBClassifier(random_state=1)
xgb_model.fit(X_train, y_train)

# Train LightGBM model
lgbm_model = LGBMClassifier(random_state=1)
lgbm_model.fit(X_train, y_train)

# Make predictions using both models on the test set
xgb_predictions = xgb_model.predict(X_test)
lgbm_predictions = lgbm_model.predict(X_test)

# Evaluate the models using accuracy
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
lgbm_accuracy = accuracy_score(y_test, lgbm_predictions)

print("XGBoost Model Accuracy:", xgb_accuracy)
print("LightGBM Model Accuracy:", lgbm_accuracy)


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 666
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
XGBoost Model Accuracy: 0.7934705464868701
LightGBM Model Accuracy: 0.8133427963094393


In [None]:
# Fit a Random Forest and an Extra Trees classifier on the training split;
# fit() returns the estimator itself, so construction and training are chained
rf_classifier = RandomForestClassifier(random_state=1).fit(X_train, y_train)
et_classifier = ExtraTreesClassifier(random_state=1).fit(X_train, y_train)

# Predict the test set with each fitted model (Random Forest first, then Extra Trees)
rf_predictions, et_predictions = (
    classifier.predict(X_test) for classifier in (rf_classifier, et_classifier)
)

# Score both sets of predictions against the true test labels
rf_accuracy, et_accuracy = (
    accuracy_score(y_test, predicted) for predicted in (rf_predictions, et_predictions)
)

print("Random Forest Classifier Accuracy:", rf_accuracy)
print("Extra Trees Classifier Accuracy:", et_accuracy)


Random Forest Classifier Accuracy: 0.7970191625266146
Extra Trees Classifier Accuracy: 0.7714691270404542


In [None]:
import pickle

In [None]:
# Persist the *fitted* estimators from the training cells.
# This cell used to construct brand-new RandomForest / ExtraTrees / XGBoost /
# LightGBM objects, which pickled unfitted models and replaced the trained
# `xgb_model` with an untrained one. The names are now bound to the
# already-trained estimators instead.
rf_model = rf_classifier
ext_model = et_classifier
lgb_model = lgbm_model
# `xgb_model` already refers to the fitted XGBoost classifier and is saved as-is.

# Save each model to a separate file; every file name carries the .pkl
# extension (the LightGBM file previously lacked it).
# NOTE: only unpickle these files from a trusted source -- pickle.load can
# execute arbitrary code.
models_to_save = [rf_model, ext_model, xgb_model, lgbm_model]
model_filenames = ["rf_model.pkl", "ext_model.pkl", "xgb_model.pkl", "lgbm_model.pkl"]

for model, filename in zip(models_to_save, model_filenames):
    with open(filename, "wb") as file:
        pickle.dump(model, file)
