<a href="https://colab.research.google.com/github/WMinerva292/WMinerva292/blob/main/11Oct_GradientBoosting_Dev.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<hr>

# **Gradient Boosting**

<hr>

<hr>

# **Step 1 - Load the libraries**

**Configuration Libraries**

In [None]:
import warnings
# Silence every warning category for the rest of the session so cell output
# stays compact.
# NOTE(review): this also hides deprecation and data-handling warnings from
# pandas / scikit-learn that may point at real problems.
warnings.filterwarnings('ignore')

**Basic Libraries**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
# Apply matplotlib's dark theme to every figure drawn after this point.
plt.style.use('dark_background')

**Machine Learning Libraries**

In [None]:
from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

<hr>

# **Step 2 - Data Loading and Inspection Phase**

In [None]:
# Location of the raw churn dataset, resolved relative to the notebook's
# working directory.
CSV_PATH = "customer_churn.csv"

# Raw customer records, one row per customer.
df = pd.read_csv(CSV_PATH)

**Data Inspection**

In [None]:
# Preview the first rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


<hr>

**Data Information**

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# Frequency of each raw TotalCharges value; surfaces non-numeric placeholders
# such as blank strings that keep the column from being numeric.
df["TotalCharges"].value_counts()

Unnamed: 0_level_0,count
TotalCharges,Unnamed: 1_level_1
,11
20.2,11
19.75,9
20.05,8
19.9,8
...,...
6849.4,1
692.35,1
130.15,1
3211.9,1


In [None]:
# Blank TotalCharges entries arrive from the CSV as whitespace-only strings,
# which keeps the column at object dtype. Impute them with 0 so the numeric
# conversion in the next cell succeeds. The anchored regex matches empty and
# multi-space blanks as well, not only a single " ".
df["TotalCharges"] = df["TotalCharges"].replace(r"^\s*$", 0, regex = True)

In [None]:
# Cast TotalCharges from strings to numbers. errors="raise" is the default
# and is spelled out so that any remaining non-numeric value fails here.
total_charges = pd.to_numeric(df["TotalCharges"], errors = "raise")
df["TotalCharges"] = total_charges

<hr>

**Data Information**

In [None]:
# Re-inspect dtypes and non-null counts after the TotalCharges conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


<hr>

# **Step 3 - Data Preprocessing**

**Null Values Inspection**

In [None]:
# Number of missing values in each column.
df.isnull().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis = "index", how = "any")

**Duplicate Analysis**

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

<hr>

**Encoding**

In [None]:
# LabelEncoder maps each distinct value of a column to an integer code.
encoder = LabelEncoder()

In [None]:
# Label-encode every object-dtype (string) column in place.
# A fresh LabelEncoder is fitted per column and kept in `encoders`, keyed by
# column name, so each integer code can be mapped back to its original
# category with encoders[col].inverse_transform(...). Re-fitting one shared
# encoder would keep only the last column's mapping.
encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        # `encoder` is rebound on each pass; after the loop it is the encoder
        # fitted on the last object column.
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
        encoders[column] = encoder

In [None]:
# Confirm that the former string columns now hold integer codes.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,5375,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,2,29.85,29.85,0
1,3962,1,0,0,0,34,1,0,0,2,...,2,0,0,0,1,0,3,56.95,1889.5,0
2,2564,1,0,0,0,2,1,0,0,2,...,0,0,0,0,0,1,3,53.85,108.15,1
3,5535,1,0,0,0,45,0,1,0,2,...,2,2,0,0,1,0,0,42.3,1840.75,0
4,6511,0,0,0,0,2,1,0,1,0,...,0,0,0,0,0,1,2,70.7,151.65,1


In [None]:
# Remove the customerID column before modelling (presumably a per-customer
# identifier rather than a predictive feature - confirm against the data).
df = df.drop(columns = ["customerID"])

**Feature Division**

In [None]:
# Feature Data
X = df.drop("Churn", axis = 1)

# Target data
Y = df["Churn"]

**Data Division**

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the partition
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size = 0.3,
    random_state = 43,
)

<hr>

# **Step 4 - Model Building**

<hr>

**Iteration 1**

In [None]:
# Iteration 1: baseline gradient-boosting classifier with library-default
# hyperparameters. random_state is pinned because the estimator permutes
# features randomly at every split; without a seed the same configuration can
# score differently from one run to the next.
model = GradientBoostingClassifier(random_state = 43)

model.fit(x_train, y_train)

# Class predictions for the held-out split; scored in the next cell.
pred = model.predict(x_test)

In [None]:
# Share of held-out rows whose predicted class matches the true label, for
# the model fitted in the cell above.
test_accuracy = accuracy_score(y_test, pred)
print(f"Accuracy: {test_accuracy}")

Accuracy: 0.7998106956933271


<hr>

**Iteration 2**

In [None]:
# Iteration 2: lower the learning rate to 0.01 and leave every other
# hyperparameter at its default. random_state is pinned so the fit is
# reproducible (the estimator permutes features randomly at every split).
model1 = GradientBoostingClassifier(learning_rate = 0.01, random_state = 43)

model1.fit(x_train, y_train)

# Class predictions for the held-out split; scored in the next cell.
pred = model1.predict(x_test)

In [None]:
# Held-out accuracy of the learning_rate = 0.01 model fitted above.
test_accuracy = accuracy_score(y_test, pred)
print(f"Accuracy: {test_accuracy}")

Accuracy: 0.7747278750591576


<hr>

**Iteration 3**

In [None]:
# Iteration 3: learning rate of 0.05, all other hyperparameters at their
# defaults. random_state is pinned so the fit is reproducible (the estimator
# permutes features randomly at every split).
model3 = GradientBoostingClassifier(learning_rate = 0.05, random_state = 43)

model3.fit(x_train, y_train)

# Class predictions for the held-out split; scored in the next cell.
pred = model3.predict(x_test)

In [None]:
# Held-out accuracy of the learning_rate = 0.05 model fitted above.
test_accuracy = accuracy_score(y_test, pred)
print(f"Accuracy: {test_accuracy}")

Accuracy: 0.7998106956933271


<hr>

**Iteration 4**

In [None]:
# Explicit learning rate of 0.1. This equals the estimator's documented
# default, so the configuration matches the all-defaults baseline.
# random_state is pinned so that identical configurations give identical
# scores (the estimator permutes features randomly at every split).
model4 = GradientBoostingClassifier(learning_rate = 0.1, random_state = 43)

model4.fit(x_train, y_train)

# Class predictions for the held-out split; scored in the next cell.
pred = model4.predict(x_test)

In [None]:
# Held-out accuracy of the learning_rate = 0.1 model fitted above.
test_accuracy = accuracy_score(y_test, pred)
print(f"Accuracy: {test_accuracy}")

Accuracy: 0.7993374349266446


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the grid search: every combination of these values is
# evaluated (4 learning rates x 3 ensemble sizes x 4 tree depths).
params = dict(
    learning_rate = [0.01, 0.02, 0.03, 0.05],
    n_estimators = [100, 50, 120],
    max_depth = [3, 5, 7, 9],
)

In [None]:
# Template estimator for the grid search. Its random_state is carried over to
# every candidate fitted during the search, so the cross-validation scores
# and the selected parameters are reproducible between runs.
model5 = GradientBoostingClassifier(random_state = 43)

In [None]:
# Exhaustive search over `params` with 5-fold cross-validation; verbose = 3
# logs the score and fit time of every individual fold.
grid = GridSearchCV(
    estimator=model5,
    param_grid=params,
    cv=5,
    verbose=3,
)

In [None]:
# Run the search on the training split only; the held-out test split is not
# seen during cross-validation.
grid.fit(x_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.777 total time=   0.9s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.772 total time=   0.8s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.766 total time=   0.9s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.784 total time=   1.1s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.769 total time=   0.8s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.735 total time=   0.4s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.735 total time=   0.4s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.735 total time=   0.4s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.735 total time=   0.4s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.734 to

In [None]:
# Parameter combination selected by the grid search.
grid.best_params_

{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 120}

In [None]:
# Estimator built from the best parameters found by the search.
# Note: this rebinds `model`, replacing the Iteration 1 baseline bound to the
# same name earlier in the notebook.
model = grid.best_estimator_

In [None]:
# Display the tuned estimator and its configuration.
model

In [None]:
# Class predictions of the tuned estimator for the held-out test features.
# x_test is used because it is the feature split this notebook defines for
# evaluation.
model.predict(x_test)

In [None]:
# Final model: refit with the hyperparameters reported by grid.best_params_
# (learning_rate 0.05, 120 trees, depth 3). random_state is pinned so the
# reported accuracy is reproducible (the estimator permutes features randomly
# at every split).
model6 = GradientBoostingClassifier(
    learning_rate = 0.05,
    n_estimators = 120,
    max_depth = 3,
    random_state = 43,
)

model6.fit(x_train, y_train)

# Class predictions for the held-out split; scored in the next cell.
pred = model6.predict(x_test)

In [None]:
# Held-out accuracy of the final tuned model fitted above.
test_accuracy = accuracy_score(y_test, pred)
print(f"Accuracy: {test_accuracy}")

Accuracy: 0.8021769995267393
