In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset into a pandas DataFrame
dataset_path = r'C:\Users\tahsi\OneDrive\Desktop\python_ws\telco-churn.csv'
df = pd.read_csv(dataset_path)

# Check the columns in the dataset
print("Columns in the dataset:", df.columns)

# Define features and target
# Assuming 'Churn' is the target variable and 'customerID' is an ID column to be dropped
X = df.drop(columns=['customerID', 'Churn'], errors='ignore')
y = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)  # Convert target to binary (0 and 1)

# Handle categorical variables if any
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for Gradient Boosting
param_grid_gbm = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10]
}

# Initialize the Gradient Boosting model
gbm = GradientBoostingClassifier(random_state=42)

# Initialize Grid Search for Gradient Boosting
grid_search_gbm = GridSearchCV(estimator=gbm, param_grid=param_grid_gbm, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the Grid Search model
grid_search_gbm.fit(X_train, y_train)

# Get the best parameters and best score
best_params_gbm = grid_search_gbm.best_params_
best_score_gbm = grid_search_gbm.best_score_

print(f'Best Parameters for GBM: {best_params_gbm}')
print(f'Best Score for GBM: {best_score_gbm}')

# Train the model with the best parameters
best_gbm = grid_search_gbm.best_estimator_
best_gbm.fit(X_train, y_train)

# Predict on the test set
y_pred_gbm = best_gbm.predict(X_test)

# Evaluate the model
accuracy_gbm = accuracy_score(y_test, y_pred_gbm)
print(f'Gradient Boosting Accuracy: {accuracy_gbm}')
print('Gradient Boosting Classification Report:')
print(classification_report(y_test, y_pred_gbm))
print('Gradient Boosting Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_gbm))


Columns in the dataset: Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')
Best Parameters for GBM: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 50}
Best Score for GBM: 0.8022707608025834
Gradient Boosting Accuracy: 0.8126330731014905
Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.92      0.88      1036
           1       0.69      0.53      0.60       373

    accuracy                           0.81      1409
   macro avg       0.77      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409

Gradient Boosting Confusion Matrix:
[[949  87]
 [177 196]