Imports for Project

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func
from sqlalchemy.ext.declarative import declarative_base
from config import db_password

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from tensorflow import keras

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import recall_score, confusion_matrix, precision_score, f1_score, accuracy_score, classification_report, ConfusionMatrixDisplay



In [9]:

engine = sqlalchemy.create_engine(
    sqlalchemy.engine.url.URL(
        drivername="postgresql",
        username="postgres",
        password=db_password,
        host="localhost",
        port="5432",
        database="Final_Project",
    ),
    echo_pool=True,
)
print("connecting with engine " + str(engine))
connection = engine.connect()
query = "select * From Teleco"
df = pd.read_sql_query(query, connection)

connecting with engine Engine(postgresql://postgres:***@localhost:5432/Final_Project)


  


In [10]:
print (df)


      customerID  gender SeniorCitizen Partner Dependents tenure PhoneService  \
0     7590-VHVEG  Female             0     Yes         No      1           No   
1     5575-GNVDE    Male             0      No         No     34          Yes   
2     3668-QPYBK    Male             0      No         No      2          Yes   
3     7795-CFOCW    Male             0      No         No     45           No   
4     9237-HQITU  Female             0      No         No      2          Yes   
...          ...     ...           ...     ...        ...    ...          ...   
7038  6840-RESVB    Male             0     Yes        Yes     24          Yes   
7039  2234-XADUH  Female             0     Yes        Yes     72          Yes   
7040  4801-JZAZL  Female             0     Yes        Yes     11           No   
7041  8361-LTMKD    Male             1     Yes         No      4          Yes   
7042  3186-AJIEK    Male             0      No         No     66          Yes   

         MultipleLines Inte

Data Preprocessing

Step 1 of Data Preprocessing: Verify that there is no null values.


In [11]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Description of data and data types

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   customerID        7043 non-null   object
 1   gender            7043 non-null   object
 2   SeniorCitizen     7043 non-null   object
 3   Partner           7043 non-null   object
 4   Dependents        7043 non-null   object
 5   tenure            7043 non-null   object
 6   PhoneService      7043 non-null   object
 7   MultipleLines     7043 non-null   object
 8   InternetService   7043 non-null   object
 9   OnlineSecurity    7043 non-null   object
 10  OnlineBackup      7043 non-null   object
 11  DeviceProtection  7043 non-null   object
 12  TechSupport       7043 non-null   object
 13  StreamingTV       7043 non-null   object
 14  StreamingMovies   7043 non-null   object
 15  Contract          7043 non-null   object
 16  PaperlessBilling  7043 non-null   object
 17  PaymentMethod 

Step 2 of Data Preprocessing: Categorical Encoding and Convert Objects to Integers

In [13]:
def object_to_int(dataframe_series):
    if dataframe_series.dtype=='object':
        dataframe_series = LabelEncoder().fit_transform(dataframe_series)
    return dataframe_series

In [14]:
df = df.apply(lambda x: object_to_int(x))
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,5375,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,2,446,2505,0
1,3962,1,0,0,0,28,1,0,0,2,...,2,0,0,0,1,0,3,802,1466,0
2,2564,1,0,0,0,12,1,0,0,2,...,0,0,0,0,0,1,3,740,157,1
3,5535,1,0,0,0,40,0,1,0,2,...,2,2,0,0,1,0,0,570,1400,0
4,6511,0,0,0,0,12,1,0,1,0,...,0,0,0,0,0,1,2,1033,925,1


The variable we are trying to predict (our output variable 'y') is Churn

In [15]:
X = df.drop(columns = ['Churn'])
y = df['Churn'].values

Models Chosen to Test:

Logistic Regression 
Pros = Interpretable and explainable, less prone to overfitting when using regularization, applicable for multi-class predictions.
Cons = Assumes linearity between inputs and outputs

Decision Tree
Pros: Explainable and Interpretable, can handle missing values.
Cons: Prone to overfitting, sensitive to outliers.

XGBoost
Pros: Provides accurate results, captures non linear relationships.
Cons: Hyperparameter tuning can be complex, does not perform well on sparse datasets.



Step 3 of Data Preprocessing: Define your Training Set and Testing Set. We split randomly into training and testing sets, with a 70-30 ratio as seen below! By setting random_state equal to an integer, each execution will provide the same results.

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30, random_state = 30)

In [22]:
def evalmetrics(ytest, ypred):
    acc = accuracy_score(ytest, ypred) * 100
    print(classification_report(ytest, ypred))
    print('\n', 'Accuracy = {:0.2f}%.'.format(acc))
    ConfusionMatrixDisplay.from_predictions(ytest, ypred)
    return 

Scaling for Columns with values not in the range of 0 and 1

In [23]:
scaler= StandardScaler()
number_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
X_train[number_cols] = scaler.fit_transform(X_train[number_cols])
X_test[number_cols] = scaler.transform(X_test[number_cols])

In [24]:
models = [
    XGBClassifier(),
    DecisionTreeClassifier(),
    LogisticRegression(),
    
]

In [25]:
for i in models:
    i.fit(X_train, y_train)
    pred_test = i.predict(X_test)
    print ("Accuracy for {} is {}".format(i,metrics.accuracy_score(y_test, pred_test)))



Accuracy for XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None) is 0.7695220066256507
Accuracy for DecisionTreeClassifier() is 0.7150970184571699
Accuracy for LogisticRegression() is 0.7780407004259347
