Imports for Project

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func
from sqlalchemy.ext.declarative import declarative_base
from config import db_password

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from tensorflow import keras

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import recall_score, confusion_matrix, precision_score, f1_score, accuracy_score, classification_report, ConfusionMatrixDisplay



In [35]:

engine = sqlalchemy.create_engine(
    sqlalchemy.engine.url.URL(
        drivername="postgresql",
        username="postgres",
        password=db_password,
        host="localhost",
        port="5432",
        database="Final_Project",
    ),
    echo_pool=True,
)
print("connecting with engine " + str(engine))
connection = engine.connect()
query = "select * From Teleco"
df = pd.read_sql_query(query, connection)

connecting with engine Engine(postgresql://postgres:***@localhost:5432/Final_Project)


  


In [36]:
print (df)


      customerID  gender  SeniorCitizen  Partner  Dependents  tenure  \
0     customerID  gender  SeniorCitizen  Partner  Dependents  tenure   
1     7590-VHVEG  Female              0      Yes          No       1   
2     5575-GNVDE    Male              0       No          No      34   
3     3668-QPYBK    Male              0       No          No       2   
4     7795-CFOCW    Male              0       No          No      45   
...          ...     ...            ...      ...         ...     ...   
7039  6840-RESVB    Male              0      Yes         Yes      24   
7040  2234-XADUH  Female              0      Yes         Yes      72   
7041  4801-JZAZL  Female              0      Yes         Yes      11   
7042  8361-LTMKD    Male              1      Yes          No       4   
7043  3186-AJIEK    Male              0       No          No      66   

      PhoneService     MultipleLines  InternetService  OnlineSecurity  ...  \
0     PhoneService     MultipleLines  InternetService  On

Data Preprocessing

Step 1 of Data Preprocessing: Verify that there is no null values.


In [37]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Description of data and data types

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7044 entries, 0 to 7043
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   customerID        7044 non-null   object
 1   gender            7044 non-null   object
 2   SeniorCitizen     7044 non-null   object
 3   Partner           7044 non-null   object
 4   Dependents        7044 non-null   object
 5   tenure            7044 non-null   object
 6   PhoneService      7044 non-null   object
 7   MultipleLines     7044 non-null   object
 8   InternetService   7044 non-null   object
 9   OnlineSecurity    7044 non-null   object
 10  OnlineBackup      7044 non-null   object
 11  DeviceProtection  7044 non-null   object
 12  TechSupport       7044 non-null   object
 13  StreamingTV       7044 non-null   object
 14  StreamingMovies   7044 non-null   object
 15  Contract          7044 non-null   object
 16  PaperlessBilling  7044 non-null   object
 17  PaymentMethod 

Step 2 of Data Preprocessing: Categorical Encoding and Convert Objects to Integers

In [39]:
def object_to_int(dataframe_series):
    if dataframe_series.dtype=='object':
        dataframe_series = LabelEncoder().fit_transform(dataframe_series)
    return dataframe_series

In [40]:
df = df.apply(lambda x: object_to_int(x))
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7043,2,2,1,0,73,1,0,2,2,...,0,2,2,2,0,1,4,1585,6531,0
1,5375,0,0,2,1,1,0,2,0,0,...,1,0,0,0,1,2,2,446,2505,1
2,3962,1,0,0,1,28,2,1,0,3,...,3,0,0,0,2,0,3,802,1466,1
3,2564,1,0,0,1,12,2,1,0,3,...,1,0,0,0,1,2,3,740,157,2
4,5535,1,0,0,1,40,0,2,0,3,...,3,3,0,0,2,0,0,570,1400,1


The variable we are trying to predict (our output variable 'y') is Churn

In [41]:
X = df.drop(columns = ['Churn'])
y = df['Churn'].values

Models Chosen to Test:

Logistic Regression 
Pros = Interpretable and explainable, less prone to overfitting when using regularization, applicable for multi-class predictions.
Cons = Assumes linearity between inputs and outputs

Decision Tree
Pros: Explainable and Interpretable, can handle missing values.
Cons: Prone to overfitting, sensitive to outliers.

XGBoost
Pros: Provides accurate results, captures non linear relationships.
Cons: Hyperparameter tuning can be complex, does not perform well on sparse datasets.



Step 3 of Data Preprocessing: Define your Training Set and Testing Set. We split randomly into training and testing sets, with a 70-30 ratio as seen below! By setting random_state equal to an integer, each execution will provide the same results.

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30, random_state = 30)

In [43]:
def evalmetrics(ytest, ypred):
    acc = accuracy_score(ytest, ypred) * 100
    print(classification_report(ytest, ypred))
    print('\n', 'Accuracy = {:0.2f}%.'.format(acc))
    ConfusionMatrixDisplay.from_predictions(ytest, ypred)
    return 

Scaling for Columns with values not in the range of 0 and 1

In [44]:
scaler= StandardScaler()
number_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
X_train[number_cols] = scaler.fit_transform(X_train[number_cols])
X_test[number_cols] = scaler.transform(X_test[number_cols])

In [45]:
models = [
    XGBClassifier(),
    DecisionTreeClassifier(),
    LogisticRegression(),
    
]

In [46]:
for i in models:
    i.fit(X_train, y_train)
    pred_test = i.predict(X_test)
    print ("Accuracy for {} is {}".format(i,metrics.accuracy_score(y_test, pred_test)))



Accuracy for XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None) is 0.7748344370860927
Accuracy for DecisionTreeClassifier() is 0.727530747398297
Accuracy for LogisticRegression() is 0.7743614001892147


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
