In [22]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import pandas as pd
import numpy as np
import env
import acquire
import prepare

## Acquire

In [2]:
# Load the telco data set through the project's acquire module
df = acquire.get_telco_data()

In [33]:
# Count how many customers fall into each churn class, then show the counts as a table
basic_churn = df['churn'].value_counts()
basic_churn_df = basic_churn.to_frame()
basic_churn_df

Unnamed: 0,churn
0,5163
1,1869


In [4]:
# Baseline churn rate: churned customers divided by all customers.
# NOTE(review): both counts are hard-coded. The value_counts table shown in this
# notebook totals 5163 + 1869 = 7032 rows, so 7043 is presumably the row count
# before cleaning -- confirm, and consider deriving both numbers from df.
churned_customers = 1869
total_customers = 7043
churn_rate_bl = churned_customers / total_customers
churn_rate_bl

0.2653698707936959

In [5]:
# Inspect the dtype of every column.
# Note: total_charges is reported as object rather than a numeric dtype.
df.dtypes

internet_service_type_id      int64
payment_type_id               int64
contract_type_id              int64
customer_id                  object
gender                       object
senior_citizen                int64
partner                      object
dependents                   object
tenure                        int64
phone_service                object
multiple_lines               object
online_security              object
online_backup                object
device_protection            object
tech_support                 object
streaming_tv                 object
streaming_movies             object
paperless_billing            object
monthly_charges             float64
total_charges                object
churn                        object
contract_type                object
payment_type                 object
internet_service_type        object
dtype: object

In [6]:
# Clean the data. Per the original note, prepare.clean_data converts total_charges
# to float and drops null/NaN rows; the helper lives in the prepare module and is
# not shown here, so verify against its source.
# NOTE(review): df is overwritten, so re-running this cell passes the already
# cleaned frame back into the helper.
df = prepare.clean_data(df)

In [7]:
# Encode the churn column (per the original note; the work happens inside
# prepare.encoder, which is not shown in this notebook)
df = prepare.encoder(df)

In [8]:
# Baseline model inputs: three numeric features and the churn target.
# NOTE(review): the split cell that follows is called with df itself, so x_bl / y_bl
# are not referenced by the later cells visible in this part of the notebook.
baseline_features = ['tenure', 'monthly_charges', 'total_charges']
baseline_target = ['churn']
x_bl = df[baseline_features]
y_bl = df[baseline_target]

In [11]:
# Split the data into X (multiple feature variables), y (target), and train/test subsets.
# The "_bl" splitter is the one in use (presumably the baseline variant -- confirm in
# the prepare module); the general splitter is kept commented out below for reference.
#X, y, X_train, X_test, y_train, y_test = prepare.split_my_data(df)
X, y, X_train, X_test, y_train, y_test = prepare.split_my_data_bl(df)

In [12]:
# Shallow entropy-based decision tree; the fixed random_state keeps repeated fits comparable
tree_settings = {'criterion': 'entropy', 'max_depth': 3, 'random_state': 123}
clf = DecisionTreeClassifier(**tree_settings)
clf.fit(X_train, y_train)


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

**Task:** I need to select which variables to use, pick a model, and run it on the data.

In [18]:
# Predict the churn class for every row of the training set

y_pred = clf.predict(X_train)

# Estimate the per-class probabilities for the same training rows

y_pred_proba = clf.predict_proba(X_train)

In [15]:
# Mean accuracy of the fitted tree on the data it was trained on
train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Decision Tree classifier on training set: 0.79


In [19]:
# Confusion matrix for the training-set predictions; scikit-learn puts the actual
# classes on the rows and the predicted classes on the columns
confusion_matrix(y_train, y_pred)


array([[3827,  306],
       [ 901,  591]])

In [23]:
# Per-class precision, recall and F1 for the training-set predictions
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.81      0.93      0.86      4133
           1       0.66      0.40      0.49      1492

    accuracy                           0.79      5625
   macro avg       0.73      0.66      0.68      5625
weighted avg       0.77      0.79      0.77      5625



In [29]:
# function to encode all variables
def encoder_all(df, encode_cols=None, drop_cols=None):
    """Return a label-encoded copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. drop the columns in ``drop_cols``;
      2. convert ``total_charges`` to a numeric dtype, turning values that
         cannot be parsed into NaN (``errors='coerce'``);
      3. replace every column in ``encode_cols`` with integer codes from a
         ``LabelEncoder`` fitted on that column alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``total_charges`` plus every column named in
        ``drop_cols`` and ``encode_cols``.
    encode_cols : iterable of str, optional
        Columns to label-encode. Defaults to the notebook's original list of
        categorical columns (including ``churn``).
    drop_cols : iterable of str, optional
        Columns to remove before encoding. Defaults to the three text
        description columns (presumably redundant with the matching ``*_id``
        columns -- confirm against the data source).

    Returns
    -------
    pandas.DataFrame
        New frame with the dropped columns removed and the encoded columns
        holding integer codes.

    Raises
    ------
    KeyError
        If a column named in ``drop_cols`` or ``encode_cols``, or
        ``total_charges``, is missing from ``df``.
    """
    if drop_cols is None:
        drop_cols = ('contract_type', 'internet_service_type', 'payment_type')
    if encode_cols is None:
        encode_cols = (
            'gender', 'partner', 'dependents', 'phone_service',
            'multiple_lines', 'online_security', 'online_backup',
            'device_protection', 'tech_support', 'streaming_tv',
            'streaming_movies', 'paperless_billing', 'churn',
        )

    # drop() without inplace returns a new frame, so the caller's df stays untouched
    data = df.drop(columns=list(drop_cols))
    data['total_charges'] = pd.to_numeric(data['total_charges'], errors='coerce')

    for col in encode_cols:
        # A fresh encoder per column: the codes of one column never depend on another
        data[col] = LabelEncoder().fit_transform(data[col])
    return data

In [30]:
# Apply encoder_all to the working frame and keep the result under a separate name
df_encoded = encoder_all(df)

In [31]:
# Preview the first rows of the encoded frame
df_encoded.head()

Unnamed: 0,internet_service_type_id,payment_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn
0,1,2,1,0003-MKNFE,1,0,0,0,9,1,...,0,0,0,0,0,2,0,59.9,542.4,0
1,1,4,1,0013-MHZWF,0,0,0,1,9,1,...,0,0,0,2,2,2,1,69.4,571.45,0
2,1,1,1,0015-UOCOJ,0,1,0,0,7,1,...,2,0,0,0,0,0,1,48.2,340.35,0
3,1,1,1,0023-HGHWL,1,1,0,0,1,0,...,0,0,0,0,0,0,1,25.1,25.1,1
4,1,3,1,0032-PGELS,0,0,1,1,1,0,...,2,0,0,0,0,0,0,30.5,30.5,1
