In [1]:
import pandas as pd 
import sklearn.metrics
from sklearn.metrics import confusion_matrix
import numpy as np
import seaborn as sns
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
import env
import acquire as acq
import prepare as prp
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier,export_text,plot_tree
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import RandomForestClassifier

----------

In [2]:
# load titanic via acquire.py
df = acq.get_titanic_data('titanic_db')
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
df = prp.prep_titanic(df)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,22.0,1,0,7.25,Southampton,0,1,0,1
1,1,1,female,38.0,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,26.0,0,0,7.925,Southampton,1,0,0,1
3,1,1,female,35.0,1,0,53.1,Southampton,0,0,0,1
4,0,3,male,35.0,0,0,8.05,Southampton,1,1,0,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 891 non-null    int64  
 1   pclass                   891 non-null    int64  
 2   sex                      891 non-null    object 
 3   age                      891 non-null    float64
 4   sibsp                    891 non-null    int64  
 5   parch                    891 non-null    int64  
 6   fare                     891 non-null    float64
 7   embark_town              891 non-null    object 
 8   alone                    891 non-null    int64  
 9   sex_male                 891 non-null    uint8  
 10  embark_town_Queenstown   891 non-null    uint8  
 11  embark_town_Southampton  891 non-null    uint8  
dtypes: float64(2), int64(5), object(2), uint8(3)
memory usage: 65.4+ KB


In [5]:
# Split dataset
train, validate, test = prp.split_data(df,'survived')
train.shape,validate.shape,test.shape

((498, 12), (214, 12), (179, 12))

In [6]:
#Determine drivers of target
train.columns[:-2]

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embark_town', 'alone', 'sex_male'],
      dtype='object')

In [7]:
cat_cols, num_cols = [], []
for col in train.columns[:-2]:
    if train[col].dtype == 'O':
        cat_cols.append(col)
    else:
        if train[col].nunique() < 10:
            cat_cols.append(col)
        else:
            num_cols.append(col)

In [8]:
cat_cols,num_cols

(['survived',
  'pclass',
  'sex',
  'sibsp',
  'parch',
  'embark_town',
  'alone',
  'sex_male'],
 ['age', 'fare'])

In [9]:
explore_cols = cat_cols + num_cols
explore_cols

['survived',
 'pclass',
 'sex',
 'sibsp',
 'parch',
 'embark_town',
 'alone',
 'sex_male',
 'age',
 'fare']

In [10]:
# Columns to potentially drop include:
#  sibsp and parch-assuming data integrity, then alone = 1 | 0 IS SUFFICIENT
#  unless specific domain knowledge says otherwise....drop Embark Town
#  sex is redundancy of sex_male

# Numerical columns to potentially bin as discretes:
#  both age and fare are potnetial candidates for binning...B/C this is Classification and not regression,
# we are not looking to determine or even use precice, exact values, we just need to know categories

'''
Not that it is surprising, but for categorical dimension except sibsp and parch, the feature class that
contained the largest number of instances also lead to the correspndingly lowest survival rate within
the feature.  For example, survival rate for 3rd class pax was lowest, but they constituted largest portion
of pax, survival rate for male was lower than female, and they also constituted largest portion of pax
within feature. 

'''

'\nNot that it is surprising, but for categorical dimension except sibsp and parch, the feature class that\ncontained the largest number of instances also lead to the correspndingly lowest survival rate within\nthe feature.  For example, survival rate for 3rd class pax was lowest, but they constituted largest portion\nof pax, survival rate for male was lower than female, and they also constituted largest portion of pax\nwithin feature. \n\n'

In [11]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
542,0,3,female,11.0,4,2,31.275,Southampton,0,0,0,1
457,1,1,female,29.699118,1,0,51.8625,Southampton,0,0,0,1
205,0,3,female,2.0,0,1,10.4625,Southampton,0,0,0,1
208,1,3,female,16.0,0,0,7.75,Queenstown,1,0,1,0
485,0,3,female,29.699118,3,1,25.4667,Southampton,0,0,0,1


In [12]:
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [13]:
train['baseline_pred'] = 0
train.tail(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,baseline_pred
288,1,2,male,42.0,0,0,13.0,Southampton,1,1,0,1,0
484,1,1,male,25.0,1,0,91.0792,Cherbourg,0,1,0,0,0
671,0,1,male,31.0,1,0,52.0,Southampton,0,1,0,1,0
191,0,2,male,19.0,0,0,13.0,Southampton,1,1,0,1,0
582,0,2,male,54.0,0,0,26.0,Southampton,1,1,0,1,0
43,1,2,female,3.0,1,2,41.5792,Cherbourg,0,0,0,0,0
120,0,2,male,21.0,2,0,73.5,Southampton,0,1,0,1,0
728,0,2,male,25.0,1,0,26.0,Southampton,0,1,0,1,0
598,0,3,male,29.699118,0,0,7.225,Cherbourg,1,1,0,0,0
522,0,3,male,29.699118,0,0,7.225,Cherbourg,1,1,0,0,0


In [14]:
pd.crosstab(train.baseline_pred,train.survived) 

survived,0,1
baseline_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,307,191


- 1 Baseline ACCURACY

In [15]:
# Baseline ACCURACY is 61.6%  
baseline_acc = (train.baseline_pred == train.survived).mean()
baseline_acc

0.6164658634538153

In [16]:
#2 MAKE FIT and USE to Training data

X_cols = train.columns.to_list()
X_cols.remove('survived')
X_cols.remove('baseline_pred')
X_cols.remove('sex')
X_cols.remove('embark_town')
y_cols = 'survived'

X_cols

['pclass',
 'age',
 'sibsp',
 'parch',
 'fare',
 'alone',
 'sex_male',
 'embark_town_Queenstown',
 'embark_town_Southampton']

In [17]:
y_cols

'survived'

In [None]:
# X_train = train[X_cols]
# y_train = train[y_cols]

In [18]:
train[y_cols].head()

542    0
457    1
205    0
208    1
485    0
Name: survived, dtype: int64

In [19]:
# MAKE
clf = DecisionTreeClassifier()
clf

In [20]:
# FIT
clf.fit(train[X_cols],train[y_cols])

In [25]:
model_preds = clf.predict(train[X_cols]) # numpy array
model_preds

array([0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,

In [26]:
model_preds.sum()

188

In [27]:
clf.score(train[X_cols],train[y_cols])

0.9939759036144579

In [28]:
# classification report:
print(
    classification_report(train[y_cols],
                      model_preds))

sklearn.metrics.accuracy_score(train[y_cols],model_preds)

sklearn.metrics.recall_score(train[y_cols],model_preds)

sklearn.metrics.precision_score(train[y_cols],model_preds)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       307
           1       1.00      0.98      0.99       191

    accuracy                           0.99       498
   macro avg       1.00      0.99      0.99       498
weighted avg       0.99      0.99      0.99       498



1.0

In [29]:
# 4 Accuracy, TruePos Rate, FalsePos rate,TrueNeg rate, falseNeg rate, precision, recall, f1, support

pd.crosstab(model_preds,train[y_cols])

#  TP | fP
#  --------
#  fn | Tn

TP = 307
FP = 3
FN = 0
TN = 188
pd.crosstab(model_preds,train[y_cols])

survived,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,307,3
1,0,188


In [30]:
sklearn.metrics.accuracy_score(train[y_cols],model_preds)

0.9939759036144579

In [31]:
# TPR == Recall
sklearn.metrics.recall_score(train[y_cols],model_preds)

0.9842931937172775

In [33]:
# FalsePosRate FP / (FP + TN) ...also, this is  1 - TrueNegRate

FP / (FP+TN)

0.015706806282722512

In [34]:
# TrueNegRate
1 - (FP / (FP+TN))

0.9842931937172775

In [35]:
# FalseNegRate FN / (TP+FP)  1- recall

1 - (sklearn.metrics.recall_score(train[y_cols],model_preds))

0.015706806282722474

In [36]:
sklearn.metrics.precision_score(train[y_cols],model_preds)

1.0

In [37]:
sklearn.metrics.recall_score(train[y_cols],model_preds)

0.9842931937172775

In [38]:
sklearn.metrics.f1_score(train[y_cols],model_preds)

0.9920844327176782

In [39]:
print(
    classification_report(train[y_cols],
                      model_preds))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       307
           1       1.00      0.98      0.99       191

    accuracy                           0.99       498
   macro avg       1.00      0.99      0.99       498
weighted avg       0.99      0.99      0.99       498



###  5 Now, run through the above with a diff maxdepth= value.

In [40]:
# MAKE
clf2 = DecisionTreeClassifier(max_depth=3)
clf2

In [41]:
# FIT
clf2.fit(train[X_cols],train[y_cols])

In [None]:
model2_preds = clf2.predict(train[X_cols]) # numpy array
model2_preds

In [None]:
model2_preds.sum()

In [None]:
clf2.score(train[X_cols],train[y_cols])

In [None]:
# classification report for clf2:
print(
    classification_report(train[y_cols],
                      model2_preds))

In [None]:
sklearn.metrics.accuracy_score(train[y_cols],model2_preds)

In [None]:
sklearn.metrics.recall_score(train[y_cols],model2_preds)

In [None]:
sklearn.metrics.precision_score(train[y_cols],model2_preds)

In [None]:
pd.crosstab(model2_preds,train[y_cols])

#  TP | fP
#  --------
#  fn | Tn

TP = 296
FP = 78
FN = 11
TN = 113
pd.crosstab(model2_preds,train[y_cols])

In [None]:
sklearn.metrics.accuracy_score(train[y_cols],model2_preds)

In [None]:
sklearn.metrics.accuracy_score(train[y_cols],model_preds)

In [None]:
# TPR == Recall
sklearn.metrics.recall_score(train[y_cols],model_preds)

In [None]:
# FalsePosRate FP / (FP + TN) ...also, this is  1 - TrueNegRate

FP / (FP+TN)

In [None]:
# TrueNegRate
1 - (FP / (FP+TN))

In [None]:
# FalseNegRate FN / (TP+FP)  1- recall

1 - (sklearn.metrics.recall_score(train[y_cols],model_preds))

In [None]:
sklearn.metrics.precision_score(train[y_cols],model_preds)

In [None]:
sklearn.metrics.recall_score(train[y_cols],model_preds)

In [None]:
sklearn.metrics.f1_score(train[y_cols],model_preds)

### 6 Which model Peformed better on the TRAIN data?

THE OVERFITTED VERSION with ACCURACY on TRAIN of 99%


In [None]:
# classification report for clf:  99% Accuracy
print(
    classification_report(train[y_cols],
                      model_preds))

In [None]:
# classification report for clf2 (maxdepth=3):  82% Accuracy
print(
    classification_report(train[y_cols],
                      model2_preds))

### 7 Which model Peformed better on the VALIDATE data?

THE maxDepth=3 VERSION with ACCURACY on VALIDATE of 78%, even more significant is the relative
smoothness of the dropoff from 82% (Train) to 78% (Validate)


In [None]:
# Define Validation cols
X_validate, y_validate = validate[X_cols], validate[y_cols]

In [None]:
# VALIDATION for OVERFIT no maxDepth
round(clf.score(X_validate, y_validate), 5)

# Notice the significant dropoff in ACCURACY from 99% TRAIN to 76% VALIDATE

In [None]:
# VALIDATION for maxDepth=3
round(clf2.score(X_validate, y_validate), 5)

# Notice the less significant dropoff in ACCURACY from 82% TRAIN to 78% VALIDATE

### Let's EXPLORE TELCO now

In [None]:
# load titanic via acquire.py
df = acq.new_telco_data()
df.head()

In [None]:
df = prp.prep_telco(df)
df.head()

In [None]:
df.iloc[:,17:33]

In [None]:
df.info()

In [None]:
bad_cols = [
    'customer_id',
    'churn_Yes',
    'internet_service_type_Fiber optic',
    'internet_service_type_None',
    'contract_type_One year',
    'contract_type_Two year',
    'payment_type_Credit card (automatic)',
    'payment_type_Electronic check',
    'payment_type_Mailed check'
]

In [None]:
df.columns

In [None]:
df.drop(columns=bad_cols,inplace=True)
df.head()

In [None]:
df.drop(columns='senior_citizen',inplace=True)
df.head()

In [None]:
# Split dataset
train, validate, test = prp.split_data(df,'churn')
train.shape,validate.shape,test.shape

In [None]:
train.columns

In [None]:
cat_cols, num_cols = [], []
for col in train.columns.to_list():
    if train[col].dtype == 'O':
        cat_cols.append(col)
    else:
        if train[col].nunique() < 10:
            cat_cols.append(col)
        else:
            num_cols.append(col)

In [None]:
cat_cols,num_cols

In [None]:
explore_cols = cat_cols + num_cols
explore_cols

In [None]:
train.churn.value_counts()

In [None]:
train['baseline_pred'] = 'Yes'
train.head(10)

In [None]:
pd.crosstab(train.baseline_pred,train.churn) 

In [None]:
# Baseline ACCURACY is 26.5%  
baseline_acc = (train.baseline_pred == train.churn).mean()
baseline_acc

In [None]:
# 2 MAKE FIT and USE to Training data

X_cols = train.columns.to_list()
X_cols.remove('churn')
X_cols.remove('gender')
X_cols

In [None]:
y_cols = ['churn']
y_cols

In [None]:
train[y_cols].head()

In [None]:
# MAKE
clf = DecisionTreeClassifier()
clf

In [None]:
# FIT
clf.fit(train[X_cols],train[y_cols])

In [None]:
model_preds = clf.predict(train[X_cols]) # numpy array
model_preds

model_preds.sum()

clf.score(train[X_cols],train[y_cols])



# classification report:
print(
    classification_report(train[y_cols],
                      model_preds))

sklearn.metrics.accuracy_score(train[y_cols],model_preds)

sklearn.metrics.recall_score(train[y_cols],model_preds)

sklearn.metrics.precision_score(train[y_cols],model_preds)

- 4 Accuracy, TruePos Rate, FalsePos rate,TrueNeg rate, falseNeg rate, precision, recall, f1, support

pd.crosstab(model_preds,train[y_cols])

#  TP | fP
#  --------
#  fn | Tn

TP = 307
FP = 3
FN = 0
TN = 188
pd.crosstab(model_preds,train[y_cols])

sklearn.metrics.accuracy_score(train[y_cols],model_preds)

# TPR == Recall
sklearn.metrics.recall_score(train[y_cols],model_preds)

# FalsePosRate FP / (FP + TN) ...also, this is  1 - TrueNegRate

FP / (FP+TN)

# TrueNegRate
1 - (FP / (FP+TN))

# FalseNegRate FN / (TP+FP)  1- recall

1 - (sklearn.metrics.recall_score(train[y_cols],model_preds))

sklearn.metrics.precision_score(train[y_cols],model_preds)

sklearn.metrics.recall_score(train[y_cols],model_preds)

sklearn.metrics.f1_score(train[y_cols],model_preds)

print(
    classification_report(train[y_cols],
                      model_preds))