In [1]:
# DS Libraries
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# knn submodules from scikit learn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression

# Data Acquisition
from pydataset import data
import env
import acquire as acq
import prepare as prp

In [2]:
# Fetch the Telco dataset through the project's acquire module (imported as `acq`)
df = acq.new_telco_data()

# Preview the first five rows as the cell's rich output
df.head(n=5)

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,3,1,3,2162-FRZAA,Male,0,Yes,Yes,63,No,...,Yes,No,No,No,39.35,2395.05,No,Two year,DSL,Bank transfer (automatic)
1,4,1,3,2160-GPFXD,Male,0,Yes,Yes,54,Yes,...,Yes,No,No,Yes,65.65,3566.7,No,Two year,DSL,Credit card (automatic)
2,2,1,2,2157-MXBJS,Male,0,Yes,No,13,Yes,...,No,Yes,Yes,Yes,75.3,989.45,Yes,One year,DSL,Mailed check
3,4,1,1,2155-AMQRX,Female,0,No,No,28,Yes,...,Yes,No,No,Yes,54.9,1505.15,No,Month-to-month,DSL,Credit card (automatic)
4,3,1,2,2150-WLKUW,Female,0,Yes,No,40,Yes,...,No,Yes,No,No,63.9,2635.0,No,One year,DSL,Bank transfer (automatic)


In [4]:
# Relative frequency of each payment_type_id value (proportions rather than raw counts)
df["payment_type_id"].value_counts(normalize=True)

1    0.335794
2    0.228880
3    0.219225
4    0.216101
Name: payment_type_id, dtype: float64

In [15]:
# Display the column labels of df
df.columns

Index(['payment_type_id', 'internet_service_type_id', 'contract_type_id',
       'customer_id', 'gender', 'senior_citizen', 'partner', 'dependents',
       'tenure', 'phone_service', 'multiple_lines', 'online_security',
       'online_backup', 'device_protection', 'tech_support', 'streaming_tv',
       'streaming_movies', 'paperless_billing', 'monthly_charges',
       'total_charges', 'churn', 'contract_type', 'internet_service_type',
       'payment_type'],
      dtype='object')

In [20]:
# Tally how often each distinct value occurs, one column at a time.
# Iterating a DataFrame directly yields its column labels.
for col in df:
    counts = df[col].value_counts()
    print(counts)

1    2365
2    1612
3    1544
4    1522
Name: payment_type_id, dtype: int64
2    3096
1    2421
3    1526
Name: internet_service_type_id, dtype: int64
1    3875
3    1695
2    1473
Name: contract_type_id, dtype: int64
2162-FRZAA    1
5680-LQOGP    1
5640-CAXOA    1
5641-DMBFJ    1
5649-TJHOV    1
             ..
4660-IRIBM    1
4670-TABXH    1
4673-KKSLS    1
4676-MQUEA    1
8909-BOLNL    1
Name: customer_id, Length: 7043, dtype: int64
Male      3555
Female    3488
Name: gender, dtype: int64
0    5901
1    1142
Name: senior_citizen, dtype: int64
No     3641
Yes    3402
Name: partner, dtype: int64
No     4933
Yes    2110
Name: dependents, dtype: int64
1     613
72    362
2     238
3     200
4     176
71    170
5     133
7     131
8     123
9     119
70    119
12    117
10    116
6     110
13    109
68    100
11     99
15     99
67     98
18     97
69     95
24     94
22     90
66     89
35     88
17     87
23     85
16     80
56     80
64     80
52     80
25     79
26     79
14     76
6

In [21]:
# Per-column value frequencies expressed as relative frequencies (normalize=True)
# instead of raw counts. Iterating a DataFrame directly yields its column labels.
for col in df:
    proportions = df[col].value_counts(normalize=True)
    print(proportions)

1    0.335794
2    0.228880
3    0.219225
4    0.216101
Name: payment_type_id, dtype: float64
2    0.439585
1    0.343746
3    0.216669
Name: internet_service_type_id, dtype: float64
1    0.550192
3    0.240664
2    0.209144
Name: contract_type_id, dtype: float64
2162-FRZAA    0.000142
5680-LQOGP    0.000142
5640-CAXOA    0.000142
5641-DMBFJ    0.000142
5649-TJHOV    0.000142
                ...   
4660-IRIBM    0.000142
4670-TABXH    0.000142
4673-KKSLS    0.000142
4676-MQUEA    0.000142
8909-BOLNL    0.000142
Name: customer_id, Length: 7043, dtype: float64
Male      0.504756
Female    0.495244
Name: gender, dtype: float64
0    0.837853
1    0.162147
Name: senior_citizen, dtype: float64
No     0.516967
Yes    0.483033
Name: partner, dtype: float64
No     0.700412
Yes    0.299588
Name: dependents, dtype: float64
1     0.087037
72    0.051399
2     0.033792
3     0.028397
4     0.024989
71    0.024137
5     0.018884
7     0.018600
8     0.017464
9     0.016896
70    0.016896
12    0.016