In [1]:
# imports
import pandas as pd
import numpy as np
from env import host, user, password
import acquire
import prepare

In [2]:
# Load the Telco churn dataset through the project-local `acquire` module.
# NOTE(review): presumably this queries the database with the credentials
# imported from env — confirm in acquire.py.
telco_df = acquire.get_telco_data()

In [3]:
# Summarize the raw frame: row count, column names, non-null counts and dtypes.
telco_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

In [4]:
# Count distinct customer_id values; if this equals the row count, the IDs are
# unique and can serve as the index.
telco_df['customer_id'].nunique()

7043

In [5]:
# Promote customer_id from a regular column to the row index.
# verify_integrity=True makes pandas raise ValueError on duplicate IDs, so the
# uniqueness of the index is enforced instead of only being checked by eye.
telco_df = telco_df.set_index('customer_id', verify_integrity=True)

In [6]:
# Confirm customer_id is now the index and no longer listed among the columns.
telco_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7043 entries, 0016-QLJIS to 9986-BONCE
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   gender                    7043 non-null   object 
 4   senior_citizen            7043 non-null   int64  
 5   partner                   7043 non-null   object 
 6   dependents                7043 non-null   object 
 7   tenure                    7043 non-null   int64  
 8   phone_service             7043 non-null   object 
 9   multiple_lines            7043 non-null   object 
 10  online_security           7043 non-null   object 
 11  online_backup             7043 non-null   object 
 12  device_protection         7043 non-null   object 
 13  tech_support              7043 non-null   object 
 14

In [None]:
# Are there any null values hidden as blank strings?
# Treat every empty or whitespace-only cell ('', ' ', '   ', ...) as missing.
# A regex is used because the literal ' ' only matches a single space exactly.
telco_df = telco_df.replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Re-check the non-null counts: blanks converted to NaN now show up as missing.
telco_df.info()

In [None]:
# Discard every row that contains at least one missing value,
# then re-inspect the frame to confirm the remaining non-null counts.
telco_df = telco_df.dropna(axis='index', how='any')
telco_df.info()

In [None]:
# Binary auto-pay flag derived from payment_type_id:
#   ids 3 and 4 -> 1 (auto-pay), ids 1 and 2 -> 0 (not auto-pay)
auto_pay_codes = {
    1: 0,
    2: 0,
    3: 1,
    4: 1,
}
telco_df['auto_pay'] = telco_df['payment_type_id'].replace(auto_pay_codes)

In [None]:
# DSL flag derived from internet_service_type_id:
#   id 1 -> 1 (has DSL), ids 2 and 3 -> 0 (no DSL)
dsl_codes = {
    1: 1,
    2: 0,
    3: 0,
}
telco_df['dsl'] = telco_df['internet_service_type_id'].replace(dsl_codes)

In [None]:
# Fiber flag derived from internet_service_type_id:
#   id 2 -> 1 (has Fiber service), ids 1 and 3 -> 0 (no Fiber)
fiber_codes = {
    1: 0,
    2: 1,
    3: 0,
}
telco_df['fiber'] = telco_df['internet_service_type_id'].replace(fiber_codes)

In [None]:
# Internet flag derived from internet_service_type_id:
#   ids 1 and 2 -> 1 (has internet), id 3 -> 0 (no internet service)
internet_codes = {
    1: 1,
    2: 1,
    3: 0,
}
telco_df['has_internet'] = telco_df['internet_service_type_id'].replace(internet_codes)

In [None]:
# Separate contract_type_id into individual flag columns, starting with m2m.
# Month-to-month flag:
#   id 1 -> 1 (month-to-month service), ids 2 and 3 -> 0 (on a contract)
m2m_codes = {
    1: 1,
    2: 0,
    3: 0,
}
telco_df['m2m'] = telco_df['contract_type_id'].replace(m2m_codes)

In [None]:
# One-year contract flag derived from contract_type_id:
#   id 2 -> 1 (one-year contract), ids 1 and 3 -> 0 (anything else)
one_year_codes = {
    1: 0,
    2: 1,
    3: 0,
}
telco_df['one_year'] = telco_df['contract_type_id'].replace(one_year_codes)

In [None]:
# Two-year contract flag derived from contract_type_id:
#   id 3 -> 1 (two-year contract), ids 1 and 2 -> 0 (less than a two-year contract)
two_year_codes = {
    1: 0,
    2: 0,
    3: 1,
}
telco_df['two_year'] = telco_df['contract_type_id'].replace(two_year_codes)

In [None]:
# Contract flag derived from contract_type_id:
#   ids 2 and 3 -> 1 (has a contract), id 1 -> 0 (no contract)
contract_codes = {
    1: 0,
    2: 1,
    3: 1,
}
telco_df['has_contract'] = telco_df['contract_type_id'].replace(contract_codes)

In [None]:
# Encode gender as an integer flag: 1 = male, 0 = female.
# .map is used instead of .replace: replacing strings in an object column only
# yields integers through implicit downcasting, which pandas deprecated in 2.2.
# With .map, any label missing from the dict becomes NaN instead of passing through.
telco_df['is_male'] = telco_df['gender'].map({'Male': 1, 'Female': 0})

In [None]:
# Shared lookup that turns the Yes/No-style answers into 1/0 flags.
# Both "No ... service" answers are encoded as 0, the same as a plain 'No'.
rep_dict = dict(
    zip(
        ('No', 'Yes', 'No internet service', 'No phone service'),
        (0, 1, 0, 0),
    )
)

In [None]:
# has_partner flag: 1 = has a partner, 0 = no partner.
# .map (not .replace) so the result is numeric without relying on the implicit
# object downcast that pandas deprecated in 2.2; unmapped labels become NaN.
telco_df['has_partner'] = telco_df['partner'].map(rep_dict)

In [None]:
# has_dep flag: 1 = has dependents, 0 = no dependents.
# .map (not .replace) so the result is numeric without relying on the implicit
# object downcast that pandas deprecated in 2.2; unmapped labels become NaN.
telco_df['has_dep'] = telco_df['dependents'].map(rep_dict)

In [None]:
# Add a tenure_months column holding the same values as tenure, so the unit
# (months) is explicit in the name. This is a copy, not a rename: the original
# tenure column is kept.
telco_df['tenure_months'] = telco_df['tenure']

In [None]:
# has_phone flag: 1 = has phone service, 0 = no phone service.
# .map (not .replace) so the result is numeric without relying on the implicit
# object downcast that pandas deprecated in 2.2; unmapped labels become NaN.
telco_df['has_phone'] = telco_df['phone_service'].map(rep_dict)

In [None]:
# multi_phone flag: 1 = multiple phone lines, 0 = one or fewer phone lines
# ('No' and 'No phone service' both encode to 0 via rep_dict).
# .map (not .replace) so the result is numeric without relying on the implicit
# object downcast that pandas deprecated in 2.2; unmapped labels become NaN.
telco_df['multi_phone'] = telco_df['multiple_lines'].map(rep_dict)

In [None]:
# has_security flag: 1 = has online security, 0 = no online security
# ('No' and 'No internet service' both encode to 0 via rep_dict).
# .map (not .replace) so the result is numeric without relying on the implicit
# object downcast that pandas deprecated in 2.2; unmapped labels become NaN.
telco_df['has_security'] = telco_df['online_security'].map(rep_dict)

In [None]:
# has_backup flag: 1 = has online backup, 0 = no online backup.
# .map (not .replace) so the result is numeric without relying on the implicit
# object downcast that pandas deprecated in 2.2; unmapped labels become NaN.
telco_df['has_backup'] = telco_df['online_backup'].map(rep_dict)

In [None]:
# has_protection flag: 1 = has device protection, 0 = no device protection.
# .map (not .replace) so the result is numeric without relying on the implicit
# object downcast that pandas deprecated in 2.2; unmapped labels become NaN.
telco_df['has_protection'] = telco_df['device_protection'].map(rep_dict)

In [None]:
# has_support flag: 1 = has tech support, 0 = no tech support.
# .map (not .replace) so the result is numeric without relying on the implicit
# object downcast that pandas deprecated in 2.2; unmapped labels become NaN.
telco_df['has_support'] = telco_df['tech_support'].map(rep_dict)

In [None]:
# stream_tv flag: 1 = streams TV, 0 = no TV streaming.
# .map (not .replace) so the result is numeric without relying on the implicit
# object downcast that pandas deprecated in 2.2; unmapped labels become NaN.
telco_df['stream_tv'] = telco_df['streaming_tv'].map(rep_dict)

In [None]:
# stream_movies flag: 1 = streams movies, 0 = no movie streaming.
# .map (not .replace) so the result is numeric without relying on the implicit
# object downcast that pandas deprecated in 2.2; unmapped labels become NaN.
telco_df['stream_movies'] = telco_df['streaming_movies'].map(rep_dict)

In [None]:
# has_paperless flag: 1 = paperless billing, 0 = no paperless billing.
# .map (not .replace) so the result is numeric without relying on the implicit
# object downcast that pandas deprecated in 2.2; unmapped labels become NaN.
telco_df['has_paperless'] = telco_df['paperless_billing'].map(rep_dict)

In [None]:
# Cast total_charges to float.
# NOTE(review): presumably the column arrives as strings and its blank entries
# were already turned into NaN and dropped by earlier cells — confirm.
telco_df = telco_df.astype({'total_charges': float})

In [None]:
# has_churn flag: 1 = customer churned, 0 = customer did not churn.
# .map (not .replace) so the result is numeric without relying on the implicit
# object downcast that pandas deprecated in 2.2; unmapped labels become NaN.
telco_df['has_churn'] = telco_df['churn'].map(rep_dict)

In [None]:
# Inspect the frame after adding the encoded columns: check the new columns'
# dtypes and non-null counts.
telco_df.info()

In [None]:
# Gender -> integer, then retire the text column.
# Lookup used for the encoding: 'Male' -> 1, 'Female' -> 0.
gender = dict(Male=1, Female=0)
# Write the encoded values to is_male and drop the original 'gender' column
# in a single chained expression.
telco_df = (
    telco_df
    .assign(is_male=telco_df['gender'].map(gender))
    .drop(columns=['gender'])
)

In [None]:
# Partner -> integer, then retire the text column.
# Lookup used for the encoding: 'Yes' -> 1, 'No' -> 0.
partner = dict(Yes=1, No=0)
# Write the encoded values to has_partner (1 = has partner, 0 = no partner)
# and drop the original 'partner' column in a single chained expression.
telco_df = (
    telco_df
    .assign(has_partner=telco_df['partner'].map(partner))
    .drop(columns=['partner'])
)

In [None]:
# Dependents -> integer, then retire the text column.
# Lookup used for the encoding: 'Yes' -> 1, 'No' -> 0.
dependents = dict(Yes=1, No=0)
# Write the encoded values to has_dependents (1 = has dependents, 0 = none)
# and drop the original 'dependents' column in a single chained expression.
# NOTE(review): an earlier cell already derives has_dep from the same source
# column, so the frame ends up with both has_dep and has_dependents — confirm
# whether both are wanted.
telco_df = (
    telco_df
    .assign(has_dependents=telco_df['dependents'].map(dependents))
    .drop(columns=['dependents'])
)

In [None]:
# Phone service -> integer, then retire the text column.
# Lookup used for the encoding: 'Yes' -> 1, 'No' -> 0.
has_phone = dict(Yes=1, No=0)
# Write the encoded values to the has_phone column (1 = has phone service,
# 0 = no phone service) and drop the original 'phone_service' column.
telco_df = (
    telco_df
    .assign(has_phone=telco_df['phone_service'].map(has_phone))
    .drop(columns=['phone_service'])
)

In [None]:
# Multiple phone lines -> 3-level integer code, then retire the text column.
# The position of each label in the tuple is its code:
#   'No phone service' -> 0, 'No' -> 1, 'Yes' -> 2
qty_phone = {
    label: code
    for code, label in enumerate(('No phone service', 'No', 'Yes'))
}
# Write the codes to the qty_phone column (0 = no phone service, 1 = a single
# line, 2 = multiple lines) and drop the original 'multiple_lines' column.
telco_df = (
    telco_df
    .assign(qty_phone=telco_df['multiple_lines'].map(qty_phone))
    .drop(columns=['multiple_lines'])
)

In [None]:
# Online security -> 3-level integer code, then retire the text column.
# The position of each label in the tuple is its code:
#   'No internet service' -> 0, 'No' -> 1, 'Yes' -> 2
online_3 = {
    label: code
    for code, label in enumerate(('No internet service', 'No', 'Yes'))
}
# Write the codes to online_sec and drop the original 'online_security' column.
telco_df = (
    telco_df
    .assign(online_sec=telco_df['online_security'].map(online_3))
    .drop(columns=['online_security'])
)

In [None]:
# Convert 'online_backup' to an integer code, overwriting the column in place.
# Lookup: 'No internet service' -> 0, 'No' -> 1, 'Yes' -> 2.
online_3 = {
    'No internet service': 0,
    'No': 1,
    'Yes': 2
}
# Only encode while the column still holds text labels. Because the column is
# overwritten in place, re-running this cell without the guard would map the
# already-encoded integers a second time and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(telco_df['online_backup']):
    telco_df['online_backup'] = telco_df['online_backup'].map(online_3)

In [None]:
# Count distinct values per column.
# NOTE(review): presumably used to tell low-cardinality (categorical/encoded)
# columns apart from continuous ones — confirm.
telco_df.nunique()

In [None]:
# Names set aside as the non-categorical variables (identifier plus the
# continuous measures).
# NOTE(review): customer_id was moved into the index by an earlier cell, so it is
# no longer a column; telco_df[cont_vars] would raise KeyError — confirm intent.
cont_vars = [
    'customer_id',
    'tenure',
    'monthly_charges',
    'total_charges',
]