In [40]:
import pandas as pd
import numpy as np
from acquire import get_telecom_data

In [41]:
# Load the dataset through the acquire helper imported above.
# NOTE(review): presumably returns a pandas DataFrame — later cells call
# .head() and select columns on it; confirm against acquire.py.
df = get_telecom_data()

In [46]:
def telco_data_prep(data_split=False):
    '''
    Load the telco data as a pandas DataFrame and prepare it for modeling.

    Steps, in order:
      1. Cast total_charges to float, treating blank strings as missing.
      2. Drop the rows whose total_charges is missing.
      3. Recode churn from 'Yes'/'No' to 1/0.
      4. Rename tenure to tenure_in_months and add tenure_in_years.
      5. Create dummy variables (first level dropped) for the string columns
         that have 4 or fewer unique values; numeric columns that have 4 or
         fewer unique values are kept unchanged.
      6. Drop customer_id and move churn to the last column.

    data_split is accepted so existing calls keep working, but it is not
    used by this function.

    Returns the prepared DataFrame.
    '''

    # Call function to load the telco dataset. This must be the name that is
    # imported from acquire at the top of the notebook.
    df = get_telecom_data()

    # Clean and cast total_charges to float.
    # New customers that have recently signed up have value of ''. '' == 0 tenure and 0 total charges
    total_charges = df['total_charges']
    if not pd.api.types.is_numeric_dtype(total_charges):
        # Only string data needs stripping; .str would raise on a numeric column.
        total_charges = total_charges.str.strip().replace('', np.nan)
    df = df.assign(total_charges=total_charges.astype('float'))

    # Drop the observations where customers have not paid their first month with telco.
    # Restricting to total_charges keeps rows that are only missing some other value.
    df = df.dropna(subset=['total_charges'])

    # Replace target variable strings('Yes'/'No') with int's(1/0)
    df = df.assign(churn=np.where(df['churn'] == 'Yes', 1, 0))

    # Clean tenure columns
    df = df.rename(columns={'tenure': 'tenure_in_months'})
    df = df.assign(tenure_in_years=(df['tenure_in_months'] / 12).round(2))

    # Collect the columns with 4 or less unique values. churn is already
    # binary and customer_id is an identifier, so neither is encoded.
    unique_counts = df.nunique()
    encoded_columns = [
        column
        for column in unique_counts[unique_counts <= 4].index
        if column not in ('churn', 'customer_id')
    ]

    # get_dummies only expands the string/categorical columns of the selection;
    # numeric columns in the selection come back unchanged.
    encoded_attributes = pd.get_dummies(df[encoded_columns], drop_first=True)

    # Remove the originals BEFORE concatenating. Dropping after the concat
    # would also delete the unchanged numeric columns returned by get_dummies,
    # because they share their label with the original column.
    churn = df[['churn']]
    features = df.drop(columns=encoded_columns + ['customer_id', 'churn'])

    # Features first, encoded attributes next, target variable last.
    return pd.concat([features, encoded_attributes, churn], axis=1)

In [45]:
# Preview the first rows of the loaded frame to check its columns and values.
df.head()

Unnamed: 0,payment_type_id,contract_type_id,internet_service_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,paperless_billing,monthly_charges,total_charges,churn,internet_service_type,contract_type,payment_type,one_phone_line,multiple_phone_lines,no_phone
0,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,59.9,542.4,No,DSL,Month-to-month,Mailed check,0,1,0
1,4,1,1,0013-MHZWF,Female,0,No,Yes,9,Yes,...,Yes,69.4,571.45,No,DSL,Month-to-month,Credit card (automatic),1,0,0
2,1,1,1,0015-UOCOJ,Female,1,No,No,7,Yes,...,Yes,48.2,340.35,No,DSL,Month-to-month,Electronic check,1,0,0
3,1,1,1,0023-HGHWL,Male,1,No,No,1,No,...,Yes,25.1,25.1,Yes,DSL,Month-to-month,Electronic check,0,0,1
4,3,1,1,0032-PGELS,Female,0,Yes,Yes,1,No,...,No,30.5,30.5,Yes,DSL,Month-to-month,Bank transfer (automatic),0,0,1


In [49]:
# Keep only the three demographic attributes for the comparisons below.
df_demographics = df.loc[:, ['senior_citizen', 'partner', 'dependents']]

In [50]:
# Count customers by senior-citizen status (rows) and dependents (columns).
pd.crosstab(index=df_demographics['senior_citizen'],
            columns=df_demographics['dependents'])

dependents,No,Yes
senior_citizen,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3882,2019
1,1051,91


## Phone Attribute

In [54]:
# No phone service coded as 0.
# The code is written into phone_service so that, together with the two cells
# below, the column ends up as 0 = no phone, 1 = one line, 2 = multiple lines.
# Writing it into no_phone replaced that column with a mix of 0 and 'Yes'
# and left the 'No' values of phone_service uncoded.
df['phone_service'] = np.where((df['phone_service'] == 'No')
                               & (df['multiple_lines'] == 'No phone service'),
                               0,
                               df['phone_service'])

In [55]:
# Customers with phone service and a single line get the code 1;
# every other row keeps its current phone_service value.
df['phone_service'] = np.where(
    (df['phone_service'] == 'Yes') & (df['multiple_lines'] == 'No'),
    1,
    df['phone_service'],
)

In [56]:
# Customers with phone service and multiple lines get the code 2;
# every other row keeps its current phone_service value.
df['phone_service'] = np.where(
    (df['phone_service'] == 'Yes') & (df['multiple_lines'] == 'Yes'),
    2,
    df['phone_service'],
)

## Internet Attribute