# Prepare Notebook

##### This is where I write and test the prepare functions for my telco churn classification project.

In [1]:
# general imports (from big libraries)
import pandas as pd
import numpy as np
import os

In [2]:
# imports specific to prepare
from sklearn.model_selection import train_test_split

In [3]:
# my specific imports
# this brings in my login credentials to the Codeup database
from env import host, user, password
# this gives me access to my acquire functions
import acquire

In [4]:
# load the raw telco data through my acquire module
# (per my notes, get_telco_data works on top of new_telco_data)
df = acquire.get_telco_data()

In [5]:
# did we get everything? check row count, column count, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

In [6]:
def prep_1_telco(df):
    '''
    Add numeric (0/1) versions of the categorical telco columns.

    This function takes in my freshly acquired telco DataFrame and returns a
    new, wider DataFrame (the input itself is not modified). All of the old
    columns are kept on purpose so the new encodings can be compared against
    their sources to ensure data integrity.

    Steps:
      * 'customer_id' becomes the index.
      * Cells holding a single space are turned into nulls and every row
        containing a null is dropped. In the telco data these are the 11 brand
        new customers who have not received a bill yet, so their
        'total_charges' is blank.
      * Each categorical / multi-valued column gets one or more int64 0/1
        companion columns.
      * 'tenure' is copied to 'tenure_months' and 'total_charges' is converted
        to float.

    Raises ValueError if a source column holds a value that its encoding does
    not cover, instead of silently leaving a non-numeric value behind.
    '''
    # reusable dictionary for encoding 'No', 'Yes', 'No internet service', 'No phone service'
    rep_dict = {'No': 0, 'Yes': 1, 'No internet service': 0, 'No phone service': 0}
    # reset the customer_id to be the index
    df = df.set_index('customer_id')
    # This replaces empty cells (a single space) with nan (null values)
    df = df.replace(' ', np.nan)
    # drop the nulls...
    df = df.dropna(axis=0)

    def encode(column, mapping):
        # .map + astype is used instead of .replace: .replace on an object
        # column only produces integers through implicit downcasting, which
        # newer pandas deprecates. Without it the result stays 'object' and
        # the column would later be thrown away with the other object columns.
        encoded = df[column].map(mapping)
        unmapped = encoded.isna()
        if unmapped.any():
            unexpected = sorted(set(df.loc[unmapped, column].astype(str)))
            raise ValueError(f"unexpected value(s) in column '{column}': {unexpected}")
        return encoded.astype('int64')

    # convert payment types to 1:auto-pay, 0:not auto-pay
    df['auto_pay'] = encode('payment_type_id', {1: 0, 2: 0, 3: 1, 4: 1})
    # create DSL column where 1:has DSL, 0:No DSL
    df['dsl'] = encode('internet_service_type_id', {1: 1, 2: 0, 3: 0})
    # create Fiber column where 1:has Fiber service, 0:No Fiber
    df['fiber'] = encode('internet_service_type_id', {1: 0, 2: 1, 3: 0})
    # create Has Internet column where 1:Has Internet, 0:No internet service
    df['has_internet'] = encode('internet_service_type_id', {1: 1, 2: 1, 3: 0})
    # separate contract_type_id into three columns (plus a has_contract flag)...
    # create m2m column where 1:Month-to-Month service, 0:contract
    df['m2m'] = encode('contract_type_id', {1: 1, 2: 0, 3: 0})
    # create one_year column where 1:One year contract, 0:any other contract type
    df['one_year'] = encode('contract_type_id', {1: 0, 2: 1, 3: 0})
    # create two_year column where 1:two year contract, 0:any other contract type
    df['two_year'] = encode('contract_type_id', {1: 0, 2: 0, 3: 1})
    # create has_contract column where 1:has contract, 0:no contract (month-to-month)
    df['has_contract'] = encode('contract_type_id', {1: 0, 2: 1, 3: 1})
    # create column to convert gender to int 1:male, 0:female
    df['is_male'] = encode('gender', {'Male': 1, 'Female': 0})
    # create has_partner column where 1:has partner, 0:no partner
    df['has_partner'] = encode('partner', rep_dict)
    # create has_dep column where 1:has dependents, 0:no dependents
    df['has_dep'] = encode('dependents', rep_dict)
    # better identify tenure in months with a clearer column name...
    df['tenure_months'] = df['tenure']
    # create has_phone column where 1:has phone, 0:no phone
    df['has_phone'] = encode('phone_service', rep_dict)
    # create multi_phone column where 1:multiple phone lines, 0:One or fewer phone lines
    df['multi_phone'] = encode('multiple_lines', rep_dict)
    # create has_security column where 1:has online security, 0:no security
    df['has_security'] = encode('online_security', rep_dict)
    # create has_backup column where 1:has online backup, 0:no backup
    df['has_backup'] = encode('online_backup', rep_dict)
    # create has_protection column where 1:has device protection, 0:no device protection
    df['has_protection'] = encode('device_protection', rep_dict)
    # create has_support column where 1:has tech support, 0:no tech support
    df['has_support'] = encode('tech_support', rep_dict)
    # create stream_tv column where 1:streams tv, 0:no streaming tv
    df['stream_tv'] = encode('streaming_tv', rep_dict)
    # create stream_movies column where 1:streams movies, 0:no streaming movies
    df['stream_movies'] = encode('streaming_movies', rep_dict)
    # create has_paperless column where 1:has paperless billing, 0:no paperless billing
    df['has_paperless'] = encode('paperless_billing', rep_dict)
    # convert total_charges to float (it arrives as text because of the blank cells)
    df['total_charges'] = df['total_charges'].astype(float)
    # create has_churn column where 1:has churn, 0:no churn
    df['has_churn'] = encode('churn', rep_dict)

    return df

In [7]:
# did I lose any existing info?
# did I create all the appropriate columns?
# NOTE: this rebinds df, so the cell can only be run once per fresh df —
# on a second run 'customer_id' is already the index and set_index raises KeyError.
# Re-run the acquire cell first if this needs to be repeated.
df = prep_1_telco(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0016-QLJIS to 9986-BONCE
Data columns (total 45 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7032 non-null   int64  
 1   internet_service_type_id  7032 non-null   int64  
 2   contract_type_id          7032 non-null   int64  
 3   gender                    7032 non-null   object 
 4   senior_citizen            7032 non-null   int64  
 5   partner                   7032 non-null   object 
 6   dependents                7032 non-null   object 
 7   tenure                    7032 non-null   int64  
 8   phone_service             7032 non-null   object 
 9   multiple_lines            7032 non-null   object 
 10  online_security           7032 non-null   object 
 11  online_backup             7032 non-null   object 
 12  device_protection         7032 non-null   object 
 13  tech_support              7032 non-null   object 
 14

In [8]:
def prep_2_telco(df):
    '''
    Trim the wide telco DataFrame produced by prep_1_telco down to its
    numeric columns.

    Two groups of columns are removed:
      * every column with dtype 'object', and
      * the integer columns that prep_1_telco already re-expressed as new
        columns: 'payment_type_id', 'internet_service_type_id',
        'contract_type_id' and 'tenure'.

    Returns a new DataFrame; the DataFrame passed in is left as it was.
    '''
    # integer columns that now exist in re-encoded form
    superseded = ['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'tenure']
    # names of every object-dtype column
    text_columns = df.select_dtypes('object').columns.tolist()

    # drop both groups in a single pass
    return df.drop(columns=text_columns + superseded)

In [9]:
# let's see how removing the object / duplicate columns looks
# NOTE: this rebinds df, so the cell is single-shot — on a second run the
# id / tenure columns are already gone and df.drop raises KeyError.
df = prep_2_telco(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0016-QLJIS to 9986-BONCE
Data columns (total 25 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   senior_citizen   7032 non-null   int64  
 1   monthly_charges  7032 non-null   float64
 2   total_charges    7032 non-null   float64
 3   auto_pay         7032 non-null   int64  
 4   dsl              7032 non-null   int64  
 5   fiber            7032 non-null   int64  
 6   has_internet     7032 non-null   int64  
 7   m2m              7032 non-null   int64  
 8   one_year         7032 non-null   int64  
 9   two_year         7032 non-null   int64  
 10  has_contract     7032 non-null   int64  
 11  is_male          7032 non-null   int64  
 12  has_partner      7032 non-null   int64  
 13  has_dep          7032 non-null   int64  
 14  tenure_months    7032 non-null   int64  
 15  has_phone        7032 non-null   int64  
 16  multi_phone      7032 non-null   int64  
 17  has_

In [16]:
# looking at categorical vs continuous variables:
# a column with only 2 unique values is a binary flag, while a column with
# many unique values is a candidate continuous variable
df.nunique()

senior_citizen        2
monthly_charges    1584
total_charges      6530
auto_pay              2
dsl                   2
fiber                 2
has_internet          2
m2m                   2
one_year              2
two_year              2
has_contract          2
is_male               2
has_partner           2
has_dep               2
tenure_months        72
has_phone             2
multi_phone           2
has_security          2
has_backup            2
has_protection        2
has_support           2
stream_tv             2
stream_movies         2
has_paperless         2
has_churn             2
dtype: int64

In [11]:
# ok, I have my tidy data, now let's split so we can start exploration and model building

def split_data(df, target='has_churn', test_size=.2, validate_size=.3, random_state=1221):
    '''
    take in a DataFrame and return train, validate, and test DataFrames,
    stratified on the target column.

    The split happens in two steps: first `test_size` of the rows are held
    out as test, then `validate_size` of the REMAINING rows become validate.
    With the defaults that is 20% test and 30% of the other 80% for validate.

    df: DataFrame to split; must contain the `target` column.
    target: name of the column to stratify on (default 'has_churn').
    test_size: share of df held out as the test set.
    validate_size: share of the non-test rows used as the validate set.
    random_state: seed passed to both splits so the result is reproducible.

    return train, validate, test DataFrames.
    '''
    # first cut: hold out the test set
    train_validate, test = train_test_split(df,
                                            test_size=test_size,
                                            random_state=random_state,
                                            stratify=df[target])
    # second cut: carve validate out of what is left
    train, validate = train_test_split(train_validate,
                                       test_size=validate_size,
                                       random_state=random_state,
                                       stratify=train_validate[target])
    return train, validate, test

In [12]:
# split the tidy data into train / validate / test (split_data stratifies on has_churn)
train, validate, test = split_data(df)

In [13]:
# what does each of my new splits look like? (rows, columns) for each one
train.shape, validate.shape, test.shape

((3937, 25), (1688, 25), (1407, 25))

In [1]:
### Key Findings and Takeaways


In [None]:
# NOTE(review): leftover scratch cell (no execution count) — this Yes/No mapping is
# not used anywhere in the visible notebook; prep_1_telco builds its own mapping.
yn_rep = {'Yes': 1, 'No': 0}