# Data cleaning
Prepare data for EDA and modeling

## Reducing memory usage
## Removing duplicate data
## Fix structural Errors
## Filter unwanted outliers
# Handle missing data


In [2]:
import pandas as pd
import numpy as np


In [17]:
df = pd.read_csv("data/Telecom_customer churn.csv")

## Reducing Memory Usage

In [4]:
# Total memory usage
df.memory_usage(index=False,deep=True).sum()

181833162

In [5]:
# default column datatypes
df.dtypes

rev_Mean       float64
mou_Mean       float64
totmrc_Mean    float64
da_Mean        float64
ovrmou_Mean    float64
                ...   
kid11_15        object
kid16_17        object
creditcd        object
eqpdays        float64
Customer_ID      int64
Length: 100, dtype: object

In order to reduce the total amount of memory usage, we can use a few strategies.

1. Assign new datatypes (ex. float64 -> float16).

dtypes:
- int8 can store integers from -128 to 127.
- int16 can store integers from -32768 to 32767.
- int64 can store integers from -9223372036854775808 to 9223372036854775807.
- object -> numerical or categorical

2. Drop NA

Reducing the total number of records.



In [6]:
# helper function
# Credit to arjanso  (https://www.kaggle.com/arjanso) """

def reduce_mem_usage(props):
    
    start_mem_usg = props.memory_usage(index=False,deep=True).sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage(index=False,deep=True).sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print(f"Memory usage reduced by: {100-(100*mem_usg/start_mem_usg):.2f}%")
    return props, NAlist

In [7]:
df, NAlist = reduce_mem_usage(df)

Memory usage of properties dataframe is : 173.4096164703369  MB
******************************
Column:  rev_Mean
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  mou_Mean
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  totmrc_Mean
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  da_Mean
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  ovrmou_Mean
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  ovrrev_Mean
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  vceovr_Mean
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  datovr_Mea

In [8]:
NAlist

['rev_Mean',
 'mou_Mean',
 'totmrc_Mean',
 'da_Mean',
 'ovrmou_Mean',
 'ovrrev_Mean',
 'vceovr_Mean',
 'datovr_Mean',
 'roam_Mean',
 'change_mou',
 'change_rev',
 'avg6mou',
 'avg6qty',
 'avg6rev',
 'hnd_price',
 'phones',
 'models',
 'truck',
 'rv',
 'lor',
 'adults',
 'income',
 'numbcars',
 'forgntvl',
 'eqpdays']

In [9]:
drop_na = df.dropna()
drop_na.memory_usage(index=False,deep=True).sum()/1024**2

53.6279411315918

In [10]:
print(df.shape)
print(drop_na.shape)
print(f"{df.shape[0]-drop_na.shape[0]} records removed")

(100000, 100)
(37487, 100)
62513 records removed


In [11]:
converted_obj = pd.DataFrame()
for col in df.columns:
    if df[col].dtype != object:
        num_unique_values = len(df[col].unique())
        num_total_values = len(df[col])
        if num_unique_values / num_total_values < 0.5:
            converted_obj.loc[:,col] = df[col].astype('category')
        else:
            converted_obj.loc[:,col] = df[col]
    else:
        converted_obj.loc[:,col] = df[col]

In [12]:
print(df.memory_usage(index=False,deep=True).sum()/1024**2)
print(converted_obj.memory_usage(index=False,deep=True).sum()/1024**2)
print(drop_na.memory_usage(index=False,deep=True).sum()/1024**2)

138.2190341949463
140.44856071472168
53.6279411315918


In [18]:
duplicateDFRow = df[df.duplicated()]
print(duplicateDFRow)

Empty DataFrame
Columns: [rev_Mean, mou_Mean, totmrc_Mean, da_Mean, ovrmou_Mean, ovrrev_Mean, vceovr_Mean, datovr_Mean, roam_Mean, change_mou, change_rev, drop_vce_Mean, drop_dat_Mean, blck_vce_Mean, blck_dat_Mean, unan_vce_Mean, unan_dat_Mean, plcd_vce_Mean, plcd_dat_Mean, recv_vce_Mean, recv_sms_Mean, comp_vce_Mean, comp_dat_Mean, custcare_Mean, ccrndmou_Mean, cc_mou_Mean, inonemin_Mean, threeway_Mean, mou_cvce_Mean, mou_cdat_Mean, mou_rvce_Mean, owylis_vce_Mean, mouowylisv_Mean, iwylis_vce_Mean, mouiwylisv_Mean, peak_vce_Mean, peak_dat_Mean, mou_peav_Mean, mou_pead_Mean, opk_vce_Mean, opk_dat_Mean, mou_opkv_Mean, mou_opkd_Mean, drop_blk_Mean, attempt_Mean, complete_Mean, callfwdv_Mean, callwait_Mean, churn, months, uniqsubs, actvsubs, new_cell, crclscod, asl_flag, totcalls, totmou, totrev, adjrev, adjmou, adjqty, avgrev, avgmou, avgqty, avg3mou, avg3qty, avg3rev, avg6mou, avg6qty, avg6rev, prizm_social_one, area, dualband, refurb_new, hnd_price, phones, models, hnd_webcap, truck, rv