# Preprocessing and Train-Test Split

In [1]:
import pandas as pd

In [3]:
# Read the cleaned customer table from the interim data folder.
df = pd.read_csv(filepath_or_buffer="../data/interim/cleaned_df.csv")

# Preview the first five rows, transposed so that each column reads as a row.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
Country,United States,United States,United States,United States,United States
State,California,California,California,California,California
City,Los Angeles,Los Angeles,Los Angeles,Los Angeles,Los Angeles
Zip Code,90003,90005,90006,90010,90015
Lat Long,"33.964131, -118.272783","34.059281, -118.30742","34.048013, -118.293953","34.062125, -118.315709","34.039224, -118.266293"
Latitude,33.964131,34.059281,34.048013,34.062125,34.039224
Longitude,-118.272783,-118.30742,-118.293953,-118.315709,-118.266293
Gender,Male,Female,Female,Female,Male
Senior Citizen,No,No,No,No,No
Partner,No,No,No,Yes,No


In [4]:
# Count the distinct (non-missing) city names.
df['City'].nunique(dropna=True)

1129

In [5]:
# Count the distinct (non-missing) zip codes.
df['Zip Code'].nunique(dropna=True)

1652

Both the `City` and `Zip Code` columns have too many categories to encode directly. However, EDA revealed that customer location is highly correlated with the target variable. Hence, we group zip codes into counties and include the county instead of the actual zip code values. As a result, we also drop the `Lat Long`, `Latitude`, and `Longitude` columns.

In [6]:
# Load the zip code lookup; only the columns needed for the county mapping are read.
zip_code_df = pd.read_csv("../data/external/zip_code_database.csv", usecols=['zip', 'state', 'county'])

# Filter for California zip codes
zip_code_df = zip_code_df.loc[zip_code_df['state']=='CA', ['zip', 'county']]

# Combine zip code and county info with customer data frame.
# validate='many_to_one' makes pandas raise if a zip appears more than once in
# the lookup, which would otherwise silently duplicate customer rows.
merged_df = df.merge(zip_code_df, left_on='Zip Code', right_on='zip', validate='many_to_one')

# The merge is an inner join, so a customer whose zip is missing from the lookup
# would be dropped without warning. Fail loudly (before rebinding `df`) instead.
assert len(merged_df) == len(df), (
    f"{len(df) - len(merged_df)} customer rows have no matching California zip code"
)
df = merged_df

In [7]:
# Columns removed before modelling: raw location fields (replaced by `county`),
# the textual churn label/reason, and the two zip-code join keys.
drop_columns = [
    'Country', 'State', 'City',
    'Lat Long', 'Latitude', 'Longitude',
    'Churn Label', 'Churn Reason',
    'zip', 'Zip Code',
]

# Continuous features that get min-max scaled.
# NOTE(review): 'Churn Score' may itself be derived from churn outcomes —
# confirm it is a legitimate predictor and not target leakage.
numeric_features = [
    'Tenure Months',
    'Monthly Charges',
    'Churn Score',
    'CLTV',
]

# Categorical features that get one-hot encoded.
categorical_features = [
    'Gender',
    'Senior Citizen',
    'Partner',
    'Dependents',
    'Phone Service',
    'Multiple Lines',
    'Internet Service',
    'Online Security',
    'Online Backup',
    'Device Protection',
    'Tech Support',
    'Streaming TV',
    'Streaming Movies',
    'Contract',
    'Paperless Billing',
    'Payment Method',
]

In [8]:
# Discard every column listed in `drop_columns`, rebinding `df` to the
# trimmed frame, then list what is left.
df = df.drop(columns=drop_columns)
df.columns

Index(['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure Months',
       'Phone Service', 'Multiple Lines', 'Internet Service',
       'Online Security', 'Online Backup', 'Device Protection', 'Tech Support',
       'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charges', 'Churn Value', 'Churn Score',
       'CLTV', 'county'],
      dtype='object')

#### Preprocessing of numerical columns

As there are no missing values, imputation is not needed.

In [34]:
# Perform scaling of numerical columns

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler

# Take an independent copy of the numeric feature columns so that scaling
# works on its own frame rather than on a slice of `df`.
num_df = df.loc[:, numeric_features].copy()
num_df.head(5)

Unnamed: 0,Tenure Months,Monthly Charges,Churn Score,CLTV
0,2,53.85,86,3239
1,1,18.8,51,5160
2,3,80.0,76,4264
3,59,94.75,26,5238
4,5,80.1,22,5225


In [35]:
# Min-max scale the numeric features, keeping the original row index and
# column names on the result.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in this notebook, so test rows influence the fitted min/max.
# Consider fitting on the training rows only.
scaler = MinMaxScaler().fit(num_df)
scaled_df = pd.DataFrame(
    data=scaler.transform(num_df),
    index=num_df.index,
    columns=num_df.columns,
)
scaled_df.head()

Unnamed: 0,Tenure Months,Monthly Charges,Churn Score,CLTV
0,0.027778,0.354229,0.852632,0.27485
1,0.013889,0.005473,0.484211,0.702024
2,0.041667,0.614428,0.747368,0.50278
3,0.819444,0.761194,0.221053,0.719368
4,0.069444,0.615423,0.178947,0.716478


In [32]:
# Display the unscaled numeric features for comparison with `scaled_df`.
# NOTE(review): this cell's execution count (32) is lower than the scaling
# cells above it (34, 35), so it was run out of order — confirm it is still
# needed, or re-run the notebook top to bottom.
num_df


Unnamed: 0,Tenure Months,Monthly Charges,Churn Score,CLTV
0,2,53.85,86,3239
1,1,18.80,51,5160
2,3,80.00,76,4264
3,59,94.75,26,5238
4,5,80.10,22,5225
...,...,...,...,...
7038,64,81.25,67,4761
7039,60,80.95,71,6214
7040,33,20.10,62,3342
7041,2,100.20,23,4855


#### Preprocessing of categorical columns

In [36]:
from sklearn.preprocessing import LabelEncoder

In [37]:
# Replace each county name with an integer code and wrap the codes in a
# single-column frame aligned to `df`'s index.
# NOTE(review): integer codes impose an arbitrary ordering on a nominal
# feature; confirm the downstream model tolerates that (tree models usually
# do, linear/distance-based models usually do not).
le = LabelEncoder()
encoded_county = le.fit_transform(df['county'])
encoded_df = pd.DataFrame({'county': encoded_county}, index=df.index)

In [38]:
# One-hot encode the categorical features, dropping the first level of each.
# dtype=int pins the indicator columns to 0/1 integers: recent pandas releases
# default to booleans, which would write True/False into the processed CSVs
# instead of the 0/1 values shown in this notebook's recorded output.
cat_df = pd.get_dummies(df[categorical_features], drop_first=True, dtype=int)

# Assemble the model-ready frame column-wise (scaled numerics, county code,
# dummies) and append the target last. All pieces share `df`'s index.
processed_final = pd.concat([scaled_df, encoded_df, cat_df], axis=1)
processed_final['Churn Value'] = df['Churn Value']
processed_final.head()

Unnamed: 0,Tenure Months,Monthly Charges,Churn Score,CLTV,county,Gender_Male,Senior Citizen_Yes,Partner_Yes,Dependents_Yes,Phone Service_Yes,...,Streaming TV_Yes,Streaming Movies_No internet service,Streaming Movies_Yes,Contract_One year,Contract_Two year,Paperless Billing_Yes,Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check,Churn Value
0,0.027778,0.354229,0.852632,0.27485,18,1,0,0,0,1,...,0,0,0,0,0,1,0,0,1,1
1,0.013889,0.005473,0.484211,0.702024,18,1,0,1,0,1,...,0,1,0,1,0,0,0,0,1,0
2,0.041667,0.614428,0.747368,0.50278,18,0,0,1,1,1,...,0,0,0,0,0,1,0,1,0,0
3,0.819444,0.761194,0.221053,0.719368,18,0,0,1,0,1,...,1,0,0,0,0,1,0,1,0,0
4,0.069444,0.615423,0.178947,0.716478,18,1,0,0,0,1,...,1,0,0,0,0,1,0,0,1,0


#### Split into training and test sets

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible.
# NOTE(review): no `stratify` argument is passed — confirm the churn class
# balance is acceptable in both partitions.
train_df, test_df = train_test_split(
    processed_final,
    test_size=0.2,
    random_state=7,
)
print(train_df.shape, test_df.shape, sep="\n")

(5634, 33)
(1409, 33)


In [41]:
# Persist the two partitions and the full processed frame.
# NOTE(review): train/test are written WITH their row index as the first CSV
# column, while final.csv is written without it — confirm downstream readers
# expect this difference (e.g. read train/test with index_col=0).
train_df.to_csv('../data/processed/train.csv', index=True)
test_df.to_csv('../data/processed/test.csv', index=True)
processed_final.to_csv('../data/processed/final.csv', index=False)