# Data Preprocessing

Before modeling, we preprocess the data (drop identifier columns, encode the categorical features, and scale the inputs) and split it into training and testing sets.

In [1]:
# import libraries
from warnings import filterwarnings
# Install a blanket "ignore" filter: every warning category is silenced for the
# rest of the session, including deprecation notices from pandas / scikit-learn.
filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Random seed handed to train_test_split so the train/test split is reproducible.
SEED = 2667

In [2]:
# read the interim dataset from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer="../data/interim/data.csv")
df.head(n=5)

Unnamed: 0,state,account_length,area_code,phone_number,international_plan,voice_mail_plan,number_vmail_messages,day_minutes,day_calls,day_charge,...,eve_calls,eve_charge,night_minutes,night_calls,night_charge,intl_minutes,intl_calls,intl_charge,customer_service_calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [3]:
# drop phone_number
# Rebind `df` to the result instead of mutating it with inplace=True.
# errors="ignore" keeps the cell safe to re-run once the column is already gone.
df = df.drop(columns=["phone_number"], errors="ignore")

In [4]:
# convert international_plan & voice_mail_plan to binary cols
mapping = {"yes": 1, "no": 0}
plan_cols = ["international_plan", "voice_mail_plan"]

for col in plan_cols:
    # Series.map replaces every value that is not a key of `mapping` with NaN,
    # so mapping a column that already holds 0/1 (i.e. this cell was re-run)
    # would wipe it out. Only map columns that are not yet fully converted.
    if not df[col].isin(list(mapping.values())).all():
        df[col] = df[col].map(mapping)

# Fail loudly rather than letting NaN produced by an unexpected label (or a
# missing value) flow silently into the scaled data written out later.
bad_cols = [col for col in plan_cols if df[col].isna().any()]
if bad_cols:
    raise ValueError(f"expected only 'yes'/'no' values in columns: {bad_cols}")

In [5]:
# separate the target (the last column) from the features, then make a
# seeded split that is stratified on the target
target_name = df.columns[-1]
y = df[target_name]
X = df.drop(columns=target_name)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=SEED,
)

In [6]:
# perform one hot encoding
# The encoder is fitted on the training split only; with handle_unknown="ignore"
# a state that was not seen during fit is encoded as an all-zero row.
# The `sparse=` keyword was deprecated in scikit-learn 1.2 and removed in 1.4, so
# keep the default (sparse) output and densify with .toarray(), which works on
# every scikit-learn version.
ohe = OneHotEncoder(handle_unknown="ignore")
train_ohe = pd.DataFrame(
    ohe.fit_transform(X_train[["state"]]).toarray(),
    columns=ohe.categories_[0],  # one column per state seen in training
    index=X_train.index,         # keep row labels so the later concat aligns
)
test_ohe = pd.DataFrame(
    ohe.transform(X_test[["state"]]).toarray(),
    columns=ohe.categories_[0],
    index=X_test.index,
)

In [7]:
# drop state column and concatenate encoded data
# Drop "state" by name rather than by position (iloc[:, 1:]) so the result does
# not depend on "state" being the first column of the feature frame.
X_train_ohe = pd.concat([X_train.drop(columns="state"), train_ohe], axis=1)
X_test_ohe = pd.concat([X_test.drop(columns="state"), test_ohe], axis=1)

In [8]:
# Standardise the features: the scaler is fitted on the training split only,
# and the fitted scaler is then applied to both splits
scaler = StandardScaler().fit(X_train_ohe)
scaled_X_train = scaler.transform(X_train_ohe)
scaled_X_test = scaler.transform(X_test_ohe)

In [9]:
# wrap the scaled arrays back into DataFrames, reusing the pre-scaling column
# names (the frames get a fresh default integer index)
X_train_processed, X_test_processed = (
    pd.DataFrame(values, columns=frame.columns)
    for values, frame in (
        (scaled_X_train, X_train_ohe),
        (scaled_X_test, X_test_ohe),
    )
)

In [10]:
# append the target as the last column of each split; the target's index is
# reset so it lines up with the default integer index of the processed features
train, test = (
    pd.concat([features, target.reset_index(drop=True)], axis="columns")
    for features, target in (
        (X_train_processed, y_train),
        (X_test_processed, y_test),
    )
)

In [11]:
# write both splits to the processed directory, without the index column
processed_outputs = {
    "../data/processed/train.csv": train,
    "../data/processed/test.csv": test,
}
for out_path, split_frame in processed_outputs.items():
    split_frame.to_csv(out_path, index=False)