## Now we will perform Feature Engineering

In [5]:
import pandas as pd 
import numpy as np

In [9]:

data = pd.read_csv("D:/Customer-Churn-ANN/Data/WA_Fn-UseC_-Telco-Customer-Churn.csv")
df = pd.DataFrame(data)

In [17]:
df.describe()
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce') # use this insted of astype cuz there are some ' ' values
df.isnull().sum()
df['TotalCharges'].fillna(0, inplace=True)
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(0, inplace=True)


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [30]:
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

## Import libraries

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [36]:

X = df.drop('Churn', axis=1)
y = df['Churn'].map({'Yes': 1, 'No': 0})

X.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


## Seprate Categories

In [37]:
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

binary_categorical_features = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling','SeniorCitizen']

multi_categorical_features = ['Contract', 'InternetService', 'PaymentMethod',
    'OnlineSecurity', 'TechSupport',
    'StreamingTV', 'StreamingMovies'
]

In [47]:
numeric_transformer = Pipeline([
    ('scaler', StandardScaler())
])


binary_transformer = OneHotEncoder(
    drop='if_binary',
    #sparse=False,
    handle_unknown='ignore'
)

multi_categorical_transformer = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    #sparse=False
    )


In [49]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer ,numeric_features),
        ('bin', binary_transformer ,binary_categorical_features),
        ('multi', multi_categorical_transformer, multi_categorical_features)
    ]
)

In [51]:
#train test split

X_train, X_test , y_train , y_test = train_test_split(
    X,y,
    test_size = 0.2,
    stratify = y
)

In [55]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(X_test_processed.shape)
print(X_train_processed.shape)

(1409, 23)
(5634, 23)


## Save Seprated data

In [56]:
np.save("../data/X_train.npy", X_train_processed)
np.save("../data/X_test.npy", X_test_processed)
np.save("../data/y_train.npy", y_train.values)
np.save("../data/y_test.npy", y_test.values)