In [None]:
import os
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [None]:
def build_path(*path_parts):
    """Join the given path components into a single path string.

    Thin wrapper around ``os.path.join`` so every file location in the
    notebook is assembled the same way, with the host OS separator.

    Args:
        *path_parts: One or more path components, in order.

    Returns:
        The components joined by ``os.path.join``.
    """
    joined_path = os.path.join(*path_parts)
    return joined_path

In [None]:
## Load the dataset
## Read the raw churn table; the location is relative to the notebook's working directory.
data = pd.read_csv(build_path('Data', 'Churn_Modelling.csv'))
## Preview the first rows to sanity-check the load.
data.head()

In [None]:
## Preprocess the data
## Drop irrelevant columns.
## `data` is rebound to the returned frame instead of being mutated in place, and
## errors='ignore' skips labels that are already gone, so running this cell a
## second time is a no-op rather than a KeyError.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [None]:
## Preview the first five rows of the current frame.
data.head(n=5)

In [None]:
## Encode Categorical Variable
## Fit a LabelEncoder on the 'Gender' column and replace the column with its integer codes.
## NOTE: run this cell once per fresh load of `data`; a second run would refit the
## encoder on the already-encoded integers instead of the original labels.
label_encoder_gender = LabelEncoder()
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])
## Show a short preview rather than echoing the whole frame.
data.head()

In [None]:
## One-hot encode the 'Geography' column
## sparse_output=False makes the encoder return a dense array that can be wrapped in a DataFrame directly.
onehot_encoder_geo = OneHotEncoder(sparse_output=False)
geo_encoded = onehot_encoder_geo.fit_transform(data[['Geography']])
geo_encoded_df = pd.DataFrame(
    geo_encoded,
    columns=onehot_encoder_geo.get_feature_names_out(['Geography']),
)
## Reset both indexes so the column-wise concat lines rows up one-to-one,
## then drop the original text column now that its indicator columns exist.
data = pd.concat(
    [data.reset_index(drop=True), geo_encoded_df.reset_index(drop=True)],
    axis=1,
).drop('Geography', axis=1)

In [None]:
## Preview the first five rows of the current frame.
data.head(n=5)

In [None]:
## Save the fitted encoders (the scaler is pickled in a later cell, once it has been fitted)
## Make sure the output folder exists; open(..., 'wb') does not create missing directories.
os.makedirs(build_path('Encoder'), exist_ok=True)

path = build_path('Encoder','label_encoder_gender.pkl')
with open(path,'wb') as file:
    pickle.dump(label_encoder_gender,file)

path = build_path('Encoder','onehot_encoder_geo.pkl')
with open(path,'wb') as file:
    pickle.dump(onehot_encoder_geo,file)

In [None]:
## Separate the predictors from the 'Exited' target column
y = data['Exited']
X = data.drop(columns=['Exited'])

## Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Scale the features: the scaler is fitted on the training split only
## and then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
## Persist the train/test splits as CSV files in the Data folder (row index omitted).
## X_train / X_test are wrapped in DataFrames before writing; y_train / y_test are written as-is.
for csv_name, features in (('X_train.csv', X_train), ('X_test.csv', X_test)):
    pd.DataFrame(features).to_csv(build_path('Data', csv_name), index=False)
for csv_name, target in (('y_train.csv', y_train), ('y_test.csv', y_test)):
    target.to_csv(build_path('Data', csv_name), index=False)

In [None]:
## Peek at the first five rows of the training features instead of echoing the full object.
X_train[:5]

In [None]:
## Peek at the first five test labels instead of echoing the full Series.
y_test.head()

In [None]:
## Save the scaler next to the encoders.
## Create the output folder if needed; open(..., 'wb') fails when the directory is missing.
os.makedirs(build_path('Encoder'), exist_ok=True)
with open(build_path('Encoder','Scaler.pkl'),'wb') as file:
    pickle.dump(scaler,file)