## Prepare Features For Modeling: Create Training And Test Sets

### Read In Data

In [1]:
# Read in data
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the cleaned Titanic data from the working directory and preview the first rows.
titanic = pd.read_csv(filepath_or_buffer='taitanic_cleaned_req.csv')
titanic.head(n=5)

Unnamed: 0,PassengerId,Survived,Sex,Age,Fare,Family_cnt,Cabin_ind,2,3
0,1,0,1,22.0,1.486167,1,0,0,1
1,2,1,0,38.0,2.347457,1,1,0,0
2,3,1,0,26.0,1.512864,0,0,0,1
3,4,1,0,35.0,2.213191,1,1,0,0
4,5,0,1,35.0,1.517606,0,0,0,1


### Split Into Train And Test Set

In [2]:
# Separate the target column from the predictors, then split into training/test sets.
# 'PassengerId' is removed from the predictors together with the target 'Survived'.
labels = titanic['Survived']
features = titanic.drop(columns=['Survived', 'PassengerId'])

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,    # 30% of the rows go to the test set
    random_state=42,  # fixed seed for the shuffle
)

X_train.head()

Unnamed: 0,Sex,Age,Fare,Family_cnt,Cabin_ind,2,3
445,1,4.0,2.413307,2,1,0,0
650,1,29.699118,1.511747,0,0,0,1
172,0,1.0,1.619291,2,0,0,1
450,1,36.0,1.943805,3,0,1,0
314,1,43.0,1.922321,2,0,1,0


In [3]:
# Write the unscaled splits to CSV; index=False leaves the row index out of the files.
for split, path in (
    (X_train, 'train_features.csv'),
    (X_test, 'test_features.csv'),
    (y_train, 'train_labels.csv'),
    (y_test, 'test_labels.csv'),
):
    split.to_csv(path, index=False)

In [4]:
# Sanity check: print each split's share of all label rows, rounded to two decimals.
total_rows = len(labels)
for split_labels in (y_train, y_test):
    share = len(split_labels) / total_rows
    print(round(share, 2))

0.7
0.3


### Standardize Features

In [5]:
# Fit the scaler on the training data only, then persist the fitted scaler to
# 'scaler.pkl' with joblib so it can be reloaded later.
# StandardScaler is imported alongside the other dependencies in the first cell.
scaler = StandardScaler()
scaler.fit(X_train)
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [6]:
# Scale the training and test sets with the scaler fitted on the training data
# (this notebook has no separate validation set).
#
# The scaled frames are rebuilt from the transformed arrays instead of assigning
# columns into the frames returned by train_test_split; that assignment can
# trigger pandas' SettingWithCopyWarning. The original row index and column
# names are carried over, so the result has the same shape and labels.
features = X_train.columns

X_train = pd.DataFrame(scaler.transform(X_train[features]), index=X_train.index, columns=features)
X_test = pd.DataFrame(scaler.transform(X_test[features]), index=X_test.index, columns=features)

X_train.head()

Unnamed: 0,Sex,Age,Fare,Family_cnt,Cabin_ind,2,3
445,0.720772,-1.972249,1.426604,0.679441,1.866016,-0.518497,-1.13969
650,0.720772,0.032613,-0.660902,-0.5898,-0.535901,-0.518497,0.877432
172,-1.387401,-2.206288,-0.411892,0.679441,-0.535901,-0.518497,0.877432
450,0.720772,0.524162,0.3395,1.314061,-0.535901,1.928652,-1.13969
314,0.720772,1.070253,0.289756,0.679441,-0.535901,1.928652,-1.13969


### Write Out All Data

In [7]:
# Write the feature frames (standardized in the previous cell) and the label
# series to the '*_std.csv' files; index=False leaves the row index out of the files.
for split, path in (
    (X_train, 'train_features_std.csv'),
    (X_test, 'test_features_std.csv'),
    (y_train, 'train_labels_std.csv'),
    (y_test, 'test_labels_std.csv'),
):
    split.to_csv(path, index=False)