In [25]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import keras
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
import matplotlib as mpl
import matplotlib.pyplot as plt
from keras import layers
from keras import models
from keras.models import Sequential 
from keras.layers import Dense, Activation
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
from sklearn.preprocessing import StandardScaler

# Pipeline at a glance:
load data -> cleaning/preprocessing -> feature engineering -> format data for modelling -> fit model -> evaluate model -> generate submission file

## Step 1: Load data

In [26]:
# Read the train and test CSVs from the input directory.
train_path, test_path = '../input/train.csv', '../input/test.csv'
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Keep both frames in one list so later cells can apply the same step to each.
datasets = [df_train, df_test]

## Step 2: Basic cleaning and preprocessing

In [27]:
# Impute missing values.
# Age is filled with the mean and Fare with the median, each computed over the
# train and test columns stacked together; Embarked is filled with 'S'.
mAge = pd.concat([df_train['Age'], df_test['Age']]).mean()
medFare = pd.concat([df_train['Fare'], df_test['Fare']]).median()

fill_values = {'Age': mAge, 'Fare': medFare, 'Embarked': 'S'}
for df in datasets:
    for column, value in fill_values.items():
        df[column] = df[column].fillna(value)

## Step 3: Feature engineering

In [28]:
# Fare bin edges are quartiles (frequency-based rather than equal-width),
# computed over train and test together so both frames share the same edges.
# pd.concat is used instead of Series.append, which newer pandas no longer has.
all_fares = pd.concat((df_train['Fare'], df_test['Fare']), axis=0)
fare_bins = pd.qcut(all_fares, 4, retbins=True)[1]

# Apply the same derived features to both the train and the test frame.
for df in datasets:
    # Bin age into five ordinal groups. The result is cast to plain integers
    # so the feature matrix stays numeric (pd.cut returns a categorical).
    # This relies on Age having no missing values at this point.
    df['Age_binned'] = pd.cut(
        df['Age'], [0, 16, 32, 48, 64, 200], labels=[0, 1, 2, 3, 4], retbins=False
    ).astype(int)

    # Bin fare into the shared quartile bins.
    df['Fare_binned'] = pd.cut(
        df['Fare'], fare_bins, labels=[0, 1, 2, 3], include_lowest=True, retbins=False
    )

    # Family features must come from the frame being processed. Reading them
    # from df_train would give the test frame index-aligned training values.
    df['Family_size'] = df['SibSp'] + df['Parch']
    df['Is_Alone'] = (df['Family_size'] == 0).astype(int)

In [29]:
# Drop the columns (not rows) that are not fed to the model.
# Fare_binned is in this list too, so the fare bins computed earlier do not
# reach the model.
unused_columns = ['Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Name', 'Fare_binned']
for df in datasets:
    # In place on purpose: `datasets` holds references to df_train / df_test.
    df.drop(columns=unused_columns, inplace=True)

# PassengerId is kept because the submission file needs it.

## Step 4: Format data for modelling

In [30]:
# Label-encode the categorical columns.
# Each column gets a fresh encoder fitted on the training values, and that
# same encoder is then applied to the test values so the integer codes agree.
# Columns are replaced by plain assignment (not `.loc[:, col] = ...`) so the
# encoded column takes the integer dtype instead of keeping the old object dtype.
for column in ['Sex', 'Embarked']:
    le = preprocessing.LabelEncoder()
    le.fit(df_train[column])
    df_train[column] = le.transform(df_train[column])
    df_test[column] = le.transform(df_test[column])

In [31]:
# Separate the target from the features.
# PassengerId is left out of the model inputs in both frames.
y = df_train['Survived']
X = df_train.drop(columns=['Survived', 'PassengerId'])
X_test = df_test.drop(columns=['PassengerId'])

In [32]:
# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

In [33]:
# Sanity check: count the remaining missing values in each training column.
df_train.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Embarked       0
Age_binned     0
Family_size    0
Is_Alone       0
dtype: int64

In [34]:

    
# Fully connected binary classifier: three ReLU hidden layers (9, 9 and 5
# units) followed by a single sigmoid output unit.
# The input width is taken from the training features rather than hard-coded,
# so the model stays valid if the feature set changes.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(units=9, kernel_initializer='uniform', activation='relu', input_dim=n_features))
model.add(Dense(units=9, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=5, kernel_initializer='uniform', activation='relu'))
model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 9)                 63        
_________________________________________________________________
dense_6 (Dense)              (None, 9)                 90        
_________________________________________________________________
dense_7 (Dense)              (None, 5)                 50        
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 6         
Total params: 209
Trainable params: 209
Non-trainable params: 0
_________________________________________________________________


## Step 5: Train (or fit) the model

In [35]:




# Binary cross-entropy matches the single sigmoid output unit.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Hand Keras plain float32 arrays instead of DataFrames/Series, so columns with
# object or categorical dtype cannot break the tensor conversion.
X_train_arr = X_train.values.astype('float32')
y_train_arr = np.asarray(y_train).astype('float32')
X_val_arr = X_val.values.astype('float32')
y_val_arr = np.asarray(y_val).astype('float32')

# NOTE(review): the TypeError recorded under this cell
# (sigmoid_cross_entropy_with_logits ... 'labels') looks like a mismatch
# between the installed Keras and TensorFlow versions -- confirm the
# environment; it is not something this cell can fix.

# Keep the training history and track validation metrics during training.
history = model.fit(
    X_train_arr,
    y_train_arr,
    batch_size=32,
    epochs=200,
    validation_data=(X_val_arr, y_val_arr),
)

TypeError: sigmoid_cross_entropy_with_logits() got an unexpected keyword argument 'labels'

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=12, verbose=0, warm_start=False)

## Step 6: Evaluate the model

In [24]:
# Score the network on the held-out validation split.
# Predictions go into their own variables: assigning them to `y_val` would
# overwrite the ground-truth labels and make the accuracy impossible to compute.
val_prob = model.predict(X_val.values.astype('float32')).reshape(-1)

# Threshold the sigmoid outputs at 0.5 to get 0/1 class predictions.
y_pred_val_nn = (val_prob > 0.5).astype(int)

# Accuracy is the fraction of validation rows whose prediction matches the label.
val_accuracy = float(np.mean(y_pred_val_nn == np.asarray(y_val).reshape(-1)))
print('NN val set accuracy', val_accuracy)


In [25]:
#print("Test-Accuracy:", np.mean(results.history["val_acc"]))

In [26]:
#print('LR train set accuracy', acc(y_train, y_pred_trn_lr))
#print('LR val set accuracy', acc(y_val, y_pred_val_lr))
#print('RF train set accuracy', acc(y_train, y_pred_trn_rf))
#print('RF val set accuracy', acc(y_val, y_pred_val_rf))

LR train set accuracy 0.8047752808988764
LR val set accuracy 0.8044692737430168
RF train set accuracy 0.851123595505618
RF val set accuracy 0.7932960893854749


## Step 7: Generate Submission

In [15]:
# Predict on the TEST features. Predicting on X_val would give the wrong number
# of rows for df_test['PassengerId'] and would score the wrong passengers.
test_prob = model.predict(X_test.values.astype('float32'))

# Threshold the sigmoid outputs at 0.5 and flatten to one 0/1 value per passenger.
y_pred_test = (test_prob > 0.5).astype(int).reshape(-1)

# Create a Kaggle submission: one row per test passenger.
sub = pd.DataFrame({'PassengerId': df_test['PassengerId'],
                    'Survived': y_pred_test})

sub.to_csv('week_3_baseline.csv', index=False)