In [260]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import keras
#import seaborn as sns
#import warnings
#warnings.filterwarnings("ignore", category = DeprecationWarning)
import matplotlib as mpl
import matplotlib.pyplot as plt
from keras import layers
from keras import models
from keras.models import Sequential 
from keras.layers import Dense, Activation
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import StandardScaler

np.random.seed(0)

# Pipeline at a glance:
load data -> cleaning/preprocessing -> feature eng -> format data for modelling -> fit model -> evaluate model -> generate submission file

## Step 1: Load data

In [261]:
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

# for convenience
datasets = [df_train, df_test]

## Step 2: Basic cleaning and preprocessing

In [262]:
# Fill in missing values
mAge = pd.concat((df_train['Age'], df_test['Age']), axis=0).mean()
medFare = pd.concat((df_train['Fare'], df_test['Fare']), axis=0).median()
for df in datasets:
    df['Age'] = df['Age'].fillna(mAge)
    df['Fare'] = df['Fare'].fillna(medFare)
    df['Embarked'] = df['Embarked'].fillna('S')

## Step 3: Feature engineering

In [263]:
# find fare bins based on frequency rather than value
garbage, fare_bins = pd.qcut(df_train['Fare'].append(df_test['Fare']), 4, retbins=True)
# loops through both train and test set for convenience
for df in datasets:
    # bin age
    df['Age_binned'] = pd.cut(df['Age'], [0,16,32,48,64,200], labels = [0,1,2,3,4], retbins=False)
    
    # bin fare
    df['Fare_binned'] = pd.cut(df['Fare'], fare_bins, labels = [0,1,2,3], include_lowest=True, retbins=False)
    
    # family features
    df['Family_size'] = df_train['SibSp'] + df_train['Parch']
    df['Is_Alone'] = (df['Family_size'] == 0).astype(int)

In [264]:
# drop unneeded rows
for df in datasets:
    df.drop(['Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Name', 'Fare_binned'], axis=1, inplace=True)
    
# need to keep passengerID for submission

## Step 4: Format data for modelling

In [265]:
# label encode categoricals 
le = preprocessing.LabelEncoder()
le.fit(df_train['Sex'])
df_train.loc[:,'Sex'] = le.transform(df_train['Sex'])
df_test.loc[:,'Sex'] = le.transform(df_test['Sex'])

le.fit(df_train['Embarked'])
df_train['Embarked'] = le.transform(df_train['Embarked'])
df_test['Embarked'] = le.transform(df_test['Embarked'])

In [266]:
# Split data into X and y, and select features to use
X_train = df_train.drop(['Survived', 'PassengerId'], axis=1)
y_train = df_train['Survived']
X_test = df_test.drop(['PassengerId'], axis=1)



In [267]:
#Split data X and y into train and val sets

seed = 42
np.random.seed(seed)

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.2)

    
#X = X_train.values
#y = y_train.values

#X_test_ = X_test.values
#X_test_ = X_test_.astype(np.float64, copy =0)





In [257]:
df_train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Embarked       0
Age_binned     0
Family_size    0
Is_Alone       0
dtype: int64

In [258]:

    
model = Sequential()

model.add(Dense(units = 9, kernel_initializer = 'uniform', activation = 'relu', input_dim = 6))
model.add(Dense(units = 9, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dense(units = 5, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_53 (Dense)             (None, 9)                 63        
_________________________________________________________________
dense_54 (Dense)             (None, 9)                 90        
_________________________________________________________________
dense_55 (Dense)             (None, 5)                 50        
_________________________________________________________________
dense_56 (Dense)             (None, 1)                 6         
Total params: 209
Trainable params: 209
Non-trainable params: 0
_________________________________________________________________


## Step 5: Train (or fit) the model

In [259]:




model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

#callbacks = [EarlyStopping(monitor'val_loss', patience = 2),
            #ModelCheckpoint(filepath = 'best_model.h5', monitor = 'val_loss', save_best_only = Ture)]


#model.compile(
# optimizer = "adam",
# loss = "binary_crossentropy",
# metrics = ["accuracy"]
#)

model.fit(X, y, batch_size = 32, epochs = 200)

SyntaxError: invalid syntax (<ipython-input-259-929d3b5f7da8>, line 7)

## Step 6: Evaluate the model

In [245]:
# define a scoring function
#def acc(y: np.array, y_pred: np.array) -> float:
    #return np.sum(y_pred==y)/len(y)
    
results = model.fit(X_train, y_train, epochs = 100, verbose = 0, validation_data = (X_test, y_test))
y_val = np.round(model.predict(X_val))

#y_pred_test = pd.DataFrame(predictions)
    
    
#pred = model.predict(X_val)
#y_

#pred = np.argmax(pred,axis=1)
#y_compare = np.argmax(y_val,axis=1)
#score = metrics.accuracy(y_compare, pred)

scores = model.evaluate(X,y,batch_size = 30)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


acc: 61.62%


In [246]:
#print("Test-Accuracy:", np.mean(results.history["val_acc"]))

#history = model.fit(X,y, epochs = 200, batch_size = 10, verbose = 0)

#print(history.history.keys())



In [247]:
#print('LR train set accuracy', acc(y_train, y_pred_trn_lr))
#print('LR val set accuracy', acc(y_val, y_pred_val_lr))
#print('RF train set accuracy', acc(y_train, y_pred_trn_rf))
#print('RF val set accuracy', acc(y_val, y_pred_val_rf))

#plt.plot(history.history['acc'])
#plt.plot(history.history['val_acc'])
#plt.title('Model Accuracy')
#plt.ylabel('Accuracy')
#plt.xlabel('Epoch')
#plt.legend(['train', 'test'], loc='upper left')
#plt.show()

## Step 7: Generate Submission

In [248]:
# fit on whole dataset
#lr.fit(X, y)

# Predict for test set
y_pred = model.predict(X_test)


y_pred_test = (y_pred > 0.5).astype(int).reshape(X_test_.shape[0])

# Create a Kaggle submission
sub = pd.DataFrame({'PassengerId': df_test['PassengerId'],
                    'Survived': y_pred_test})


sub.to_csv('week_3_baseline.csv', index=False)