<a href="https://colab.research.google.com/github/TomJiX/ML_start/blob/main/kaggle_titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Library

In [1]:
import numpy as np
import pandas as pd 
from keras.utils import to_categorical, normalize
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random
import os
import time
import gc

# Import data

In [None]:
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c titanic


In [None]:
# Competition slug; also used as the prefix of the TensorBoard log directories.
CHALLENGE_NAME = "titanic"

# Paths of the competition CSV files.
DATA = "/content/train.csv"
TEST = "/content/test.csv"

# Read both files; the head of the training frame is the cell output.
dftrain, dfeval = pd.read_csv(DATA), pd.read_csv(TEST)

dftrain.head()

## Select useful features

In [30]:
def clean_data_add_features(data):
    """Return a cleaned copy of ``data`` with engineered features.

    Steps, in order:
      1. ``FamilySize`` = ``SibSp`` + ``Parch`` + 1.
      2. ``IsAlone`` = 1 when ``FamilySize`` is 1, otherwise 0.
      3. One-hot encode ``Pclass`` and ``Sex``.
      4. Replace missing values with the column mean.
      5. Min-max scale ``Age`` to [0, 1] using this frame's own min and max.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``SibSp``, ``Parch``, ``Pclass``, ``Sex`` and ``Age``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    data = data.copy()
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['IsAlone'] = (data['FamilySize'] == 1).astype(int)
    # Request integer dummies explicitly: pandas >= 2.0 defaults to bool, and a
    # bool/float mix turns into an object array when converted with np.array.
    data = pd.get_dummies(data, columns=["Pclass", "Sex"], dtype="uint8")
    # numeric_only keeps the historical "ignore non-numeric columns" behaviour.
    data = data.fillna(data.mean(numeric_only=True))

    age_min = data["Age"].min()
    age_range = data["Age"].max() - age_min
    if age_range > 0:
        data["Age"] = (data["Age"] - age_min) / age_range
    else:
        # Constant (or entirely missing) ages would otherwise divide 0 by 0 and yield NaN.
        data["Age"] = 0.0
    return data



In [31]:
# Columns read from the raw CSVs; "Survived" is the label and is not requested from the test file.
useful_columns=["Pclass","Sex","Age","SibSp","Parch","Survived"]
data_set=pd.read_csv(DATA,usecols=useful_columns)
data_set=clean_data_add_features(data_set)

sub_set=pd.read_csv(TEST,usecols=[c for c in useful_columns if c!="Survived"])
sub_set=clean_data_add_features(sub_set)

# Labels as a 1-D array; features are every remaining column.
labels_df = np.array(data_set['Survived'])
features_df= data_set.drop('Survived', axis = 1)
# Saving feature names for later use
feature_list = list(features_df.columns)

# Put the submission columns in exactly the training order. A dummy column that is
# absent from the test file (category never seen there) is added as all zeros.
sub_set = sub_set.reindex(columns=feature_list, fill_value=0)

# Convert with an explicit float dtype: a frame mixing bool dummies (pandas >= 2.0)
# and floats would otherwise become an object array, which Keras cannot ingest.
features = np.asarray(features_df, dtype=float)
submission = np.asarray(sub_set, dtype=float)

# Dataset shape

In [None]:
# (rows, columns) of the training feature matrix; index 1 is used as the network input width.
DATA_SIZE = np.shape(features)
print(DATA_SIZE)

# Model params

In [None]:
# Search grid for the first sweep: number of hidden layers x units per layer.
DENSE_LAYERS = [depth for depth in range(1, 7)]  # 1..6 hidden layers
LAYER_SIZE = [16, 32, 64]
# When True the training cells use 3 epochs instead of 30.
FAST_RUN = False
n_models = len(DENSE_LAYERS) * len(LAYER_SIZE)
print("{} Number of models".format(n_models))

In [80]:
%rm -rf *-logs

In [None]:
%%capture

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense, Activation, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau

earlystop = EarlyStopping(patience=10)
learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy', 
                        patience=2, 
                        verbose=1, 
                        factor=0.5, 
                        min_lr=0.00001)

EPOCHS=3 if FAST_RUN else 30
for dense_layer in DENSE_LAYERS:
        for layer_size in LAYER_SIZE:
            NAME = "{}-{}-dense-drop-{}".format(layer_size,dense_layer, int(time.time()))
            print(NAME)

            model = Sequential()

            model.add(Flatten(input_shape=(DATA_SIZE[1],)))

            for _ in range(dense_layer):
                model.add(Dense(layer_size,activation="relu"))
                #model.add(Dropout(0.25))

            model.add(Dense(1, activation='sigmoid')) # 2 because we have survive or not

            tensorboard = TensorBoard(log_dir="{}-logs19_11/{}".format(CHALLENGE_NAME,NAME))
            callbacks = [earlystop, learning_rate_reduction, tensorboard]
            model.compile(loss='binary_crossentropy',
                        optimizer='adam',
                        metrics=['accuracy'],
                        )
            model.fit(features, labels_df,
                    batch_size=32,
                    epochs=EPOCHS,
                    validation_split=0.3,
                    callbacks=callbacks)
            gc.collect()

%cp -R /content/*-logs* /content/drive/My\ Drive/Colab\ Notebooks/

# Particular NN

In [133]:
# Architecture of the single network trained in this section.
DENSE_LAYERS = 5   # number of hidden layers
LAYER_SIZE = 64    # units per hidden layer
# When True the training cell uses 3 epochs instead of 30.
FAST_RUN = False

In [134]:
%%capture
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense, Activation, BatchNormalization,Input
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import SGD, RMSprop, Adamax, Adagrad, Adam, Nadam, SGD
# Early stopping with a patience of 10 epochs (monitor left at the Keras default).
earlystop = EarlyStopping(patience=10)
# Halve the learning rate after 2 epochs without val_accuracy improvement, down to 1e-5.
lr_plateau_options = dict(monitor='val_accuracy',
                          patience=2,
                          verbose=1,
                          factor=0.5,
                          min_lr=0.00001)
learning_rate_reduction = ReduceLROnPlateau(**lr_plateau_options)
callbacks = [earlystop, learning_rate_reduction]
EPOCHS = 30
if FAST_RUN:
    EPOCHS = 3
def create_model(dense_layers=DENSE_LAYERS, layer_size=LAYER_SIZE, input_dim=DATA_SIZE[1]):
    """Build and compile a fully connected binary classifier.

    The defaults are bound when this function is defined. That matters because a
    later cell rebinds ``DENSE_LAYERS`` / ``LAYER_SIZE`` to lists for the model2
    sweep, and ``create_model()`` is still called after that point; reading the
    globals at call time would then fail in ``range(...)``.

    Parameters
    ----------
    dense_layers : int
        Number of hidden ``Dense`` layers.
    layer_size : int
        Units in each hidden layer.
    input_dim : int
        Number of input features.

    Returns
    -------
    A compiled ``Sequential`` model (binary cross-entropy loss, Adam optimizer,
    accuracy metric) ending in a single sigmoid unit.
    """
    NAME = "{}-{}-dense-{}".format(layer_size, dense_layers, int(time.time()))
    print(NAME)

    model = Sequential()
    # Keras expects the input shape as a tuple, not a bare integer.
    model.add(Input(shape=(input_dim,)))

    for _ in range(dense_layers):
        model.add(Dense(layer_size, activation="relu"))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Train the reference network, holding out 30% of the rows for validation.
NN_model = create_model()
fit_options = dict(batch_size=32,
                   epochs=EPOCHS,
                   validation_split=0.3,
                   callbacks=callbacks)
history = NN_model.fit(features, labels_df, **fit_options)
gc.collect()
# NOTE(review): this is the last-epoch *training* accuracy; the scikit-learn models'
# "Model_acc" is measured on a held-out split - consider 'val_accuracy' for a fair comparison.
NN_model_acc = history.history['accuracy'][-1]

In [None]:
# Last-epoch training accuracy recorded for NN_model.
NN_model_acc

In [None]:
# Learning curves of the most recent `history`: loss on top, accuracy below.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))
ax1.plot(history.history['loss'], color='b', label="Training loss")
ax1.plot(history.history['val_loss'], color='r', label="validation loss")
ax1.set_xticks(np.arange(1, EPOCHS, 1))
ax1.set_yticks(np.arange(0, 1, 0.1))
ax1.set(title="Loss", xlabel="Epoch", ylabel="Loss")

ax2.plot(history.history['accuracy'], color='b', label="Training accuracy")
ax2.plot(history.history['val_accuracy'], color='r',label="Validation accuracy")
ax2.set_xticks(np.arange(1, EPOCHS, 1))
ax2.set(title="Accuracy", xlabel="Epoch", ylabel="Accuracy")

# plt.legend() only decorates the current (last) axes, so add a legend to each panel.
for ax in (ax1, ax2):
    ax.legend(loc='best', shadow=True)
plt.tight_layout()
plt.show()

In [135]:
%%capture
def create_model2(input_shape=DATA_SIZE[1],
                number_hidden=4, 
                neurons_per_hidden=32,
                hidden_drop_rate= 0.2,
                hidden_activation = 'selu',
                hidden_initializer="lecun_normal",
                output_activation ='sigmoid',
                loss='binary_crossentropy',
                optimizer = None,
                ):
    """Build and compile a configurable fully connected binary classifier.

    Parameters
    ----------
    input_shape : int or tuple
        Number of input features, or an explicit shape tuple.
    number_hidden : int
        Number of hidden ``Dense`` layers.
    neurons_per_hidden : int
        Units in each hidden layer.
    hidden_drop_rate : float
        Currently unused (no Dropout layers are added); kept so existing calls stay valid.
    hidden_activation, hidden_initializer :
        Activation and kernel initializer of the hidden layers.
    output_activation :
        Activation of the single output unit.
    loss :
        Loss passed to ``compile``.
    optimizer :
        Optimizer passed to ``compile``. ``None`` (default) creates a fresh
        ``Nadam(learning_rate=0.0005)`` for this model.

    Returns
    -------
    A compiled ``Sequential`` model with an accuracy metric.
    """
    # Build the optimizer per call: an instance created as a default argument would be
    # shared by every model, and optimizers carry state tied to one model's variables.
    if optimizer is None:
        optimizer = Nadam(learning_rate=0.0005)

    # Accept a bare feature count as well as a ready-made shape tuple.
    if not isinstance(input_shape, (tuple, list)):
        input_shape = (input_shape,)

    #create model
    model = Sequential()
    model.add(Input(shape=tuple(input_shape)))
    for _ in range(number_hidden):
        model.add(Dense(neurons_per_hidden, activation = hidden_activation ,kernel_initializer=hidden_initializer))
    model.add(Dense(1, activation = output_activation))

    # Compile model
    model.compile(loss=loss, 
                  optimizer = optimizer,
                  metrics = ['accuracy'])
    return model
# Train the second network with the same settings as the first; note that `history`
# is overwritten here, so it now refers to this model's run.
NN_model2 = create_model2()
fit_options = dict(batch_size=32,
                   epochs=EPOCHS,
                   validation_split=0.3,
                   callbacks=callbacks)
history = NN_model2.fit(features, labels_df, **fit_options)
gc.collect()
# NOTE(review): last-epoch *training* accuracy, whereas the scikit-learn models'
# "Model_acc" is held-out accuracy - consider 'val_accuracy' for a fair comparison.
NN_model_acc2 = history.history['accuracy'][-1]

other model

Cross Validation

In [None]:
%%capture
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
# 10-fold cross-validation of both network architectures on identical folds.
n_split=10

cv1=[]  # per-fold accuracy of create_model()
cv2=[]  # per-fold accuracy of create_model2()
for train_index,test_index in KFold(n_split).split(features):
    x_train,x_test=features[train_index],features[test_index]
    y_train,y_test=labels_df[train_index],labels_df[test_index]

    # The second entry must use create_model2; previously both models were built with
    # create_model(), so the "NN_model2" row was just a second run of NN_model.
    for build_model, fold_scores in ((create_model, cv1), (create_model2, cv2)):
        fold_model = build_model()
        fold_model.fit(x_train, y_train, epochs=20)
        # evaluate() returns [loss, accuracy]; keep the accuracy.
        fold_scores.append(fold_model.evaluate(x_test, y_test)[1])

cv_data=pd.DataFrame({'CV Mean':[np.mean(cv1),np.mean(cv2)],'Std':[np.std(cv1),np.std(cv2)],"Model_acc":[NN_model_acc,NN_model_acc2]},index=['NN_model','NN_model2'])

In [None]:
# Cross-validation mean/std and single-fit accuracy for the two neural networks.
cv_data

# Trying out different configurations of model2

In [None]:
# Search grid for the model2 sweep. Note: this rebinds DENSE_LAYERS / LAYER_SIZE,
# which were plain integers in the "Particular NN" section, to lists.
DENSE_LAYERS = [depth for depth in range(1, 7)]  # 1..6 hidden layers
LAYER_SIZE = [10, 16, 32, 64]
# When True the training cells use 3 epochs instead of 30.
FAST_RUN = False
n_models = len(DENSE_LAYERS) * len(LAYER_SIZE)
print("{} Number of models".format(n_models))

In [129]:
# Delete every local directory matching *-logs* before starting the next sweep.
%rm -rf *-logs*

In [None]:
%%capture
EPOCHS=3 if FAST_RUN else 30
for dense_layer in DENSE_LAYERS:
        for layer_size in LAYER_SIZE:
            NAME = "{}-{}-dense-{}".format(layer_size,dense_layer, int(time.time()))
            print(NAME)

            tensorboard = TensorBoard(log_dir="{}-logs19_11-m2/{}".format(CHALLENGE_NAME,NAME))
            callbacks = [earlystop, learning_rate_reduction, tensorboard]
            model=create_model2(number_hidden=dense_layer,neurons_per_hidden=layer_size)
          
            model.fit(features, labels_df,
                    batch_size=32,
                    epochs=EPOCHS,
                    validation_split=0.3,
                    callbacks=callbacks)
            gc.collect()

%cp -R /content/*-logs* /content/drive/My\ Drive/Colab\ Notebooks/

# TensorBoard

In [None]:
# Load the TensorBoard notebook extension and open it on the model2 sweep logs.
%load_ext tensorboard
%tensorboard --logdir /content/titanic-logs19_11-m2

# Non-NN models

In [119]:
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
import xgboost as xgb

In [120]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as a test set; the fixed seed makes the split repeatable.
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    features,
    labels_df,
    test_size=0.25,
    random_state=42,
)

## Try and compare different classifiers

In [None]:
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
# 3-fold shuffled cross-validation; the seed makes the folds (and thus the table) repeatable.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)
xyz=[]             # mean CV accuracy per classifier
accuracy=[]        # raw per-fold CV accuracies per classifier
std=[]             # std of the CV accuracies per classifier
model_accuracy=[]  # accuracy on the held-out test split per classifier
trained_models=[]
classifiers=['Linear Svm','Radial Svm','Decision Tree','Random Forest','Naive Bayes','Logistic Regression','KNN','XGBoost']
models=[svm.SVC(kernel='linear',C=0.1,gamma=0.1),svm.SVC(kernel='rbf',C=0.1,gamma=0.1),DecisionTreeClassifier(),RandomForestClassifier(n_estimators=100),GaussianNB(),LogisticRegression(),KNeighborsClassifier(),xgb.XGBClassifier(objective="binary:logistic", random_state=42)]
total=len(classifiers)
for j, (classifier_name, model) in enumerate(zip(classifiers, models)):
    print("\r{} training... {}/{} Trained".format(classifier_name,j,total),end="")
    model.fit(train_features,train_labels)
    trained_models.append(model)
    # accuracy_score expects (y_true, y_pred).
    model_accuracy.append(metrics.accuracy_score(test_labels, model.predict(test_features)))
    # cross_val_score fits clones of `model`, so the instance fitted above is untouched.
    cv_result = cross_val_score(model,features_df,labels_df, cv = kfold,scoring = "accuracy")
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)

# Lookup table name -> fitted model, extended with the two neural networks.
model_df=pd.DataFrame({'models':trained_models},index=classifiers)
model_df=pd.concat([model_df,pd.DataFrame({'models':[NN_model,NN_model2]}, index=["NN_model","NN_model2"])])
# Score table for every model, extended with the neural-network rows from cv_data.
new_models_dataframe2=pd.DataFrame({'CV Mean':xyz,'Std':std,"Model_acc":model_accuracy},index=classifiers)
new_models_dataframe2=pd.concat([new_models_dataframe2,cv_data])
new_models_dataframe2

In [None]:
# Series.idxmax() takes no meaningful axis; passing 1 raises on current pandas.
print('Model with best accuracy : {}'.format(new_models_dataframe2['Model_acc'].idxmax()))
print('Model with best Cross validation mean : {}'.format(new_models_dataframe2['CV Mean'].idxmax()))

# Combined ranking: single-fit accuracy weighted 1.1, cross-validation mean weighted 0.9.
combined_score=new_models_dataframe2['Model_acc']*1.1+new_models_dataframe2['CV Mean']*0.9
b_mod_name=combined_score.idxmax()
print('Model to choose : {}'.format(b_mod_name))
best_model=model_df.loc[b_mod_name,'models']

# Particular fit, bagging and submission

In [None]:
# Number of refits whose predictions are averaged for the submission.
ITER = 10
# Name of the model to refit; defaults to the one selected above.
CLASSIFIER = b_mod_name
print(CLASSIFIER)

In [124]:
%%capture

# Refit the chosen model ITER times on a freshly shuffled copy of the training split
# and store each run's predictions for the submission set in its own column.
stack_predic=pd.DataFrame()
n_train=len(train_labels)
for i in range(ITER):
    # Random permutation of the row indices, applied to features and labels together.
    order = random.sample(range(n_train), n_train)
    x_shuffled = train_features[order]
    y_shuffled = train_labels[order]
    if CLASSIFIER == "NN_model":
        model=create_model()
        model.fit(x_shuffled,y_shuffled,epochs=200)
        stack_predic[CLASSIFIER+str(i)]=model.predict(submission).flatten()
    # elif, not if: otherwise "NN_model" also falls into the else branch below and
    # classifiers.index("NN_model") raises ValueError.
    elif CLASSIFIER == "NN_model2":
        model=create_model2()
        model.fit(x_shuffled,y_shuffled,epochs=200)
        stack_predic[CLASSIFIER+str(i)]=model.predict(submission).flatten()
    else:
        model = models[classifiers.index(CLASSIFIER)]
        model.fit(x_shuffled,y_shuffled)
        stack_predic[CLASSIFIER+str(i)]=model.predict(submission)
        



In [None]:
# Preview the stacked predictions: one column per bagging iteration.
stack_predic.head()


In [None]:
# Start from the passenger ids of the test file, then attach the averaged prediction
# of all bagging runs, rounded to a 0/1 label.
sub_csv = pd.read_csv(TEST, usecols=['PassengerId'])
mean_vote = stack_predic.mean(axis=1).to_numpy()
sub_csv["Survived"] = np.round(mean_vote).astype(int)
sub_csv.head()



In [None]:
# Write the submission file without the DataFrame index column.
sub_csv.to_csv('submission.csv', index=False)
# Uncomment to submit through the Kaggle CLI.
#!kaggle competitions submit -c titanic -f submission.csv -m "NN2"