<h3>Titanic Competition</h3>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Data:
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")
PassengerId = test_df.PassengerId

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### Filling Nan Vals

In [4]:
train_df.Age.fillna(train_df['Age'].median(), inplace=True)
train_df.Embarked.fillna(train_df['Embarked'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df.Age.fillna(train_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df.Embarked.fillna(train_df['Embarked'].mode()[0], inplace=True)


#### Feature Engineering

In [5]:
train_df.Cabin.sample(10)

448    NaN
356    E33
160    NaN
612    NaN
180    NaN
14     NaN
203    NaN
545    NaN
393    D36
616    NaN
Name: Cabin, dtype: object

In [6]:
train_df["Cabin_multiple"] = train_df["Cabin"].apply(lambda x: 0 if pd.isnull(x) else len(x.split(" ")))   #  lambda arguments: expression
train_df["Cabin_adv"] = train_df["Cabin"].apply(lambda x: str(x)[0])
train_df['Name_title'] = train_df.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())
train_df['Family_size'] = train_df.SibSp + train_df.Parch + 1
train_df['Is_alone'] = train_df.Family_size.apply(lambda x: 1 if x == 1 else 0)
train_df['Fare_per_person'] = train_df.Fare / train_df.Family_size

##### Creating Dummies

In [7]:
dummies_sex = pd.get_dummies(train_df["Sex"], prefix="Sex", dtype=int)
train_df[dummies_sex.columns] = dummies_sex

In [8]:
dummies_cabin_adv = pd.get_dummies(train_df["Cabin_adv"], prefix="Cabin_adv", dtype=int)
train_df[dummies_cabin_adv.columns] = dummies_cabin_adv

In [9]:
dummies_Pclass= pd.get_dummies(train_df["Pclass"], prefix="Pclass", dtype=int)
train_df[dummies_Pclass.columns] = dummies_Pclass

In [10]:
dummies_embarked= pd.get_dummies(train_df["Embarked"], prefix="Embarked", dtype=int)
train_df[dummies_embarked.columns] = dummies_embarked

#### Test_df

In [11]:
test_df.Age.fillna(test_df['Age'].median(), inplace=True)
test_df.Fare.fillna(test_df["Fare"].median(), inplace=True)
test_df["Cabin_multiple"] = test_df["Cabin"].apply(lambda x: 0 if pd.isnull(x) else len(x.split(" ")))   #  lambda arguments: expression
test_df["Cabin_adv"] = test_df["Cabin"].apply(lambda x: str(x)[0])
test_df['Name_title'] = test_df.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())
test_df['Family_size'] = test_df.SibSp + test_df.Parch + 1
test_df['Is_alone'] = test_df.Family_size.apply(lambda x: 1 if x == 1 else 0)
test_df['Fare_per_person'] = test_df.Fare / test_df.Family_size


dummies_sex = pd.get_dummies(test_df["Sex"], prefix="Sex", dtype=int)
test_df[dummies_sex.columns] = dummies_sex

dummies_cabin_adv = pd.get_dummies(test_df["Cabin_adv"], prefix="Cabin_adv", dtype=int)
test_df[dummies_cabin_adv.columns] = dummies_cabin_adv

dummies_Pclass = pd.get_dummies(test_df["Pclass"], prefix="Pclass", dtype=int)
test_df[dummies_Pclass.columns] = dummies_Pclass

dummies_embarked = pd.get_dummies(test_df["Embarked"], prefix="Embarked", dtype=int)
test_df[dummies_embarked.columns] = dummies_embarked

name_count = test_df["Name_title"].value_counts().sort_index()
dummies_name_title = pd.get_dummies(test_df["Name_title"], columns=["Name_title"], dtype=int)
for index, count in enumerate(name_count):
    if count > 10:
        test_df[dummies_name_title.columns[index]] = dummies_name_title.iloc[:, index]

test_df['Is_child'] = 0
test_df.loc[test_df['Age'] < 15, 'Is_child'] = 1

bins = [0, 12, 20, 40, 80, np.inf]

labels = ['Age0-12', 'Age12-20', 'Age20-40', 'Age40-80', 'Age80>']
Age_Bin = pd.cut(test_df['Age'], bins=bins, labels=labels, right=True)
dummies_age_bin = pd.get_dummies(Age_Bin, dtype=int)
test_df[dummies_age_bin.columns] = dummies_age_bin

test_df['Fare_log'] = np.log1p(test_df['Fare'] + 1)
test_df["Fare_per_person"] = np.log1p(test_df['Fare_per_person'] + 1)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df.Age.fillna(test_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df.Fare.fillna(test_df["Fare"].median(), inplace=True)


In [12]:
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Cabin_multiple',
       'Cabin_adv', 'Name_title', 'Family_size', 'Is_alone', 'Fare_per_person',
       'Sex_female', 'Sex_male', 'Cabin_adv_A', 'Cabin_adv_B', 'Cabin_adv_C',
       'Cabin_adv_D', 'Cabin_adv_E', 'Cabin_adv_F', 'Cabin_adv_G',
       'Cabin_adv_T', 'Cabin_adv_n', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [13]:
name_count = train_df["Name_title"].value_counts().sort_index()
dummies_name_title = pd.get_dummies(train_df["Name_title"], columns=["Name_title"],dtype=int)
for index, count in enumerate(name_count):
    if count > 10:
        train_df[dummies_name_title.columns[index]] = dummies_name_title.iloc[:, index]


In [14]:
train_df['Is_child'] = 0
train_df.loc[train_df['Age'] < 15, 'Is_child'] = 1


In [15]:

bins = [0, 12, 20, 40, 80, np.inf]

labels = ['Age0-12', 'Age12-20', 'Age20-40', 'Age40-80', 'Age80>']
Age_Bin = pd.cut(train_df['Age'], bins=bins, labels=labels, right=True)
dummies_age_bin = pd.get_dummies(Age_Bin, dtype=int)
train_df[dummies_age_bin.columns] = dummies_age_bin

In [16]:
train_df['Fare_log'] = np.log1p(train_df['Fare'] + 1)
train_df["Fare_per_person"] = np.log1p(train_df['Fare_per_person'] + 1)

In [17]:
# To make sure that both train_df and test_df have the same columns
for col in train_df.columns:
    if col not in test_df.columns:
        test_df[col] = 0

# Ensure column order matches
test_df = test_df[train_df.columns]


In [18]:
drop_list = ["Cabin","Fare" ,"Ticket", "PassengerId", "Pclass", "Sex", "Embarked", "Name", "Embarked", "Cabin_adv","Name_title"]
train_df.drop(columns = drop_list, inplace = True)
test_df.drop(columns=drop_list, inplace = True)

#### Models

In [19]:
# Separate features (X) and target (y)
y_train = train_df['Survived']
X_train = train_df.drop('Survived', axis=1) # Drop the 'Survived' column from X_train

# For the test set, similarly separate features and a placeholder for target (if needed for structure, but usually not for prediction)
# If you intend to make predictions on test_df, X_test should not contain 'Survived'
X_test = test_df.drop('Survived', axis=1) # Drop the 'Survived' column from X_test
# y_test from test_df is usually not available for evaluation, as it's the column you're trying to predict.
# It was filled with NaN earlier, so it's not useful for evaluation anyway.
y_test = test_df['Survived']

In [20]:
from sklearn.preprocessing import StandardScaler

# Columns to scale
columns_to_scale = ['Age', 'Fare_log', 'Fare_per_person', "Family_size"]
scaler = StandardScaler()

# Make copies of the original data
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Apply scaling to the selected columns
X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test_scaled[columns_to_scale] = scaler.transform(X_test[columns_to_scale])


In [21]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
# from xgboost import XGBClassifier

#### Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression


lr = LogisticRegression(max_iter = 2000)
cv = cross_val_score(lr,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.81564246 0.82022472 0.80337079 0.8258427  0.84831461]
0.8226790534178645


In [23]:
lr = LogisticRegression(max_iter = 2000)
cv = cross_val_score(lr,X_train_scaled,y_train,cv=5)
print(cv)
print(cv.mean())

[0.81564246 0.82022472 0.80337079 0.8258427  0.84831461]
0.8226790534178645


#### Random Forest Classifier

In [24]:
rf = RandomForestClassifier(random_state = 1)
cv = cross_val_score(rf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.81005587 0.78651685 0.84831461 0.73595506 0.8258427 ]
0.8013370158809867


In [25]:
rf = RandomForestClassifier(random_state = 1)
cv = cross_val_score(rf,X_train_scaled,y_train,cv=5)
print(cv)
print(cv.mean())

[0.81564246 0.78651685 0.85393258 0.73595506 0.82022472]
0.8024543343167408


#### KNN

In [26]:
from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier()
cv = cross_val_score(knn,X_train_scaled,y_train,cv=5)
print(cv)
print(cv.mean())

[0.82122905 0.79213483 0.83146067 0.79775281 0.82022472]
0.8125604167974391


In [27]:
knn = KNeighborsClassifier()
cv = cross_val_score(knn,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.76536313 0.79213483 0.80898876 0.80898876 0.84269663]
0.8036344234511331


#### Support Vector Classifier

In [28]:
svc = SVC(probability = True)
cv = cross_val_score(svc,X_train_scaled,y_train,cv=5)
print(cv)
print(cv.mean())

[0.84357542 0.8258427  0.82022472 0.79775281 0.85955056]
0.8293892411022534


#### Decision Tree

In [29]:
from sklearn import tree


dt = tree.DecisionTreeClassifier(random_state = 1)
cv = cross_val_score(dt,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.77653631 0.78089888 0.83707865 0.75280899 0.78089888]
0.7856443412215178


In [30]:
dt = tree.DecisionTreeClassifier(random_state = 1)
cv = cross_val_score(dt,X_train_scaled,y_train,cv=5)
print(cv)
print(cv.mean())

[0.7877095  0.78089888 0.84269663 0.75280899 0.78089888]
0.7890025735986441


#### GaussianNB

In [31]:
from sklearn.naive_bayes import GaussianNB


gnb = GaussianNB()
cv = cross_val_score(gnb,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.73184358 0.74157303 0.76404494 0.75842697 0.80337079]
0.7598518611512146


#### XGboost

In [32]:
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state =1)
cv = cross_val_score(xgb,X_train
,y_train,cv=5)
print(cv)
print(cv.mean())

[0.81005587 0.80337079 0.84269663 0.7752809  0.83707865]
0.8136965664427844


In [33]:
#Voting classifier takes all of the inputs and averages the results. For a "hard" voting classifier each classifier gets 1 vote "yes" or "no" and the result is just a popular vote. For this, you generally want odd numbers
#A "soft" classifier averages the confidence of each of the models. If a the average confidence is > 50% that it is a 1 it will be counted as such
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators = [('lr',lr),('knn',knn),('rf',rf),('gnb',gnb),('svc',svc),('xgb',xgb)], voting = 'hard') 

In [34]:
cv = cross_val_score(voting_clf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.82681564 0.81460674 0.84269663 0.81460674 0.85393258]
0.8305316678174629


In [35]:
voting_clf.fit(X_train_scaled,y_train)
y_hat_base_vc = voting_clf.predict(X_test_scaled).astype(int)
basic_submission = {'PassengerId': PassengerId, 'Survived': y_hat_base_vc}
basic_submission = pd.DataFrame(basic_submission)
base_submission = pd.DataFrame(data=basic_submission)
base_submission.to_csv('base_submission.csv', index=False)

#### Tuining

#### Logistic Regression (Tuned)

In [36]:
#simple performance reporting function
def clf_performance(classifier, model_name):
    print(model_name)
    print('Best Score: ' + str(classifier.best_score_))
    print('Best Parameters: ' + str(classifier.best_params_))

In [37]:
from sklearn.model_selection import GridSearchCV
lr = LogisticRegression()
param_grid = {'max_iter' : [2000],
              'penalty' : ['l2'],
              'C' : np.logspace(-4, 4, 20),
              'solver' : ['liblinear']}

clf_lr = GridSearchCV(lr, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_lr = clf_lr.fit(X_train_scaled,y_train)
clf_performance(best_clf_lr,'Logistic Regression')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Logistic Regression
Best Score: 0.8260498399347185
Best Parameters: {'C': np.float64(1.623776739188721), 'max_iter': 2000, 'penalty': 'l2', 'solver': 'liblinear'}


#### Random Forest Classifier (Tuned)

In [107]:
rf = RandomForestClassifier(random_state = 1)
param_grid =  {'n_estimators': [800,1000,1200],
               'criterion':['gini','entropy'],
                                  'bootstrap': [False],
                                  'max_depth': [20],
                                  'max_features': ['sqrt'],
                                  'min_samples_leaf': [2],
                                  'min_samples_split': [10]}
                                  
clf_rf = GridSearchCV(rf, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_rf = clf_rf.fit(X_train_scaled,y_train)
clf_performance(best_clf_rf,'Random Forest')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Random Forest
Best Score: 0.8394890465130878
Best Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 800}


#### KNN (Tuned)

In [108]:
knn = KNeighborsClassifier()
param_grid = {'n_neighbors' : [5],
              'weights' : ['uniform'],
              'algorithm' : ['auto'],
              'p' : [1,2]}
clf_knn = GridSearchCV(knn, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_knn = clf_knn.fit(X_train_scaled,y_train)
clf_performance(best_clf_knn,'KNN')

Fitting 5 folds for each of 2 candidates, totalling 10 fits
KNN
Best Score: 0.8226853304877283
Best Parameters: {'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}


#### SVC (tuned)

In [109]:
svc = SVC(probability = True)
param_grid = tuned_parameters = [ {'kernel': ['poly'], 'degree' : [2], 'C': [1]}]
clf_svc = GridSearchCV(svc, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_svc = clf_svc.fit(X_train_scaled,y_train)
clf_performance(best_clf_svc,'SVC')

Fitting 5 folds for each of 1 candidates, totalling 5 fits


SVC
Best Score: 0.8316364321134895
Best Parameters: {'C': 1, 'degree': 2, 'kernel': 'poly'}


#### Xtreme Gradient Boosting (tuned)

In [43]:
from sklearn.model_selection import RandomizedSearchCV


xgb = XGBClassifier(random_state = 1, device="cuda")

param_grid = {
    'n_estimators': [20, 50, 100, 250, 500,1000],
    'colsample_bytree': [0.2, 0.5, 0.7, 0.8, 1],
    'max_depth': [2, 5, 10, 15, 20, 25, None],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [1, 1.5, 2],
    'subsample': [0.5,0.6,0.7, 0.8, 0.9],
    'learning_rate':[.01,0.1,0.2,0.3,0.5, 0.7, 0.9],
    'gamma':[0,.01,.1,1,10,100],
    'min_child_weight':[0,.01,0.1,1,10,100],
    'sampling_method': ['uniform', 'gradient_based']
}

#clf_xgb = GridSearchCV(xgb, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
#best_clf_xgb = clf_xgb.fit(X_train_scaled,y_train)
#clf_performance(best_clf_xgb,'XGB')
clf_xgb_rnd = RandomizedSearchCV(xgb, param_distributions = param_grid, n_iter = 1000, cv = 5, verbose = True, n_jobs = -1)
best_clf_xgb_rnd = clf_xgb_rnd.fit(X_train_scaled,y_train)
clf_performance(best_clf_xgb_rnd,'XGB')

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i, fobj=obj)
  bst.update(dtrain, iteration=i

KeyboardInterrupt: 

In [113]:
best_lr = best_clf_lr.best_estimator_
best_rf = best_clf_rf.best_estimator_
best_xgb = best_clf_xgb.best_estimator_
best_knn = best_clf_knn.best_estimator_
best_svc = best_clf_svc.best_estimator_

In [None]:

voting_clf_hard = VotingClassifier(estimators = [('knn',best_knn),('rf',best_rf),('svc',best_svc)], voting = 'hard') 
voting_clf_soft = VotingClassifier(estimators = [('knn',best_knn),('rf',best_rf),('svc',best_svc)], voting = 'soft') 
voting_clf_all = VotingClassifier(estimators = [('knn',best_knn),('rf',best_rf),('svc',best_svc), ('lr', best_lr)], voting = 'soft') 
voting_clf_xgb = VotingClassifier(estimators = [('knn',best_knn),('rf',best_rf),('svc',best_svc), ('xgb', best_xgb), ('lr', best_lr)], voting = 'soft')

print('voting_clf_hard :',cross_val_score(voting_clf_hard,X_train,y_train,cv=5))
print('voting_clf_hard mean :',cross_val_score(voting_clf_hard,X_train,y_train,cv=5).mean())

print('voting_clf_soft :',cross_val_score(voting_clf_soft,X_train,y_train,cv=5))
print('voting_clf_soft mean :',cross_val_score(voting_clf_soft,X_train,y_train,cv=5).mean())

print('voting_clf_all :',cross_val_score(voting_clf_all,X_train,y_train,cv=5))
print('voting_clf_all mean :',cross_val_score(voting_clf_all,X_train,y_train,cv=5).mean())

print('voting_clf_xgb :',cross_val_score(voting_clf_xgb,X_train,y_train,cv=5))
print('voting_clf_xgb mean :',cross_val_score(voting_clf_xgb,X_train,y_train,cv=5).mean())

voting_clf_hard : [0.81005587 0.79775281 0.84269663 0.79213483 0.84831461]
voting_clf_hard mean : 0.8181909484652564
voting_clf_soft : [0.79329609 0.80898876 0.84831461 0.8258427  0.85393258]
voting_clf_soft mean : 0.8260749482141737
voting_clf_all : [0.81005587 0.80337079 0.84269663 0.79775281 0.84831461]
voting_clf_all mean : 0.8204381394764922
voting_clf_xgb : [0.84357542 0.81460674 0.84831461 0.82022472 0.85393258]
voting_clf_xgb mean : 0.8361308141359614


In [121]:
voting_clf_xgb.fit(X_train_scaled, y_train)

In [122]:
#Make Predictions 
voting_clf_hard.fit(X_train_scaled, y_train)
voting_clf_soft.fit(X_train_scaled, y_train)
voting_clf_all.fit(X_train_scaled, y_train)
voting_clf_xgb.fit(X_train_scaled, y_train)

best_rf.fit(X_train_scaled, y_train)
y_hat_vc_hard = voting_clf_hard.predict(X_test_scaled).astype(int)
y_hat_rf = best_rf.predict(X_test_scaled).astype(int)
y_hat_vc_soft =  voting_clf_soft.predict(X_test_scaled).astype(int)
y_hat_vc_all = voting_clf_all.predict(X_test_scaled).astype(int)
y_hat_vc_xgb = voting_clf_xgb.predict(X_test_scaled).astype(int)

In [123]:
#convert output to dataframe 
final_data = {'PassengerId':PassengerId, 'Survived': y_hat_rf}
submission = pd.DataFrame(data=final_data)

final_data_2 = {'PassengerId':PassengerId, 'Survived': y_hat_vc_hard}
submission_2 = pd.DataFrame(data=final_data_2)

final_data_3 = {'PassengerId':PassengerId, 'Survived': y_hat_vc_soft}
submission_3 = pd.DataFrame(data=final_data_3)

final_data_4 = {'PassengerId':PassengerId, 'Survived': y_hat_vc_all}
submission_4 = pd.DataFrame(data=final_data_4)

final_data_5 = {'PassengerId':PassengerId, 'Survived': y_hat_vc_xgb}
submission_5 = pd.DataFrame(data=final_data_5)

final_data_comp = {'PassengerId':PassengerId, 'Survived_vc_hard': y_hat_vc_hard, 'Survived_rf': y_hat_rf, 'Survived_vc_soft' : y_hat_vc_soft, 'Survived_vc_all' : y_hat_vc_all,  'Survived_vc_xgb' : y_hat_vc_xgb}
comparison = pd.DataFrame(data=final_data_comp)

In [124]:
#prepare submission files 
submission.to_csv('submission_rf.csv', index =False)
submission_2.to_csv('submission_vc_hard.csv',index=False)
submission_3.to_csv('submission_vc_soft.csv', index=False)
submission_4.to_csv('submission_vc_all.csv', index=False)
submission_5.to_csv('submission_vc_xgb2.csv', index=False)