### Submission attempts with Decision Trees, Random Forest and Gradient boosting

Here I read in the training and test data that were already cleaned up in [this](053-Titanic_new.ipynb) notebook.

In [67]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [68]:
# Load the cleaned training data. Feature columns:
# ['Title_Mr', 'Sex_male', 'Sex_female', 'Title_Mrs', 'Title_Miss', 'Pclass_3',
# 'Pclass_1', 'Fare', 'Embarked_C', 'Embarked_S']
# The classification labels ('Survived') were joined onto the training dataframe.
# The files are space-separated, have a header row, and use their first column as the index.
# NOTE(review): the paths are absolute and machine-specific.
train_path = '/home/sophie/projects/Titanic/data/clean_train_53.csv'
train = pd.read_csv(train_path, sep=" ", header=0, index_col=0)
train = train.astype(np.float32)

# Load the cleaned test data in exactly the same way.
test_path = '/home/sophie/projects/Titanic/data/clean_test_53.csv'
test = pd.read_csv(test_path, sep=" ", header=0, index_col=0)
test = test.astype(np.float32)

In [69]:
# Show the first three rows of each frame.
print(train.head(3))
# The test frame has no 'Survived' column - that is what the models will output.
print(test.head(3))

             Title_Mr  Sex_male  Sex_female  Title_Mrs  Title_Miss  Pclass_3  \
PassengerId                                                                    
1                 1.0       1.0         0.0        0.0         0.0       1.0   
2                 0.0       0.0         1.0        1.0         0.0       0.0   
3                 0.0       0.0         1.0        0.0         1.0       1.0   

             Pclass_1       Fare  Embarked_C  Embarked_S  Survived  
PassengerId                                                         
1                 0.0   7.250000         0.0         1.0       0.0  
2                 1.0  71.283302         1.0         0.0       1.0  
3                 0.0   7.925000         0.0         1.0       1.0  
             Title_Mr  Sex_male  Sex_female  Title_Mrs  Title_Miss  Pclass_3  \
PassengerId                                                                    
892               1.0       1.0         0.0        0.0         0.0       1.0   
893           

In [70]:
# First three rows of everything except the last column: the model's x (features).
print(train.iloc[0:3, :-1])
# First three rows of the last column ('Survived'): the model's y (labels).
print(train.iloc[0:3, -1])

             Title_Mr  Sex_male  Sex_female  Title_Mrs  Title_Miss  Pclass_3  \
PassengerId                                                                    
1                 1.0       1.0         0.0        0.0         0.0       1.0   
2                 0.0       0.0         1.0        1.0         0.0       0.0   
3                 0.0       0.0         1.0        0.0         1.0       1.0   

             Pclass_1       Fare  Embarked_C  Embarked_S  
PassengerId                                               
1                 0.0   7.250000         0.0         1.0  
2                 1.0  71.283302         1.0         0.0  
3                 0.0   7.925000         0.0         1.0  
PassengerId
1    0.0
2    1.0
3    1.0
Name: Survived, dtype: float32


Start with the DecisionTreeClassifier

In [71]:
# Decision tree configured with the parameter values noted as optimal.
tree = DecisionTreeClassifier(
    max_depth=3,
    min_samples_split=14,
    min_samples_leaf=4,
    max_leaf_nodes=6,
)

# Split the training frame: every column but the last is a feature (x),
# the last column holds the 'Survived' classification labels (y).
train_inputs = train.iloc[:, :-1]
survived_labels = train.iloc[:, -1]

# Fit the tree on (x, y); the value returned by fit() is kept as tree_fit.
tree_fit = tree.fit(train_inputs, survived_labels)

In [72]:
# Apply the fitted decision tree to the test data.
# PassengerId is the frame's index (index_col=0 at load time), not a feature column,
# so the whole frame can be passed as-is.
output = tree.predict(X=test)

In [76]:
# Assemble the submission table: PassengerId (the test frame's index) alongside the
# predicted Survived values, then cast the whole table to integers.
d = dict(PassengerId=test.index, Survived=output)
output = pd.DataFrame(d).astype(int)

In [78]:
# Write the decision-tree submission; index=False leaves the row index out of the CSV.
output.to_csv(path_or_buf='/home/sophie/projects/Titanic/data/decisiontree_pred.csv', index=False)

### Create a submission with plain old Random Forest as before

In [79]:
# Create a RandomForestClassifier with 100 trees. random_state is fixed so that
# re-running the notebook reproduces the same forest and the same submission.
RF = RandomForestClassifier(n_estimators = 100, random_state = 0)

# Fit the forest: x = every training column except the last, y = the last column ('Survived').
RF_fit = RF.fit(train.iloc[:,0:-1], train.iloc[:,-1])

# Predict with the fitted random forest. This cell previously called tree.predict(test),
# which silently reused the decision tree from earlier in the notebook, so the
# "random forest" submission was really the decision-tree prediction.
# PassengerId is the index of `test`, not a feature column, so the frame is passed as-is.
output = RF.predict(test)

In [80]:
# Assemble the submission table: PassengerId (the test frame's index) alongside the
# predicted Survived values, then cast the whole table to integers.
d = dict(PassengerId=test.index, Survived=output)
output = pd.DataFrame(d).astype(int)

In [81]:
# Write this submission to disk; index=False leaves the row index out of the CSV.
output.to_csv(path_or_buf='/home/sophie/projects/Titanic/data/RF_pred_plain.csv', index=False)

#### Create a submission with a much larger number of estimators

In [82]:
# Create a RandomForestClassifier with a much larger ensemble (10000 trees).
# random_state is fixed so that re-running the notebook reproduces the same result.
RF = RandomForestClassifier(n_estimators = 10000, random_state = 0)

# Fit the forest: x = every training column except the last, y = the last column ('Survived').
RF_fit = RF.fit(train.iloc[:,0:-1], train.iloc[:,-1])

# Predict with the forest that was just fitted. This cell previously called
# tree.predict(test), i.e. the earlier decision tree, which is why changing
# n_estimators appeared to make no difference to the submission.
# PassengerId is the index of `test`, not a feature column, so the frame is passed as-is.
output = RF.predict(test)

In [83]:
# Assemble the submission table: PassengerId (the test frame's index) alongside the
# predicted Survived values, then cast the whole table to integers.
d = dict(PassengerId=test.index, Survived=output)
output = pd.DataFrame(d).astype(int)

In [84]:
# Write this submission to disk; index=False leaves the row index out of the CSV.
output.to_csv(path_or_buf='/home/sophie/projects/Titanic/data/RF_pred_10000estimators.csv', index=False)

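#### This gives exactly the same answer.

Note: identical predictions from differently configured models are suspicious. Check that `predict` is being called on the newly fitted model and not on a model from an earlier cell.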

#### Try playing around with RF parameters as suggested in scikitlearn documentation

In [85]:
# Create a RandomForestClassifier whose trees are allowed to split as far as possible.
# min_samples_split is set to 2 rather than 1: a node holding a single sample cannot be
# split anyway, so the two values grow the same trees, and current scikit-learn releases
# reject an integer min_samples_split below 2 with an error.
# random_state is fixed so that re-running the notebook reproduces the same result.
RF = RandomForestClassifier(n_estimators = 100, min_samples_split = 2, random_state = 0)

# Fit the forest: x = every training column except the last, y = the last column ('Survived').
RF_fit = RF.fit(train.iloc[:,0:-1], train.iloc[:,-1])

# Predict with the forest that was just fitted. This cell previously called
# tree.predict(test), i.e. the earlier decision tree, so the parameter change
# could never show up in the submission.
# PassengerId is the index of `test`, not a feature column, so the frame is passed as-is.
output = RF.predict(test)

In [86]:
# Assemble the submission table: PassengerId (the test frame's index) alongside the
# predicted Survived values, then cast the whole table to integers.
d = dict(PassengerId=test.index, Survived=output)
output = pd.DataFrame(d).astype(int)

In [87]:
# Write this submission to disk; index=False leaves the row index out of the CSV.
output.to_csv(path_or_buf='/home/sophie/projects/Titanic/data/RF_pred_minsample1.csv', index=False)

#### This is also exactly the same

### Next, try Gradient Boosting

In [88]:
# Create a GradientBoostingClassifier with n_estimators = 100.
# NOTE: the names RF / RF_fit are carried over from the random-forest cells for
# compatibility with the rest of the notebook; here they hold a gradient-boosting model.
# random_state is fixed so that re-running the notebook reproduces the same result.
RF = GradientBoostingClassifier(n_estimators = 100, random_state = 0)

# Fit the model: x = every training column except the last, y = the last column ('Survived').
RF_fit = RF.fit(train.iloc[:,0:-1], train.iloc[:,-1])

# Predict with the gradient-boosting model that was just fitted. This cell previously
# called tree.predict(test), i.e. the earlier decision tree, so the file written from
# `output` did not contain gradient-boosting predictions at all.
# PassengerId is the index of `test`, not a feature column, so the frame is passed as-is.
output = RF.predict(test)

In [89]:
# Assemble the submission table: PassengerId (the test frame's index) alongside the
# predicted Survived values, then cast the whole table to integers.
d = dict(PassengerId=test.index, Survived=output)
output = pd.DataFrame(d).astype(int)

In [90]:
# Write this submission to disk; index=False leaves the row index out of the CSV.
output.to_csv(path_or_buf='/home/sophie/projects/Titanic/data/RF_pred_gradboost.csv', index=False)