# Support Vector Machines with Python

Welcome to the Decision Tree and Random Forest in Python for the Titanic Crash.

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#Getting Data

train = pd.read_csv('titanic_train.csv')
test = pd.read_csv('titanic_test.csv')

**Since we already know the data of this problem, I am going to clean the data. But I know the it isn't the only way to clean it.**

In [3]:
# Data Cleaning

# Defining a fuction to fulfill the missing data in age column
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):

        if Pclass == 1:
            return 37

        elif Pclass == 2:
            return 29

        else:
            return 24

    else:
        return Age

In [4]:
# Applying to test and train data

train['Age'] = train[['Age','Pclass']].apply(impute_age,axis=1)
test['Age'] = test[['Age','Pclass']].apply(impute_age,axis=1)

In [5]:
test['Fare'].loc[152]=test['Fare'].mean()
test['Fare'].loc[152]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


35.6271884892086

In [6]:
# Cleaning Cabin column

train.drop('Cabin',axis=1,inplace=True)
test.drop('Cabin',axis=1,inplace=True)

In [7]:
#Checking for NaN in data

train[train['Fare'].isna()==True]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked


In [8]:
#Checking for NaN in data

test[test['Fare'].isna()==True]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked


In [10]:
## Converting Categorical Features and dropping unuseful in train data.

train_new = pd.get_dummies(data=train,columns=['Sex'],drop_first=True)
train_new = pd.get_dummies(data=train_new,columns=['Embarked'],drop_first=True)
train_new.drop(['Name','Ticket'],axis=1,inplace=True)
train_new.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,1,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,1
4,5,0,3,35.0,0,0,8.05,1,0,1


In [11]:
## Converting Categorical Features and dropping unuseful in test data

test_new = pd.get_dummies(data=test,columns=['Sex'],drop_first=True)
test_new = pd.get_dummies(data=test_new,columns=['Embarked'],drop_first=True)
test_new.drop(['Name','Ticket'],axis=1,inplace=True)
test_new.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,7.8292,1,1,0
1,893,3,47.0,1,0,7.0,0,0,1
2,894,2,62.0,0,0,9.6875,1,1,0
3,895,3,27.0,0,0,8.6625,1,0,1
4,896,3,22.0,1,1,12.2875,0,0,1


## Train Test Split

In [12]:
from sklearn.model_selection import train_test_split

In [260]:
X_train,X_test,y_train,y_test=train_test_split(train_new.drop(['PassengerId','Survived'],axis=1),train_new['Survived'],
                                               test_size=0.33,random_state=101)

## Decision Trees

We'll start just by training a single decision tree.

In [235]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier().fit(X_train,y_train)

## Prediction and Evaluation 

Let's evaluate our decision tree.

In [237]:
from sklearn.metrics import classification_report,confusion_matrix
predictions = dtree.predict(X_test)
print(classification_report(y_test,predictions))
print('\n')
print(confusion_matrix(y_test,predictions))

             precision    recall  f1-score   support

          0       0.77      0.87      0.82       169
          1       0.79      0.66      0.72       126

avg / total       0.78      0.78      0.78       295



[[147  22]
 [ 43  83]]


## Random Forests

Now let's compare the decision tree model to a random forest.

In [162]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=350)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=350, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [163]:
rfc_pred = rfc.predict(X_test)

In [164]:
print(confusion_matrix(y_test,rfc_pred))
print(classification_report(y_test,rfc_pred))

[[169  22]
 [ 29  75]]
             precision    recall  f1-score   support

          0       0.85      0.88      0.87       191
          1       0.77      0.72      0.75       104

avg / total       0.83      0.83      0.83       295



In [242]:
for i in range(1,10):
    rfc = RandomForestClassifier(n_estimators=i*50)
    rfc.fit(X_train, y_train)
    rfc_pred = rfc.predict(X_test)
    print(confusion_matrix(y_test,rfc_pred))
    print(classification_report(y_test,rfc_pred))
    print(i*50)

[[142  27]
 [ 34  92]]
             precision    recall  f1-score   support

          0       0.81      0.84      0.82       169
          1       0.77      0.73      0.75       126

avg / total       0.79      0.79      0.79       295

50
[[148  21]
 [ 35  91]]
             precision    recall  f1-score   support

          0       0.81      0.88      0.84       169
          1       0.81      0.72      0.76       126

avg / total       0.81      0.81      0.81       295

100
[[147  22]
 [ 36  90]]
             precision    recall  f1-score   support

          0       0.80      0.87      0.84       169
          1       0.80      0.71      0.76       126

avg / total       0.80      0.80      0.80       295

150
[[144  25]
 [ 33  93]]
             precision    recall  f1-score   support

          0       0.81      0.85      0.83       169
          1       0.79      0.74      0.76       126

avg / total       0.80      0.80      0.80       295

200
[[148  21]
 [ 37  89]]
          

In [244]:
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier

In [305]:
X_train,X_test,y_train,y_test=train_test_split(train_new.drop(['PassengerId','Survived'],axis=1),train_new['Survived'],
                                               test_size=0.33,random_state=72)

# Your code ...
rf = GradientBoostingClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print(confusion_matrix(y_test,rf_pred))
print(classification_report(y_test,rf_pred))

[[168  18]
 [ 20  89]]
             precision    recall  f1-score   support

          0       0.89      0.90      0.90       186
          1       0.83      0.82      0.82       109

avg / total       0.87      0.87      0.87       295



In [26]:
X_Train2 = train_new.drop(['PassengerId','Survived'],axis=1)
y_Train2 = train_new['Survived']
X_Test2 = test_new.drop('PassengerId',axis=1)

In [306]:
rfc2 = GradientBoostingClassifier()
rfc2.fit(X_Train2, y_Train2)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [307]:
rfc2_pred = rfc2.predict(X_Test2)

In [308]:
## Create a DF from predictions and columns to start at 892

df = pd.DataFrame(data=rfc2_pred,index=test['PassengerId'],columns=["Survived"])
df.to_csv('Predictions_RF.csv')