In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [2]:
# Read the training split into a DataFrame.
# NOTE(review): the path is anchored at the root and uses backslash separators —
# confirm where the data really lives before running on another machine.
train_csv_path = '\\datasets\\train.csv'
trainDf = pd.read_csv(train_csv_path)
# Bare last expression so the notebook renders the first rows.
trainDf.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Feature frame: every column of the raw training data except the target label.
X_train = trainDf.drop(columns=['Survived'])

In [4]:
# Target labels, taken from the raw frame (so they keep trainDf's own row index).
y_train = trainDf.loc[:, 'Survived']

In [5]:
def initialPrep(X):
    """Index a passenger frame by ``PassengerId`` and merge the family counts.

    The ``SibSp`` and ``Parch`` columns are summed into a single
    ``FamilyWith`` column and then removed. The input frame is not modified;
    a new DataFrame is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``PassengerId``, ``SibSp`` and ``Parch``.

    Returns
    -------
    pandas.DataFrame
        ``X`` indexed by ``PassengerId``, without ``SibSp``/``Parch`` and with
        ``FamilyWith`` as the last column.

    Raises
    ------
    KeyError
        If any of the three required columns is missing.
    """
    by_passenger = X.set_index('PassengerId')
    relatives_aboard = by_passenger['SibSp'] + by_passenger['Parch']
    return (
        by_passenger
        .drop(columns=['SibSp', 'Parch'])
        .assign(FamilyWith=relatives_aboard)
    )

In [6]:
# Apply the shared preparation step to the training features.
# Not re-runnable: afterwards PassengerId is the index rather than a column,
# so executing this cell a second time raises KeyError inside initialPrep.
X_train = X_train.pipe(initialPrep)

In [7]:
# Inspect dtypes and non-null counts to decide which columns need imputation.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      891 non-null    int64  
 1   Name        891 non-null    object 
 2   Sex         891 non-null    object 
 3   Age         714 non-null    float64
 4   Ticket      891 non-null    object 
 5   Fare        891 non-null    float64
 6   Cabin       204 non-null    object 
 7   Embarked    889 non-null    object 
 8   FamilyWith  891 non-null    int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 69.6+ KB


In [8]:
# Columns routed through the numeric branch (median imputation + scaling).
numerical_data = [
    'Pclass',
    'Age',
    'Fare',
    'FamilyWith',
]
# Columns routed through the categorical branch (mode imputation + one-hot).
# NOTE(review): Name, Ticket and Cabin look like free-text / near-unique values;
# one-hot encoding them presumably yields very wide, sparse features — confirm intent.
categorical_data = [
    'Name',
    'Sex',
    'Ticket',
    'Cabin',
    'Embarked',
]

In [9]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that narrows a DataFrame to a configured set of columns.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index the input in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column labels."""
        wanted = self.attribute_names
        return X[wanted]

In [10]:
# Numeric branch: select the numeric columns, fill missing values with the
# column median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ("select_numeric", DataFrameSelector(numerical_data)),
        ('imputer', SimpleImputer(strategy='median')),
        ('std_scaler', StandardScaler()),
    ]
)

In [11]:
# Categorical branch: select the categorical columns, fill missing values with
# the most frequent value, then one-hot encode into a dense array.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back to the old one on older installations.
try:
    cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # scikit-learn < 1.2 does not know `sparse_output`.
    cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

cat_pipeline = Pipeline([
    ("select_cat", DataFrameSelector(categorical_data)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': categories unseen during fit encode as all zeros.
    ("cat_encoder", cat_encoder),
])

In [12]:
# Run both branches on the same input and concatenate their outputs column-wise.
preprocess_pipeline = FeatureUnion(
    [
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ]
)

In [13]:
# Fit the preprocessing on the training features and replace X_train with the
# transformed result. The prepared DataFrame is overwritten here, so this cell
# cannot be re-run without first re-running the cells that build X_train.
X_train = preprocess_pipeline.fit_transform(X_train)

In [14]:
# Train the final model on the full, preprocessed training set; the fixed
# random_state keeps the forest reproducible between runs.
forest_clf = RandomForestClassifier(random_state=33, n_estimators=128)
forest_clf.fit(X_train, y_train)
# 10-fold cross-validated accuracy estimate for the same configuration.
# NOTE(review): X_train was already transformed by a preprocessing pipeline
# fitted on every row, so each validation fold has influenced the imputer,
# scaler and encoder — the score is likely somewhat optimistic.
forest_scores = cross_val_score(estimator=forest_clf, X=X_train, y=y_train, cv=10)
np.mean(forest_scores)

0.8294132334581773

In [15]:
# Read the held-out test split.
# NOTE(review): root-anchored path with backslash separators — confirm the data location.
test_csv_path = '\\datasets\\test.csv'
testDf = pd.read_csv(test_csv_path)

In [16]:
# Prepare the test frame exactly like the training frame, then apply the
# already-fitted preprocessing (transform only, no refit).
X_test = preprocess_pipeline.transform(initialPrep(testDf))

In [17]:
# Predict the Survived label for every preprocessed test row.
y_pred = forest_clf.predict(X_test)

In [18]:
# One-column frame of predictions, indexed by the PassengerId column of the
# raw test frame.
predDf = pd.DataFrame({"Survived": y_pred}, index=testDf['PassengerId'])

In [19]:
# Write the predictions (index included) to a CSV in the current working directory.
predDf.to_csv('titanicpredictions.csv')