In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
data = sns.load_dataset('titanic')

In [3]:
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
X = data.drop('survived', 1)
y = data.survived

## Spliter

1. X_train and X_validation
2. y_train and y_validation

**PS:** Don't use transform training and validation datasets together


In [5]:
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.2, random_state=23)

In [6]:
X_train['sex'] = X_train.sex.map({'male':0,
                'female':1})

X_train.embarked = X_train.embarked.map({'S':1,
                                        'C':2,
                                        'Q':3})

X_train['class'] = X_train['class'].map({'First':1,
                                   'Second':2,
                                   'Third':3
                                  })



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [7]:
drop_columns = ['who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']

X_train = X_train.drop(drop_columns, 1)

In [8]:
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class
151,1,1,22.0,1,0,66.6,1.0,1
753,3,0,23.0,0,0,7.8958,1.0,3
746,3,0,16.0,1,1,20.25,1.0,3
684,2,0,60.0,1,1,39.0,1.0,2
887,1,1,19.0,0,0,30.0,1.0,1


In [9]:
X_train.isna().sum()

pclass        0
sex           0
age         135
sibsp         0
parch         0
fare          0
embarked      1
class         0
dtype: int64

In [10]:
X_train.age.fillna(X_train.age.median(), inplace=True)

In [11]:
X_train.embarked.fillna(1, inplace=True)

# Feature Union

In [12]:
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


In [13]:
pca = PCA(n_components=0.95)
skb = SelectKBest(chi2, k=5)
combined_features = FeatureUnion(transformer_list=[("pca", pca), ("skb",skb)],n_jobs=-1)

In [14]:
data = combined_features.fit_transform(X_train, y_train)

In [15]:
X_train, y_train = data[:, :-1], data[:,-1]

In [16]:
X_train.shape

(712, 6)

In [17]:
y_train.shape

(712,)

In [18]:
model = RandomForestClassifier(n_jobs=-1, n_estimators=20)

In [19]:
y_train=y_train.astype('int')

In [20]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [22]:
model.score(X_train, y_train)

1.0

## Validation Transformation

In [23]:
X_val['sex'] = X_val.sex.map({'male':0,
                'female':1})

X_val.embarked = X_val.embarked.map({'S':1,
                                        'C':2,
                                        'Q':3})

X_val['class'] = X_val['class'].map({'First':1,
                                   'Second':2,
                                   'Third':3
                                  })

drop_columns = ['who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']

X_val = X_val.drop(drop_columns, 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [30]:
X_val.embarked.fillna(1, inplace=True)
X_val.age.fillna(X_val.age.median(), inplace=True)


pca = PCA(n_components=0.95)
skb = SelectKBest(chi2, k=5)
combined_features = FeatureUnion(transformer_list=[("pca", pca), ("skb",skb)],n_jobs=-1)



dataVal = combined_features.fit_transform(X_val, y_val)
X_val, y_val = dataVal[:, :-1], dataVal[:,-1]

y_val=y_val.astype('int')



model.score(X_val, y_val)

1.0

In [32]:
model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [33]:
import joblib
joblib.dump(model, 'RandomFor.joblib')

['RandomFor.joblib']