In [20]:
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'taxis',
 'tips',
 'titanic']

In [16]:
titanic = sns.load_dataset('titanic')

In [17]:
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [18]:
titanic.drop(['deck'],axis=1, inplace =True)

In [25]:
titanic['age'] = np.where(titanic['age'].isna() == True,titanic['age'].mean(), titanic['age'] )

In [26]:
titanic.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       2
class          0
who            0
adult_male     0
embark_town    2
alive          0
alone          0
dtype: int64

In [28]:
titanic.dropna(inplace=True)

In [29]:
titanic.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [34]:
titanic['sex']

0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: sex, Length: 889, dtype: object

In [38]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(titanic['sex'])
titanic['sex'] = le.transform(titanic['sex'])
# 1 is male , 0 is female 

In [40]:
titanic.dtypes

survived          int64
pclass            int64
sex               int32
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
embark_town      object
alive            object
alone              bool
dtype: object

In [44]:
le = preprocessing.LabelEncoder()
le.fit(titanic['embarked'])
titanic['embarked'] = le.transform(titanic['embarked'])
# S is 2, C is 0, Q is 1

In [47]:
le = preprocessing.LabelEncoder()
le.fit(titanic['who'])
titanic['who'] = le.transform(titanic['who'])
# man is 1, woman is 2, child = 0 

In [50]:
le = preprocessing.LabelEncoder()
le.fit(titanic['embark_town'])
titanic['embark_town'] = le.transform(titanic['embark_town'])
# Cherbourg = 0, Southampton = 2, Queenstown = 1

In [53]:
le = preprocessing.LabelEncoder()
le.fit(titanic['alive'])
titanic['alive'] = le.transform(titanic['alive'])
# yes = 1, no = 0 

In [57]:
le = preprocessing.LabelEncoder()
le.fit(titanic['class'])
titanic['class'] = le.transform(titanic['class'])
# First = 0, Second = 1, Third = 2 

In [61]:
le = preprocessing.LabelEncoder()
le.fit(titanic['adult_male'])
titanic['adult_male'] = le.transform(titanic['adult_male'])
# True = 1, False = 0 

In [64]:
le = preprocessing.LabelEncoder()
le.fit(titanic['alone'])
titanic['alone'] = le.transform(titanic['alone'])
# True = 1, False = 0

In [66]:
titanic.dtypes

survived         int64
pclass           int64
sex              int32
age            float64
sibsp            int64
parch            int64
fare           float64
embarked         int32
class            int32
who              int32
adult_male       int64
embark_town      int32
alive            int32
alone            int64
dtype: object

In [76]:
titanic.drop(['alive'], axis=1, inplace=True)

In [86]:
y = titanic['survived']
X = titanic.drop(['survived'], axis=1)

In [87]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [88]:
rfe_selector = RFE(estimator=RandomForestClassifier())
m=rfe_selector.fit(X, y)
X.columns[m.get_support()]
print("Num Features: %s" % (m.n_features_))
print("Selected Features: %s" % (m.support_))
print("Feature Ranking: %s" % (m.ranking_))

Num Features: 6
Selected Features: [ True  True  True False False  True False False  True  True False False]
Feature Ranking: [1 1 1 2 4 1 5 3 1 1 7 6]


In [100]:
features = X.loc[:, m.support_].columns

In [92]:
SFM = SelectFromModel(estimator=RandomForestClassifier())
s=SFM.fit(X, y)

In [93]:
feature_idx = s.get_support()
feature_name = X.columns[feature_idx]
feature_name 

Index(['sex', 'age', 'fare', 'adult_male'], dtype='object')

##### Train_test_split : RFC

In [95]:
from sklearn.model_selection import train_test_split

In [131]:
y = titanic['survived']
x = titanic[features]

In [132]:
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=5)

In [133]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
acc_rfc = rfc.score(X_test, y_test)
print('The accuracy of the Random Forest Classifier is:', acc_rfc * 100, '%')

The accuracy of the Random Forest Classifier is: 85.39325842696628 %


##### Trying another method : SVM

In [151]:
from sklearn import metrics
from sklearn import svm

classifier = svm.SVC(random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

metrics.accuracy_score(y_pred, y_test)

0.6573033707865169

##### Checking hyperparameters

#### Grid search

In [141]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [142]:
grid_search = GridSearchCV(RandomForestClassifier(random_state=0), #take a model like RFC responsible for searching
                           {
                              'n_estimators':np.arange(5,400,5)
                            
                            },cv=5, scoring="accuracy",verbose=1,n_jobs=-1 #my goal is Accuracy , we want verbosity for more details
                           ) #cv = crossvalidation, how many folds you want
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 79 candidates, totalling 395 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'n_estimators': array([  5,  10,  15,  20,  25,  30,  35,  40,  45,  50,  55,  60,  65,
        70,  75,  80,  85,  90,  95, 100, 105, 110, 115, 120, 125, 130,
       135, 140, 145, 150, 155, 160, 165, 170, 175, 180, 185, 190, 195,
       200, 205, 210, 215, 220, 225, 230, 235, 240, 245, 250, 255, 260,
       265, 270, 275, 280, 285, 290, 295, 300, 305, 310, 315, 320, 325,
       330, 335, 340, 345, 350, 355, 360, 365, 370, 375, 380, 385, 390,
       395])},
             scoring='accuracy', verbose=1)

In [143]:
grid_search.best_params_

{'n_estimators': 395}

In [144]:
grid_search.best_score_

0.8073475819954693

#### Random search

In [146]:
rand_search = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                           {
                              'n_estimators':np.arange(5,400,5)
                            
                            },cv=5, scoring="accuracy",verbose=1,n_jobs=-1
                           )
rand_search.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0),
                   n_jobs=-1,
                   param_distributions={'n_estimators': array([  5,  10,  15,  20,  25,  30,  35,  40,  45,  50,  55,  60,  65,
        70,  75,  80,  85,  90,  95, 100, 105, 110, 115, 120, 125, 130,
       135, 140, 145, 150, 155, 160, 165, 170, 175, 180, 185, 190, 195,
       200, 205, 210, 215, 220, 225, 230, 235, 240, 245, 250, 255, 260,
       265, 270, 275, 280, 285, 290, 295, 300, 305, 310, 315, 320, 325,
       330, 335, 340, 345, 350, 355, 360, 365, 370, 375, 380, 385, 390,
       395])},
                   scoring='accuracy', verbose=1)

In [147]:
rand_search.best_params_

{'n_estimators': 345}

In [148]:
rand_search.best_score_

0.8059391312912441

### Chosing model RFC - running with best fitting parameters

In [149]:
rfc = RandomForestClassifier(n_estimators=395, verbose=1)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
acc_rfc = rfc.score(X_test, y_test)
print('The accuracy of the Random Forest Classifier is:', acc_rfc * 100, '%')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


The accuracy of the Random Forest Classifier is: 84.26966292134831 %


[Parallel(n_jobs=1)]: Done 395 out of 395 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 395 out of 395 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 395 out of 395 | elapsed:    0.0s finished


### Input new data

In [136]:
jack_dawson = pd.DataFrame({'pclass':3, 'sex':1, 'age': 20, 'fare': 0, 'who': 1, 'adult_male':1 }, index=[0])

In [137]:
jack_dawson

Unnamed: 0,pclass,sex,age,fare,who,adult_male
0,3,1,20,0,1,1


In [150]:
y_pred = rfc.predict(jack_dawson)
prediction

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 395 out of 395 | elapsed:    0.0s finished


array([0], dtype=int64)