In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Titanic train and test CSVs, using the PassengerId column as the row index.
train= pd.read_csv('../input/titanic/train.csv',index_col='PassengerId')
test= pd.read_csv('../input/titanic/test.csv' ,index_col='PassengerId')

In [None]:
# Stack the rows of `test` under `train`, so every later cell that works on
# `train` processes both sets of rows.
# NOTE(review): this rebinds `train`, so re-running the cell appends `test`
# a second time; re-run the loading cell first.
train= pd.concat([train, test])

In [None]:
# Display the combined frame.
train

# Checking for Null Values

In [None]:
# Number of missing values in each column.
train.isnull().sum()

In [None]:
# Fill missing ages with the mean age. The result is assigned back to the
# column: calling fillna(..., inplace=True) on the `train['Age']` selection is
# chained assignment, which pandas warns about and which leaves `train`
# unchanged once Copy-on-Write is enabled.
train['Age'] = train['Age'].fillna(train['Age'].mean())

In [None]:
# Remove the 'Cabin' column from the frame in place.
train.drop(columns='Cabin', inplace=True)

# Feature Engineering

Extracting Family Name from 'Name' column

In [None]:
# The family name is the part of 'Name' that precedes the first ', '.
name_parts = train['Name'].str.split(', ')
train['Family_name'] = name_parts.str.get(0)
train

Extracting Title from 'Name' column

In [None]:
# The title sits between the first ', ' and the following '.' in 'Name'.
after_comma = train['Name'].str.split(', ').str.get(1)
train['Title'] = after_comma.str.split('.').str.get(0)

# Inspect which distinct titles were extracted.
train['Title'].unique()

Minimizing the number of unique values in 'Title' column by grouping similar terms (assigning similar terms to a single category)

In [None]:
# Collapse equivalent or rare titles into 'Miss', 'Mrs' and 'Mr'; any title
# not listed here (e.g. 'Master') is left untouched.
# Every alias appears in exactly one group and no target is itself an alias,
# so the order in which the groups are applied does not matter.
title_groups = {
    'Miss': ['Ms', 'Mlle'],
    'Mrs': ['Mme', 'Dona', 'the Countess', 'Lady'],
    'Mr': ['Rev', 'Jonkheer', 'Dr', 'Capt', 'Don', 'Col', 'Major', 'Sir'],
}
for canonical_title, aliases in title_groups.items():
    train['Title'] = train['Title'].replace(aliases, canonical_title)

Manual Label Encoding 

In [None]:
# Map each title label to an integer code. The outer key restricts the
# replacement to the 'Title' column.
cleanup_nums = {
    "Title": {
        "Mr": 0,
        "Mrs": 1,
        "Miss": 2,
        "Master": 3,
    }
}
train.replace(to_replace=cleanup_nums, inplace=True)

Deriving the size of the family based on 'SibSp' and 'Parch' columns

In [None]:
# SibSp + Parch, plus one (presumably to count the passenger themselves).
train['FamilySize'] = train['SibSp'].add(train['Parch']).add(1)
train

Binning the age column into categories 

In [None]:
# Bucket edges and one label per bucket. pd.cut is called with its default
# right-closed intervals, so an age equal to an edge falls in the lower bucket.
bins = [0, 2, 18, 35, 65, np.inf]
names = ['<2', '2-18', '18-35', '35-65', '65+']

# Ordinal code for each bucket label, in bucket order.
NumberedAgeCategories = dict(zip(names, range(len(names))))

# Bucket the ages, swap each label for its code, and store plain numbers.
age_labels = pd.cut(train['Age'], bins, labels=names)
train['AgeRange'] = pd.to_numeric(age_labels.map(NumberedAgeCategories))
train

# Testing Multiple Models

In [None]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

Models to be tested: <br/>
1- Logistic Regression <br/>
2- Decision Trees

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
train.info()

In [None]:
def LabelEncoder_(data):
    """Label-encode every object-dtype column of ``data``.

    Each such column is cast with ``astype('str')`` and replaced by the
    integer codes returned from ``LabelEncoder.fit_transform``. The frame
    is modified in place and also returned.

    Parameters
    ----------
    data : DataFrame whose object-dtype columns should be encoded.

    Returns
    -------
    The same ``data`` object, with the encoded columns.
    """
    encoder = LabelEncoder()
    object_columns = data.columns[data.dtypes == "object"]
    for column in object_columns:
        # Refit on every column so each one gets its own 0..n-1 code range.
        as_text = data[column].astype('str')
        data[column] = encoder.fit_transform(as_text)
    return data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Encode the object-dtype columns as integers (the frame is modified in place
# and returned).
train = LabelEncoder_(train)

# Discard every row that still has a missing value in any column.
train.dropna(inplace=True)

# Target is 'Survived'; the features are all remaining columns.
y = train['Survived']
X = train.drop(columns='Survived')

# Hold out part of the data for evaluation; seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Logistic regression fitted on the training split and scored on the held-out split.
clf = LogisticRegression(max_iter=100000, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# Depth-limited decision tree fitted on the training split and scored on the held-out split.
clf = DecisionTreeClassifier(random_state=0, max_depth=7)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

# Ensemble Methods

3- Random Forest <br/>
4- XGBoost<br/>
5- Extra Trees <br/>

In [None]:
# Depth-limited random forest fitted on the training split and scored on the held-out split.
clf = RandomForestClassifier(random_state=0, max_depth=7)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# Gradient-boosted trees for the binary 'Survived' target. 'logloss' is the
# binary log-loss metric that matches objective='binary:logistic'
# ('mlogloss' is the multiclass variant).
clf = XGBClassifier(
    random_state=0,
    eval_metric='logloss',
    use_label_encoder=False,
    objective='binary:logistic',
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# Fit on the training split only, like the other models. Fitting on the full
# X, y would let the classifier see the X_test rows it is scored on and
# inflate the reported accuracy.
clf = ExtraTreesClassifier(random_state=0, max_depth=7).fit(X_train, y_train)
clf.score(X_test, y_test)

# Cross Validation

In [None]:
# Unfitted Extra Trees estimator shared by the cells below.
ExtraTrees_clf= ExtraTreesClassifier(random_state=0, max_depth= 7)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

**Without cross-validation:**

In [None]:
# Learn the scaling from the training split only, then apply it to both splits.
scaler = preprocessing.StandardScaler()
X_train_transformed = scaler.fit_transform(X_train)
X_test_transformed = scaler.transform(X_test)

# Single train/test evaluation of the shared Extra Trees estimator.
clf = ExtraTrees_clf.fit(X_train_transformed, y_train)
clf.score(X_test_transformed, y_test)

**With cross-validation:**

In [None]:
# Chain the scaler and the classifier so they are fitted and applied as one estimator.
pipeline = make_pipeline(preprocessing.StandardScaler(), ExtraTrees_clf)

In [None]:
# 5-fold cross-validation of the scaling + Extra Trees pipeline on the full X, y.
scores = cross_val_score(estimator=pipeline, X=X, y=y, cv=5)
scores

In [None]:
# Mean score across the folds.
scores.mean()

# Feature Selection after model training

In [None]:
# Pair each feature name with its importance value from the Extra Trees model
# (ExtraTrees_clf must already have been fitted by an earlier cell).
arr = {
    feature: importance
    for feature, importance in zip(X_train.columns, ExtraTrees_clf.feature_importances_)
}
data = pd.DataFrame.from_dict(arr, orient='index', columns=['importance'])

# Most important features first.
data.sort_values(by='importance', ascending=False, inplace=True)

In [None]:
# Bar chart of the importance values, one bar per row of `data` (feature names on the x axis).
data.plot.bar(y="importance", rot=70, title="Extra Trees Features with their corresponding importance values")

Check this for more feature selection techniques: https://www.analyticsvidhya.com/blog/2020/10/feature-selection-techniques-in-machine-learning/


# Algorithm Tuning using GridSearchCV

In [None]:
# Baseline before tuning: one fit on the training split, scored on the held-out split.
ExtraTrees_clf_temp = ExtraTreesClassifier(random_state=0, max_depth=7)
ExtraTrees_clf_temp.fit(X_train, y_train)
ExtraTrees_clf_temp.score(X_test, y_test)

This is the accuracy from a single train/test split, without cross-validation, so it is unlikely to be a reliable estimate.

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

**GridSearchCV**

In [None]:
# Rebind ExtraTrees_clf to a fresh, unfitted estimator with no max_depth set;
# the grid searches below use it as their base estimator.
ExtraTrees_clf= ExtraTreesClassifier(random_state=0)

In [None]:
# Candidate hyper-parameters: 3 * 5 * 2 * 5 = 150 combinations.
params = {
    "max_depth": [3, 7, 8],
    "n_estimators": [50, 70, 100, 150, 200],
    "criterion": ['gini', 'entropy'],
    "min_samples_split": [2, 3, 4, 5, 6],
}

# scoring=None makes the search fall back to the estimator's own score method.
grid_clf_acc = GridSearchCV(estimator=ExtraTrees_clf, param_grid=params, scoring=None)
grid_clf_acc.fit(X, y)

print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)
print('Grid best score: (CV score=%0.3f) ' % grid_clf_acc.best_score_)

**GridSearch with pipelining:**

In [None]:
# Scale, project with PCA, then classify; all three steps are tuned and
# cross-validated together.
pipe = Pipeline(
    steps=[
        ('StandardScaler', preprocessing.StandardScaler()),
        ('pca', PCA()),
        ('ExtraTrees', ExtraTrees_clf),
    ]
)

# Grid keys follow the "<step name>__<parameter>" convention.
param_grid = {
    'ExtraTrees__max_depth': [3, 7],
    'ExtraTrees__min_samples_leaf': [1, 2],
    'ExtraTrees__criterion': ["gini", "entropy"],
    'pca__n_components': [5],
}

search = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=None)
search.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

PCA is not a good fit in this case, so we can leave it out for this dataset.