# Random Forest
> Random Forest is an ensemble learning method that is flexible and easy to use. It is a non-linear ensemble learning binary classifier which neglects the correlation of the data. Random forest is an advanced version of the decision tree. Random forests are a way of averaging multiple deep decision trees, trained on different parts of the same training set, with the goal of reducing the variance. This comes at the expense of a small increase in the bias and some loss of interpretability, but generally greatly boosts the performance of the final model. Instead of searching for the most important feature while splitting a node, it searches for the best feature among a random subset of features. This produces a wide diversity among the trees, which generally results in a better model.

# Grid Search
> Grid search is a technique for parameter tuning, which is the process of selecting the values for a model’s parameters that maximize the accuracy of the model. Grid Search does this by fitting the model with every combination of parameters and selecting the combination for which the model had the best score.

# *Please upvote the kernel if you find it insightful*

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

# Import train and test data

In [None]:
# Read the training and test CSV files.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')

# Keep the passenger IDs of the test set; they are used for the submission.
PassengerId = test['PassengerId']

# Stack both splits into a single frame with a fresh 0..n-1 index.
dataset = pd.concat((train, test), ignore_index=True)

In [None]:
# Count the missing values per column of the combined dataset.
# Missing entries are first filled with NaN, then counted with isnull().
dataset = dataset.fillna(np.nan)
dataset.isnull().sum()

In [None]:
# Print the column summary of the train set, then count the missing values
# per column.
train.info()
train.isnull().sum()

In [None]:
# Show the first five rows of the train set
train.head()

In [None]:
# Show the data type of every column in the train set
train.dtypes

In [None]:
# Generate descriptive statistics that summarize the central tendency,
# dispersion and shape of the train set's distribution
train.describe()

# Data visualization

In [None]:
# Sex feature
# Bar plot of the mean of 'Survived' for each sex.
sns.barplot(x="Sex", y="Survived", data=train, palette='Set3')
# 'Survived' is read as a 0/1 flag, so its mean within a group is the share of
# survivors. Each line is labelled with the group it actually describes (the
# male line was previously mislabelled as "females").
for sex, label in (('female', 'females'), ('male', 'males')):
    survival_rate = train.loc[train['Sex'] == sex, 'Survived'].mean() * 100
    print("Percentage of %s that could survive: %.2f" % (label, survival_rate))

In [None]:
# Pclass feature
# The higher the passenger class, the higher the probability of survival.
sns.barplot(x='Pclass', y='Survived', data=train, palette='Set3')
# Print the share of survivors (Survived == 1) within each passenger class.
for pclass in (1, 2, 3):
    class_outcomes = train['Survived'][train['Pclass'] == pclass]
    survived_share = class_outcomes.value_counts(normalize=True)[1]
    print("Percentage of Pclass = %d, survived probability: %.2f" % (pclass, survived_share * 100))


In [None]:
# SibSp feature
# Passengers with a moderate number of siblings/spouses tend to have a higher
# survival rate.
sns.barplot(x="SibSp", y="Survived", data=train, palette='Set3')

In [None]:
# Parch feature
# Passengers with a moderate number of parents/children tend to have a higher
# survival rate.
sns.barplot(x="Parch", y="Survived", data=train, palette='Set3')

In [None]:
# Age feature
# Children and adolescents have a higher survival rate.
# Draw one kernel density estimate of 'Age' per value of 'Survived'.
age = sns.FacetGrid(train, hue="Survived",aspect=2)
age.map(sns.kdeplot,'Age',shade= True)
# Limit the x axis to the range from 0 to the largest age in the train set.
age.set(xlim=(0, train['Age'].max()))
age.add_legend()

In [None]:
# Fare feature
# Passengers who paid a higher fare had a higher survival rate.
# Draw one kernel density estimate of 'Fare' per value of 'Survived'.
fare = sns.FacetGrid(train, hue="Survived",aspect=2)
fare.map(sns.kdeplot,'Fare',shade= True)
# Show only the 0-200 fare range on the x axis.
fare.set(xlim=(0, 200))
fare.add_legend()

In [None]:
# Title feature
# Extract the title from each passenger's name and group the raw titles into
# six categories: Officer, Royalty, Mrs, Miss, Mr and Master.
def _title_from_name(name):
    """Return the text between the first comma and the following period."""
    after_comma = name.split(',')[1]
    return after_comma.split('.')[0].strip()


dataset['Title'] = dataset['Name'].apply(_title_from_name)

# Raw titles listed under the category they are mapped to.
_title_groups = (
    ('Officer', ['Capt', 'Col', 'Major', 'Dr', 'Rev']),
    ('Royalty', ['Don', 'Sir', 'the Countess', 'Dona', 'Lady']),
    ('Mrs', ['Mme', 'Ms', 'Mrs']),
    ('Miss', ['Mlle', 'Miss']),
    ('Mr', ['Mr']),
    ('Master', ['Master', 'Jonkheer']),
)
Title_Dict = {}
for _category, _raw_titles in _title_groups:
    for _raw_title in _raw_titles:
        Title_Dict[_raw_title] = _category

# Titles that are not in the mapping become NaN.
dataset['Title'] = dataset['Title'].map(Title_Dict)
sns.barplot(x="Title", y="Survived", data=dataset, palette='Set3')

In [None]:
# FamilySize feature
# Family size = SibSp + Parch + 1 (the passenger plus the relatives counted in
# the two columns).
dataset['FamilySize']=dataset['SibSp']+dataset['Parch']+1
sns.barplot(x="FamilySize", y="Survived", data=dataset, palette='Set3')

In [None]:
# Based on the family size, classify them into three groups
def Family_label(s):
    """Bucket a family size into one of three groups.

    Args:
        s: Family size as a single number.

    Returns:
        2 when 2 <= s <= 4, 1 when s == 1 or 4 < s <= 7, 0 when s > 7,
        and None for any other value (e.g. values below 1 or NaN).
    """
    # Plain boolean logic on scalars: chained comparisons instead of the
    # element-wise bitwise operators `&` / `|`.
    if 2 <= s <= 4:
        return 2
    if s == 1 or 4 < s <= 7:
        return 1
    if s > 7:
        return 0
    return None
# Replace each family size by its group label and plot survival per group.
dataset['FamilyLabel']=dataset['FamilySize'].apply(Family_label)
sns.barplot(x="FamilyLabel", y="Survived", data=dataset, palette='Set3')

In [None]:
# Deck feature
# Missing cabins are filled with the placeholder 'Unknown'; the deck is the
# first character of the cabin string, so missing cabins end up on deck 'U'.
cabin = dataset['Cabin'].fillna('Unknown')
dataset['Cabin'] = cabin
dataset['Deck'] = cabin.str[0]
sns.barplot(x="Deck", y="Survived", data=dataset, palette='Set3')

In [None]:
# TicketGroup feature
# For every passenger, count how many passengers share the same ticket number.
ticket_frequencies = dataset['Ticket'].value_counts()
Ticket_Count = dict(ticket_frequencies)
dataset['TicketGroup'] = dataset['Ticket'].map(Ticket_Count)
sns.barplot(x='TicketGroup', y='Survived', data=dataset, palette='Set3')

In [None]:
# Classify the TicketGroup into three kinds
def Ticket_Label(s):
    """Bucket a ticket-group size into one of three groups.

    Args:
        s: Number of passengers sharing a ticket, as a single number.

    Returns:
        2 when 2 <= s <= 4, 1 when s == 1 or 4 < s <= 8, 0 when s > 8,
        and None for any other value (e.g. values below 1 or NaN).
    """
    # Plain boolean logic on scalars: chained comparisons instead of the
    # element-wise bitwise operators `&` / `|`.
    if 2 <= s <= 4:
        return 2
    if s == 1 or 4 < s <= 8:
        return 1
    if s > 8:
        return 0
    return None

# Replace each ticket-group size by its group label and plot survival per group.
dataset['TicketGroup'] = dataset['TicketGroup'].apply(Ticket_Label)
sns.barplot(x='TicketGroup', y='Survived', data=dataset, palette='Set3')

# Fill the missing values

In [None]:
# Fill the missing ages: fit a random forest regressor on Pclass, Sex and Title
# (one-hot encoded) using the rows with a known age, then predict the age of
# the remaining rows.
age = dataset[['Age','Pclass','Sex','Title']]
age = pd.get_dummies(age)
known_age = age[age.Age.notnull()].values
null_age = age[age.Age.isnull()].values
# Column 0 is used as the target (Age); all other columns are the predictors.
x = known_age[:, 1:]
y = known_age[:, 0]
# A fixed seed (the same value the classifiers in this notebook use) makes the
# imputed ages, and everything derived from them, reproducible between runs.
rf = RandomForestRegressor(n_jobs=-1, random_state=10)
rf.fit(x, y)
predictedAge = rf.predict(null_age[:, 1:])
# Rows are selected with the same isnull() test as above, so the predictions
# line up with the rows they were computed for.
dataset.loc[(dataset.Age.isnull()),'Age'] = predictedAge

In [None]:
# Show the rows whose 'Embarked' value is missing.
dataset[dataset['Embarked'].isnull()]

In [None]:
# Median first-class fare for each port of embarkation (C, S and Q).
# The 'Q' median is now computed from the 'Q' rows; it previously reused the
# 'S' filter and therefore printed the 'S' median twice.
first_class = dataset[dataset['Pclass'] == 1]
C = first_class.loc[first_class['Embarked'] == 'C', 'Fare'].median()
print(C)
S = first_class.loc[first_class['Embarked'] == 'S', 'Fare'].median()
print(S)
Q = first_class.loc[first_class['Embarked'] == 'Q', 'Fare'].median()
print(Q)
# Missing ports are filled with 'C'.
# NOTE(review): presumably chosen because the fare of the rows with a missing
# port is closest to the 'C' median printed above - confirm against the data.
dataset['Embarked'] = dataset['Embarked'].fillna('C')

In [None]:
# Show the rows whose 'Fare' value is missing.
dataset[dataset['Fare'].isnull()]

In [None]:
# Fill missing fares with the median fare of third-class passengers who
# embarked at 'S'.
third_class_from_s = (dataset['Embarked'] == "S") & (dataset['Pclass'] == 3)
fare = dataset.loc[third_class_from_s, 'Fare'].median()
dataset['Fare'] = dataset['Fare'].fillna(fare)

In [None]:
# Surname = the part of the name before the first comma.
def _surname_from_name(name):
    """Return the text before the first comma, stripped of whitespace."""
    return name.split(',')[0].strip()


dataset['Surname'] = dataset['Name'].map(_surname_from_name)

# FamilyGroup = number of passengers sharing the same surname.
Surname_Count = dict(dataset['Surname'].value_counts())
dataset['FamilyGroup'] = dataset['Surname'].map(Surname_Count)

# Among passengers whose surname occurs at least twice, separate
# women/children (age <= 12 or female) from adult men (age > 12 and male).
shares_surname = dataset['FamilyGroup'] >= 2
is_woman_or_child = (dataset['Age'] <= 12) | (dataset['Sex'] == 'female')
is_adult_man = (dataset['Age'] > 12) & (dataset['Sex'] == 'male')
Female_Child_Group = dataset.loc[shares_surname & is_woman_or_child]
Male_Adult_Group = dataset.loc[shares_surname & is_adult_man]

In [None]:
# Mean survival per surname among women/children, then the number of surnames
# observed for each distinct mean value.
female_child_rates = Female_Child_Group.groupby('Surname')['Survived'].mean()
Female_Child = female_child_rates.value_counts().to_frame(name='GroupCount')
Female_Child

In [None]:
# Mean survival per surname among adult men, then the number of surnames
# observed for each distinct mean value.
male_adult_rates = Male_Adult_Group.groupby('Surname')['Survived'].mean()
Male_Adult = male_adult_rates.value_counts().to_frame(name='GroupCount')
Male_Adult

In [None]:
# Surnames whose women/children have a mean survival of exactly 0.
Female_Child_Group = Female_Child_Group.groupby('Surname')['Survived'].mean()
none_survived = Female_Child_Group == 0
Dead_List = set(Female_Child_Group[none_survived].index)
print(Dead_List)

# Surnames whose adult men have a mean survival of exactly 1.
Male_Adult_List = Male_Adult_Group.groupby('Surname')['Survived'].mean()
all_survived = Male_Adult_List == 1
Survived_List = set(Male_Adult_List[all_survived].index)
print(Survived_List)

In [None]:
# Split the combined frame by whether 'Survived' is known.
train = dataset.loc[dataset['Survived'].notnull()]
# .copy() makes `test` an independent frame, so the assignments below are
# guaranteed to land on it instead of on a temporary slice of `dataset`.
test = dataset.loc[dataset['Survived'].isnull()].copy()

# The surname column is not modified below, so each mask is computed once.
in_dead_family = test['Surname'].isin(Dead_List)
in_survived_family = test['Surname'].isin(Survived_List)

# Test passengers with a surname in Dead_List get the attributes of a
# 60-year-old 'Mr'.
test.loc[in_dead_family, 'Sex'] = 'male'
test.loc[in_dead_family, 'Age'] = 60
test.loc[in_dead_family, 'Title'] = 'Mr'

# Test passengers with a surname in Survived_List get the attributes of a
# 5-year-old 'Miss'. Applied second, so it wins if a surname is in both sets.
test.loc[in_survived_family, 'Sex'] = 'female'
test.loc[in_survived_family, 'Age'] = 5
test.loc[in_survived_family, 'Title'] = 'Miss'

# Convert the features into numerical values.

In [None]:
# Recombine train and test, keep the target plus the selected features, and
# one-hot encode the categorical columns.
model_columns = ['Survived','Pclass','Sex','Age','Fare','Embarked','Title','FamilyLabel','Deck','TicketGroup']
dataset = pd.concat([train, test])
dataset = pd.get_dummies(dataset[model_columns])

# Rows with a known 'Survived' form the train set; the others form the test
# set, from which the (empty) target column is dropped.
has_target = dataset['Survived'].notnull()
trainset = dataset[has_target]
testset = dataset[dataset['Survived'].isnull()].drop('Survived', axis=1)

# Column 0 is 'Survived' (the target); the remaining columns are the features.
train_matrix = trainset.values
X = train_matrix[:, 1:]
Y = train_matrix[:, 0]

# Model training and prediction
Use grid search to find the best parameters of the random forest classifier.

In [None]:
# Two-step pipeline: select 20 features, then fit a random forest classifier.
select_step = ('select', SelectKBest(k=20))
classify_step = ('classify', RandomForestClassifier(random_state=10, max_features='sqrt'))
pipe = Pipeline([select_step, classify_step])

# Parameter grid; keys use the '<step name>__<parameter>' form to address the
# classifier inside the pipeline.
param_test = {
    'classify__n_estimators': list(range(20, 50, 2)),
    'classify__max_depth': list(range(3, 60, 3)),
}

# Try every combination in the grid, scored by accuracy with 10-fold
# cross-validation, and report the best combination and its score.
gsearch = GridSearchCV(estimator=pipe, param_grid=param_test, scoring='accuracy', cv=10)
gsearch.fit(X, Y)
print(gsearch.best_params_, gsearch.best_score_)

In [None]:
# Final model: the same two steps as the grid-search pipeline, with the number
# of trees and the tree depth fixed by hand.
# NOTE(review): n_estimators / max_depth are hard-coded here rather than read
# from gsearch.best_params_ - confirm they match the search result.
forest_settings = dict(
    random_state=10,
    warm_start=True,
    n_estimators=30,
    max_depth=6,
    max_features='sqrt',
)
select = SelectKBest(k=20)
clf = RandomForestClassifier(**forest_settings)
pipeline = make_pipeline(select, clf)
pipeline.fit(X, Y)

# Cross Validation

In [None]:
# Score the final pipeline with 10-fold cross-validation and report the mean
# and standard deviation of the fold scores.
cv_score = model_selection.cross_val_score(pipeline, X, Y, cv=10)
mean_score = np.mean(cv_score)
std_score = np.std(cv_score)
print("CV Score : Mean - %.7g | Std - %.7g " % (mean_score, std_score))

# Submission

In [None]:
# Predict the test set and write the submission file: one row per test
# passenger with the ID and the predicted 'Survived' value cast to int32.
predictions = pipeline.predict(testset)
survived_as_int = predictions.astype(np.int32)
submission = pd.DataFrame({"PassengerId": PassengerId, "Survived": survived_as_int})
submission.to_csv("submission.csv", index=False)