In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the Titanic training and test splits
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/titanic/train.csv", "/kaggle/input/titanic/test.csv")
)

# Keep the test passenger IDs; they are needed later for the submission file
PassengerId = test['PassengerId']
train.head()

In [None]:
# Ticket_type: integer code for the first three characters of the ticket string.
# Both frames are encoded against ONE shared category list so that a given
# prefix maps to the same integer in train and in test. Encoding each frame
# separately would give the same prefix different codes in the two frames.
train_ticket_prefix = train['Ticket'].str[:3]
test_ticket_prefix = test['Ticket'].str[:3]

ticket_prefix_dtype = pd.CategoricalDtype(
    categories=sorted(pd.concat([train_ticket_prefix, test_ticket_prefix]).dropna().unique())
)

train['Ticket_type'] = train_ticket_prefix.astype(ticket_prefix_dtype).cat.codes
test['Ticket_type'] = test_ticket_prefix.astype(ticket_prefix_dtype).cat.codes

train.head(3)

In [None]:
# Both frames in one list so later cells can apply the same step to each
full_data = [train, test]

# Words_Count: number of whitespace-separated tokens in the passenger's name
for frame in full_data:
    frame['Words_Count'] = frame['Name'].apply(lambda full_name: len(full_name.split()))


In [None]:
# Has_Cabin: 1 when a cabin value is recorded for the passenger, 0 when it is missing.
# notna() tests for missing values directly (NaN and None alike) instead of
# inferring "missing" from the Python type of each cell.
train['Has_Cabin'] = train['Cabin'].notna().astype(int)
test['Has_Cabin'] = test['Cabin'].notna().astype(int)


In [None]:
# FamilySize: SibSp + Parch + 1 (the passenger themself)
# IsAlone:    1 when FamilySize is exactly 1, otherwise 0
for dataset in full_data:
    dataset['FamilySize'] = 1 + dataset['SibSp'] + dataset['Parch']
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype('int64')


In [None]:
# Most frequent Embarked value(s) in the training data
mode = train.loc[:, 'Embarked'].mode()
mode

In [None]:
# Fill missing Embarked values in both frames with the most frequent value
# observed in the training data (computed here rather than hard-coded).
embarked_mode = train['Embarked'].mode().iloc[0]

for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna(embarked_mode)

In [None]:
# Box plot of the training-set fares, used to inspect the spread and outliers
fig, ax = plt.subplots()
ax.boxplot(train['Fare'])
ax.set(title='Fare distribution (train)', ylabel='Fare')
plt.show()

**As we can see, there are many outliers in the Fare feature, so we use the median to fill in missing values.**

In [None]:
# Fill missing Fare values in both frames with the training-set median
train_fare_median = train['Fare'].median()
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train_fare_median)
    

In [None]:
# CategoricalFare: training fares split into 6 quantile-based bins
train['CategoricalFare'] = pd.qcut(x=train['Fare'], q=6)

In [None]:
# Impute missing ages, then create the CategoricalAge feature.
# Each frame's missing ages are replaced with random integers drawn from
# [mean - std, mean + std) of that frame's known ages.
# A seeded generator keeps the imputed values, and everything downstream,
# reproducible across runs.
age_rng = np.random.default_rng(42)

for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_missing = dataset['Age'].isna()

    # Bounds are truncated to integers explicitly before drawing.
    age_null_random_list = age_rng.integers(
        int(age_avg - age_std),
        int(age_avg + age_std),
        size=int(age_missing.sum()),
    )

    # .loc writes into the frame itself; chained indexing
    # (dataset['Age'][mask] = ...) may write to a temporary copy instead.
    dataset.loc[age_missing, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# CategoricalAge: training ages split into 6 equal-width bins
train['CategoricalAge'] = pd.cut(train['Age'], 6)

In [None]:
#Define function to extract titles from passenger names
def get_title(name):
    """Extract the title from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. ``"Braund, Mr. Owen"`` -> ``"Mr"``.

    Parameters
    ----------
    name : str
        Passenger name.

    Returns
    -------
    str
        The matched title, or ``""`` when no title is found or ``name`` is
        not a string (for example a missing value).
    """
    if not isinstance(name, str):
        return ""
    # Raw string so that `\.` reaches the regex engine as an escaped period
    # instead of being an invalid Python string escape.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

# Titles that are collapsed into the single group "Rare"
RARE_TITLES = ['Lady','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona']

# Title: extracted from Name, with uncommon titles grouped as "Rare" and
# the variants Mlle/Ms and Mme folded into Miss and Mrs respectively
for dataset in full_data:
    titles = dataset['Name'].apply(get_title)
    titles = titles.replace(RARE_TITLES, 'Rare')
    titles = titles.replace('Mlle', 'Miss')
    titles = titles.replace('Ms', 'Miss')
    titles = titles.replace('Mme', 'Mrs')
    dataset['Title'] = titles
    

In [None]:
# Preview the first five rows with the engineered columns
train.head()

In [None]:
# Distinct Embarked values present in the training data
pd.unique(train['Embarked'])

In [None]:
#Mapping the dataset

# Bin edges: a value v receives code k when edges[k-1] <= v < edges[k];
# values below the first edge get 0 and values at/above the last edge get len(edges).
FARE_EDGES = [6, 12, 18, 24, 30]
AGE_EDGES = [13, 26, 39, 52, 65]


def _bin_codes(values, edges):
    """Recode a numeric Series into 0..len(edges) bin indices, leaving NaN entries untouched."""
    codes = np.digitize(values, edges)
    return values.where(values.isna(), codes)


for dataset in full_data:
    # Sex -> 0 (female) / 1 (male)
    sex_codes = {'female':0, 'male':1}
    dataset['Sex'] = dataset['Sex'].map(sex_codes).astype(int)

    # Title -> 1..5, with any title outside the mapping encoded as 0
    title_codes = {"Mr":1,"Miss":2,"Mrs":3,"Master":4,"Rare":5}
    dataset['Title'] = dataset['Title'].map(title_codes).fillna(0)

    # Embarked -> 0 (S) / 1 (C) / 2 (Q)
    port_codes = {'S':0,'C':1,'Q':2}
    dataset['Embarked'] = dataset['Embarked'].map(port_codes).astype(int)

    # Fare and Age -> ordinal bin codes 0..5
    dataset['Fare'] = _bin_codes(dataset['Fare'], FARE_EDGES)
    dataset['Age'] = _bin_codes(dataset['Age'], AGE_EDGES)

In [None]:
#Feature Selection
# Columns removed from both frames; the two Categorical* helper columns exist only in train
drop_elements = ['PassengerId','Name','Ticket','Cabin','SibSp']
train = train.drop(columns=drop_elements + ['CategoricalAge', 'CategoricalFare'])
test = test.drop(columns=drop_elements)

In [None]:
# Preview the first three rows after feature selection
train.head(n=3)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of train.
# The diverging RdBu colormap is passed to the plot explicitly.
colormap = plt.cm.RdBu
plt.figure(figsize=(14,12))
plt.title('Feature correlation (train)')
sns.heatmap(train.astype(float).corr(), cmap=colormap, annot=True);

In [None]:
# Target labels, selected by column name rather than by position so the
# cell does not depend on 'Survived' being the first column.
y_train = train['Survived']

In [None]:
# Convert the labels to a NumPy array. np.asarray also accepts an array,
# so re-running this cell is harmless.
y_train = np.asarray(y_train)

In [None]:
# Display the label array
y_train

In [None]:
# Remove the target column so that train holds only the features
train = train.drop(columns=['Survived'])

In [None]:
#x_train = train.iloc[:,[0,1,4,7,8,10,11]].values

In [None]:
# Feature matrix: every remaining column of train as a NumPy array
x_train = train.to_numpy()

In [None]:
# Display the training feature matrix
x_train

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 shallow trees (max depth 3).
# random_state is fixed so that fitting gives the same model on every run.
Classification = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    max_depth=3,
    min_samples_split=2,
    random_state=42,
)

In [None]:
# Train the classifier on the training features and labels
Classification.fit(X=x_train, y=y_train)

In [None]:
# Model score as a percentage, rounded to two decimals.
# NOTE: this is computed on the same data the model was fitted on.
score = round(100 * Classification.score(x_train, y_train), 2)
score

In [None]:
# Test feature matrix. Columns are selected in the order of the training
# columns, so the model sees the features in the order it was fitted on
# even if the two frames were built with a different column order.
x_test = test[train.columns].values

In [None]:
# Predicted Survived label for every test passenger
Prediction = Classification.predict(X=x_test)

In [None]:
# Submission table: one row per test passenger with its predicted label
submission_columns = {
    'PassengerId': PassengerId,
    'Survived': Prediction,
}
Submission = pd.DataFrame(submission_columns)

In [None]:
# Write the submission file without the index column
Submission.to_csv(path_or_buf="Submit.csv", index=False)