In [1]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from ipywidgets import widgets
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import warnings
import pickle
warnings.filterwarnings('ignore')

In [2]:
# Read the three CSV files from the current working directory:
# the training set, the test inputs and the test outputs.

train_data, test_in_data, test_out_data = [
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test_in.csv', 'test_out.csv')
]

In [3]:
# Print a quick summary of the training frame: its dimensions, column names,
# total number of missing cells and the number of distinct values per column.

n_rows, n_columns = train_data.shape
overview_lines = [
    f"Rows/Instances: {n_rows}",
    f"\nColumns/Features: {n_columns}",
    f"\nColumn Names: {train_data.columns}",
    f"\nTotal missing values: {train_data.isna().to_numpy().sum()}",
    f"\nUnique values: \n{train_data.nunique()}",
]
# One entry per line, exactly as separate print() calls would emit them.
print(*overview_lines, sep="\n")

Rows/Instances: 891

Columns/Features: 12

Column Names: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Total missing values: 866

Unique values: 
PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64


In [4]:
# Count the missing values in each column of the training frame and print
# one "<column> - <count>" line per column.

column_names = train_data.columns
missing_per_column = train_data.isnull().sum()  # Series indexed by column name
for column in column_names:
    print(f"{column} - {missing_per_column[column]}")

PassengerId - 0
Survived - 0
Pclass - 0
Name - 0
Sex - 0
Age - 177
SibSp - 0
Parch - 0
Ticket - 0
Fare - 0
Cabin - 687
Embarked - 2


In [5]:
# Impute missing 'Age' values.
# Author's note: Age is not correlated with 'Sex' and 'Fare', so the fill value
# is taken from rows that share the same 'Pclass', 'SibSp' and 'Parch' instead.
# If no such row has a known age, the median age of the whole frame is used.

def impute_missing_age(frame):
    """Return the 'Age' column of `frame` with its missing values filled.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'Age', 'Pclass', 'SibSp' and 'Parch'.

    Returns
    -------
    pandas.Series
        A new Series aligned with `frame.index`; `frame` itself is not modified.
        Each missing age is replaced by the median known age of the rows with
        the same ('Pclass', 'SibSp', 'Parch') combination, or by the median of
        all known ages in `frame` when that combination has no known age.
    """
    # Per-row median of the row's group; NaN where the group has no known age.
    group_median = (
        frame.groupby(['Pclass', 'SibSp', 'Parch'])['Age'].transform('median')
    )
    # Computed once from the known ages only, so the fallback value does not
    # depend on the order in which rows are filled.
    overall_median = frame['Age'].median()
    return frame['Age'].fillna(group_median).fillna(overall_median)


# Assign the whole column back to the frame. Writing through
# `frame['Age'].iloc[i] = ...` is chained assignment, which pandas does not
# guarantee will reach the original frame (and never does under Copy-on-Write).
# Each frame is imputed from its own statistics.
train_data['Age'] = impute_missing_age(train_data)
test_in_data['Age'] = impute_missing_age(test_in_data)

In [6]:
# Remove the columns that are not used as model inputs from both frames.
# For the training frame this leaves 'Survived', 'Pclass', 'Sex' and 'Age'.

columns_to_drop = ['Ticket', 'PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_data = train_data.drop(columns=columns_to_drop)
test_in_data = test_in_data.drop(columns=columns_to_drop)

In [7]:
# Encode 'Sex' as an integer code in both frames: male -> 0, female -> 1.
# Any value that is not a key of the mapping becomes NaN, so this cell must
# not be run twice on the same frames.

sex_codes = {'male':0, 'female':1}
for frame in (train_data, test_in_data):
    frame['Sex'] = frame['Sex'].map(sex_codes)

In [8]:
# Prepare the model inputs: the target is the 'Survived' column, the training
# features are every other remaining column, and the prepared test frame is
# used as-is (X_test refers to the same object as test_in_data, not a copy).

Y_train = train_data["Survived"]
X_train = train_data.drop(columns=["Survived"])
X_test = test_in_data

In [10]:
# Fit a 100-tree random forest on the training features and predict the test rows.
# A fixed random_state makes the fitted model, the predictions and the score
# below identical on every Restart & Run All.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)

Y_prediction = random_forest.predict(X_test)

# NOTE: this is the accuracy (in percent, rounded to 2 decimals) on the SAME
# data the model was fitted on, so it is an optimistic figure and not an
# estimate of performance on unseen passengers.
# NOTE(review): Y_prediction is not compared against anything in the visible
# cells; presumably test_out_data holds the expected labels -- confirm and
# evaluate against it (or use cross-validation) for an honest score.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

acc_random_forest

88.44