In [1]:
from numpy.random import randint
import numpy as np # for importing numpy
import pandas as pd # for importing pandas
from sklearn.utils import shuffle # for shuffling the data
from scipy import stats
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'numpy'

# Exploratory data analysis

In [None]:
# Load train.csv; the path is relative to the current working directory
train = pd.read_csv('../data/train.csv')

In [None]:
# First rows and summary statistics, rendered as rich tables
display(train.head())
display(train.describe())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() would add a stray "None" to the output.
train.info()

In [None]:
def plot1dpie(ax,labels_values, labels, title, x,y):
	"""Draw a labelled pie chart on slot ``y`` of a one-dimensional axes container.

	``x`` is accepted so the signature matches ``plot2dpie``; it is not used here.
	"""
	panel = ax[y]
	panel.pie(labels_values, labels = labels, autopct='%1.1f%%', shadow = True, startangle = 90)
	panel.title.set_text(title)

def plot2dpie(ax,labels_values, labels, title,x,y):
	"""Draw a labelled pie chart on cell ``(x, y)`` of a two-dimensional axes grid."""
	panel = ax[x, y]
	panel.pie(labels_values, labels = labels, autopct='%1.1f%%', shadow = True, startangle = 90)
	panel.title.set_text(title)


def plotpie(x,y,ax,labels_values, labels, title):
	"""Draw one pie chart at position ``(x, y)``, whatever the rank of ``ax``.

	A one-dimensional ``ax`` is handled by ``plot1dpie`` (which only uses ``y``);
	anything else is handled by ``plot2dpie``.
	"""
	painter = plot1dpie if len(ax.shape) == 1 else plot2dpie
	painter(ax, labels_values, labels, title, x, y)

def multipie(data, titles, target_col, labels, array_cols, icols = -1, irows = -1, pivot_column = None):
	"""
	Draw a grid of pie charts: first one for the row totals over ``array_cols``,
	then one per column listed in ``array_cols``.

	The wide table that is plotted has one row per value of ``target_col``, e.g.

	|Survived|1  |2 |3  |
	|0       |80 |97|372|
	|1       |136|87|119|

	Parameters
	----------
	data : DataFrame already in the wide form above, or raw rows when
		``pivot_column`` is given.
	titles : one title per entry of ``array_cols``, e.g.
		['1 survival rate','2 survival rate','3 survival rate']
	target_col : column whose values define the wedges, e.g. 'Survived'
	labels : one wedge label per row of the wide table, in row order. When the
		table is built by pivoting, rows follow the sorted values of ``target_col``.
	array_cols : columns of the wide table to plot, e.g. [1, 2, 3]
	icols, irows : grid size; -1 means 3 columns / as many rows as needed
	pivot_column : if not None, ``data`` is first pivoted into counts of
		``target_col`` (rows) by ``pivot_column`` (columns), e.g. 'Pclass'

	Raises
	------
	ValueError : if the requested grid has fewer cells than charts to draw.
	"""

	if pivot_column is not None:
		data = (
			data.pivot_table(index = target_col, columns = pivot_column, aggfunc = 'size', fill_value = 0)
			.reset_index()
			.rename_axis(None, axis = 1)
		)

	# one chart for the totals plus one per requested column
	n_pies = len(array_cols) + 1
	ncols = 3 if icols == -1 else icols
	nrows = (n_pies - 1) // ncols + 1 if irows == -1 else irows
	if nrows * ncols < n_pies:
		raise ValueError('a %d x %d grid cannot hold %d pie charts' % (nrows, ncols, n_pies))

	# squeeze = False always yields a 2-D axes array, so the [row, col] indexing
	# below is valid for single-row and single-column grids as well
	fig, ax = plt.subplots(nrows = nrows, ncols = ncols, figsize=(8,2 * nrows), squeeze = False)

	# first chart: per-row sum over the plotted columns (target_col itself is excluded)
	totals = data[list(array_cols)].sum(axis = 1).tolist()
	plotpie(0,0,ax,totals, labels, title = 'Sum of ' + target_col)

	# then one chart per column, filling the grid row by row
	for i, column in enumerate(array_cols, start = 1):
		plotpie(i // ncols, i % ncols, ax, data[column].tolist(), labels, title = titles[i-1])

	# remove the grid cells that received no chart
	for i in range(n_pies, ncols * nrows):
		fig.delaxes(ax[i // ncols, i % ncols])

	plt.tight_layout()
	plt.show()

In [None]:
# Number of passengers per value of Survived
survived_count = train['Survived'].value_counts()

In [None]:
# Name each wedge after the class it counts instead of relying on the order in
# which value_counts() returns the classes (most frequent first).
outcome_names = {0: 'Died', 1: 'Survived'}
plt.pie(survived_count, labels = [outcome_names[value] for value in survived_count.index], autopct = '%1.1f%%', shadow = True, startangle = 90)
plt.show()

# Pclass

In [None]:
# The Pclass column contains the socioeconomic status of the passengers. It might be predictive for our model
# 1 = upper
# 2 = middle
# 3 = lower

In [None]:
# The pivot inside multipie orders the rows by Survived (0 first, then 1), so the
# wedge labels must be given as Died, Survived to match the counts.
multipie(train, ['1 survival rate','2 survival rate','3 survival rate'], 'Survived', ['Died','Survived'], [1,2,3],icols = 2, irows = 2, pivot_column = 'Pclass')

# Title

In [None]:
# Mr -> mister, an adult man regardless of his marital status
# Miss -> unmarried woman
# Mrs -> married woman
# Dr -> doctor
# Ms -> woman, marital status not specified
# Col -> colonel
# Major -> major
# Rev -> reverend
# Dona -> Madam
# Master -> young boy
# Mlle -> Mademoiselle
#
# royalty -> Don, Jonkheer, Sir, Dona, Mlle
# military -> Capt, Col, Major
# single_women -> Miss, Ms
# married_women -> Mrs
# reverend -> Rev
# master -> young man

In [None]:
# Read the test data
test = pd.read_csv('../data/test.csv')
# The title is the part of Name between the first ', ' and the following '.'
for frame in (test, train):
	after_comma = frame['Name'].str.split(', ', expand = True)[1]
	frame['Title'] = after_comma.str.split('.', expand = True)[0]

In [None]:
# Title_category starts as a copy of Title on both frames
for frame in (train, test):
	frame['Title_category'] = frame['Title']

In [None]:
# Collapse the raw titles into broader categories:
# Royalty       <- Don, Jonkheer, Sir, Dona, Mlle, mme, Lady, the Countess, Mme
# Military      <- Capt, Col, Major
# single_women  <- Miss, Ms
# married_women <- Mrs
# young_man     <- Master
# Mr            <- Rev
# Titles that are not listed are left unchanged.
title_groups = {
	'Royalty': ['Don', 'Jonkheer', 'Sir', 'Dona', 'Mlle','mme','Lady','the Countess','Mme'],
	'Military': ['Capt', 'Col', 'Major'],
	'single_women': ['Miss','Ms'],
	'married_women': ['Mrs'],
	'young_man': ['Master'],
	'Mr': ['Rev'],
}
# flatten to a raw-title -> category lookup
title_to_category = {title: category for category, titles in title_groups.items() for title in titles}

full_data = [train, test]
for data in full_data:
	data['Title_category'] = data['Title_category'].replace(title_to_category)

In [None]:
# Mean of Survived per title category, highest first
title_cat_vals = (
	train.groupby('Title_category')['Survived']
	.mean()
	.sort_values(ascending = False)
)

In [None]:
# Turn the series into a frame with the category as a regular column
title_cat_vals = title_cat_vals.reset_index()

In [None]:
# One pie per title category on a 2 x 4 grid
fig, ax = plt.subplots(2,4, figsize = (10,5))
n_categories = len(title_cat_vals)
for i in range(n_categories):
	rate = title_cat_vals['Survived'].iloc[i]
	panel = ax[i // 4, i % 4]
	# the mean of Survived is the surviving share; the remainder died
	panel.pie([1 - rate, rate], labels = ['Died','Survived'], autopct = '%1.1f%%', shadow = True, startangle = 90)
	panel.set_title(title_cat_vals['Title_category'].iloc[i])

# Remove every grid cell that received no chart, however many categories there are
for unused in ax.ravel()[n_categories:]:
	fig.delaxes(unused)

plt.tight_layout()
plt.show()

In [None]:

# Age has 177 missing values; we will impute them in the feature engineering part.
# For now, look at the distribution of ages by outcome.
survived = train[train['Survived'] == 1]
died = train[train['Survived'] == 0]

# Both histograms go on one explicit axes, and each carries its own label so the
# legend does not depend on the order in which the artists were created.
fig, age_ax = plt.subplots()
sns.histplot(survived['Age'], kde = True, color = 'green', label = 'Survived', ax = age_ax)
sns.histplot(died['Age'], kde = True, color = 'red', label = 'Died', ax = age_ax)
age_ax.set_title('Age distribution by surviving')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Count')
age_ax.legend()
plt.show()

In [None]:
# Is sex important for surviving?
# The pivot inside multipie orders the rows by Survived (0 first, then 1), so the
# wedge labels must be given as Died, Survived to match the counts.
multipie(train, ['Male Survival','Female Survival'],'Survived',['Died','Survived'], ['male','female'], pivot_column = 'Sex')

In [None]:
# SibSp -> siblings or spouse
# Parch -> parents or children
# Family_size is a new feature: the sum of SibSp and Parch
train['Family_size'] = train['SibSp'] + train['Parch']
# Share of survivors for each family size, highest first
class_survived = (
	train.groupby('Family_size')['Survived']
	.mean()
	.sort_values(ascending = False)
	.reset_index()
)
class_survived

In [None]:
# One pie per family size on a 3 x 3 grid, filled row by row
fig, ax = plt.subplots(3,3)
for i in range(len(class_survived)):
	rate = class_survived['Survived'].iloc[i]
	panel = ax[divmod(i, 3)]
	panel.pie([1 - rate, rate], labels = ['Died','Survived'], autopct = '%1.1f%%', shadow = True, startangle = 90)
	panel.set_title(class_survived['Family_size'].iloc[i])

plt.tight_layout()
plt.show()

ticket
I extracted only the first character of each ticket because I thought it might indicate the ticket type.

In [None]:
# Keep only the leading character of every ticket
train['Ticket_first'] = train['Ticket'].map(lambda ticket: str(ticket)[0])
# Mean of Survived per leading character, highest first
ticket_df = (
	train['Survived']
	.groupby(train['Ticket_first'])
	.mean()
	.sort_values(ascending = False)
	.reset_index()
)

In [None]:
# One pie per ticket prefix on a 4 x 4 grid, filled row by row
fig, ax = plt.subplots(4,4, figsize = (12,6))

for i in range(len(ticket_df)):
	rate = ticket_df['Survived'].iloc[i]
	panel = ax[divmod(i, 4)]
	panel.pie([1 - rate, rate], labels = ['Died','Survived'], autopct = '%1.1f%%', shadow = True, startangle = 90)
	panel.set_title(ticket_df['Ticket_first'].iloc[i])

plt.tight_layout()
plt.show()

# Fare

In [None]:
# Density of Fare split by outcome, so the relation between fare and survival is
# actually visible in the plot.
# common_norm = False scales each curve to unit area on its own, which keeps the
# two shapes comparable even though the classes differ in size.
sns.kdeplot(data = train, x = 'Fare', hue = 'Survived', common_norm = False)
plt.show()

There is also a correlation between ticket fares and survival.

# cabin

In [None]:
# Leading character of the cabin; a missing cabin is stringified first, so it yields 'n'
train['Cabin_first'] = train['Cabin'].map(lambda cabin: str(cabin)[0])
# Survival rate per cabin prefix, highest first
cabin_sur = (
	train['Survived']
	.groupby(train['Cabin_first'])
	.mean()
	.sort_values(ascending = False)
	.reset_index()
)

In [None]:
# One pie per cabin prefix on a 3 x 3 grid, filled row by row
fig,ax = plt.subplots(3,3)

for i in range(len(cabin_sur)):
	rate = cabin_sur['Survived'].iloc[i]
	panel = ax[i // 3, i % 3]
	panel.pie([1 - rate, rate], labels = ['Died','Survived'], autopct = '%1.1f%%', shadow = True, startangle = 90)
	# title each chart with the cabin prefix it shows (taken from cabin_sur, the frame being plotted)
	panel.set_title(cabin_sur['Cabin_first'].iloc[i])

plt.tight_layout()
plt.show()

# Embarked
Embarked is a categorical feature that shows the port of embarkation:
C = Cherbourg
Q = Queenstown
S = Southampton

In [None]:
print(train['Embarked'].value_counts())

# Survival rate per port of embarkation
embarked_sur = (
	train.groupby('Embarked')['Survived']
	.mean()
	.reset_index()
)
embarked_sur

In [None]:
# One pie per port on a single row of three axes
fig, ax = plt.subplots(1,3)

for i in range(len(embarked_sur)):
	rt = embarked_sur['Survived'].iloc[i]
	panel = ax[i % 3]
	panel.pie([1 - rt,rt], labels = ['Died','Survived'], autopct = '%1.1f%%', shadow = True, startangle = 90)
	panel.set_title(embarked_sur['Embarked'].iloc[i])

plt.tight_layout()
plt.show()

# feature engineering
We have learned a lot from the exploratory data analysis. Now we can start feature engineering. First, let's load the train and test datasets.

I used two types of imputer from sklearn: the iterative imputer for Age, and the simple imputer (with the most-frequent strategy) for Embarked.

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
display(train.head(3))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() would add a stray "None" to the output.
train.info()

In [None]:
# imputers
imp_embarked = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
# NOTE(review): only the Age column is passed to this imputer, so it has no other
# feature to model Age from -- confirm this is the intended input.
imp_age = IterativeImputer(max_iter = 100, random_state = 34, n_nearest_features = 2)

# impute embarked: fit on train, reuse the fitted imputer on test.
# The imputers return an (n, 1) array; ravel() flattens it to match the column.
train['Embarked'] = imp_embarked.fit_transform(train[['Embarked']]).ravel()
test['Embarked'] = imp_embarked.transform(test[['Embarked']]).ravel()

# impute age: the imputer is fitted on train only and then applied to test, so
# both sets are filled from the same fitted state.
train['Age'] = np.round(imp_age.fit_transform(train[['Age']])).ravel()
test['Age'] = np.round(imp_age.transform(test[['Age']])).ravel()

In [None]:
display(train.head(3))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() would add a stray "None" to the output.
train.info()


We also encode the Sex column.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize label encoder
le = LabelEncoder()

# Encode sex: learn the label -> integer mapping on train, then apply that same
# mapping to test instead of fitting a second, independent one.
train['Sex'] = le.fit_transform(train['Sex'])
test['Sex'] = le.transform(test['Sex'])

In the EDA, we decided to use the family-size feature.

In [None]:
# Family size = SibSp + Parch, computed the same way on both frames
for frame in (train, test):
	frame['Fsize'] = frame['SibSp'] + frame['Parch']

The first characters of Ticket and Cabin are also needed.

In [None]:
# Reduce Ticket and Cabin to their leading character on both frames.
# Values are stringified first, so a missing value yields 'n'.
for frame in (train, test):
	for column in ('Ticket', 'Cabin'):
		frame[column] = frame[column].map(lambda value: str(value)[0])

Now we need some helper functions.

In [None]:
def assign_passenger_label(family_size):
	"""Bucket a family size: 0 -> "Alone", other values up to 3 -> "Small Family", else "Big_Family"."""
	if family_size == 0:
		return "Alone"
	return "Small Family" if family_size <= 3 else "Big_Family"

# group the ticket column
def assign_label_ticket(first):
	"""Map a ticket's leading character to "Ticket_high", "Ticket_medium" or "Ticket_low"."""
	high_prefixes = ("F", '1', 'P', '9')
	medium_prefixes = ('S', 'C', '2')
	if first in high_prefixes:
		return "Ticket_high"
	if first in medium_prefixes:
		return "Ticket_medium"
	# every other leading character
	return "Ticket_low"

# Group the Cabin column  
def assign_label_cabin(cabin):
	"""Map a cabin's leading character to "Cabin_high", "Cabin_middle" or "Cabin_low"."""
	high_decks = ("D", "E", "B", "F", "C")
	middle_decks = ("G", "A")
	if cabin in high_decks:
		return "Cabin_high"
	if cabin in middle_decks:
		return "Cabin_middle"
	# every other leading character
	return "Cabin_low"

Apply the functions.

In [None]:
# Preview the first rows before the grouping helpers are applied
train.head()

In [None]:
# Replace each raw value by its group label, column by column, on both frames
labellers = (
	('Fsize', assign_passenger_label),	# family size
	('Ticket', assign_label_ticket),	# ticket
	('Cabin', assign_label_cabin),	# cabin
)
for column, labeller in labellers:
	for frame in (train, test):
		frame[column] = frame[column].apply(labeller)

It's time to use one-hot encoding.

In [None]:
# Same categorical columns are expanded on both frames; drop_first removes the
# first level of each column.
one_hot_columns = ['Pclass','Embarked','Ticket','Cabin','Title_category','Fsize']
train = pd.get_dummies(train, columns = one_hot_columns, drop_first = True)
test = pd.get_dummies(test, columns = one_hot_columns, drop_first = True)

In [None]:
# Drop the unnecessary columns
target = train["Survived"]
# New frames are created instead of mutating in place, so the objects displayed
# by earlier cells are not changed behind the reader's back.
train = train.drop(columns = ["Survived", "SibSp", "Parch", "Name", "PassengerId",'Title','Ticket_first','Cabin_first','Family_size'])
test = test.drop(columns = ["SibSp", "Parch", "Name","PassengerId",'Title'])

# The two frames were one-hot encoded separately, so make the test columns match
# the training columns exactly (same set, same order). A dummy column missing
# from test is added as all zeros; a column unknown to train is discarded.
test = test.reindex(columns = train.columns, fill_value = 0)

In [None]:
# Preview both frames after encoding and column removal
display(train.head(), test.head())

# Machine learning

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix and target vector as plain arrays
X, y = train.values, target.values

# Hold out 20% of the rows, keeping the class proportions of y in both parts
X_train, X_test, y_train, y_test = train_test_split(
	X, y,
	test_size=0.2,
	random_state=34,
	stratify=y,
)

In [None]:
# Import Necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# NOTE: everything below is disabled. Only the imports above run in this cell.
# The commented-out code is a grid search over random-forest hyperparameters with
# 10-fold cross-validation, followed by an evaluation on the held-out split.
# NOTE(review): the final model trained later presumably reuses the best
# parameters found here -- confirm before changing either.

# Initialize a RandomForestClassifier
#rf = RandomForestClassifier(random_state=34)

#params = {'n_estimators': [50, 100, 200, 300, 350],
#          'max_depth': [3,4,5,7, 10,15,20],
#          'criterion':['entropy', 'gini'],
#          'min_samples_leaf' : [1, 2, 3, 4, 5, 10],
#          'max_features':['sqrt'],
#          'min_samples_split': [3, 5, 10, 15, 20],
#          'max_leaf_nodes':[2,3,4,5],
#          }

#clf = GridSearchCV(estimator=rf,param_grid=params,cv=10, n_jobs=-1)

#clf.fit(X_train, y_train.ravel())

#print(clf.best_estimator_)
#print(clf.best_score_)

#rf_best = clf.best_estimator_

# Predict from the test set
#y_pred = clf.predict(X_test)

# Print the accuracy with accuracy_score function
#print("Accuracy: ", accuracy_score(y_test, y_pred))

# Print the confusion matrix
#print("\nConfusion Matrix\n")
#print(confusion_matrix(y_test, y_pred))

In [None]:
# NOTE: the plotting code below is disabled (it needs rf_best from the disabled
# grid search). The cell only displays the image file fimp.png.
# NOTE(review): fimp.png is presumably a saved copy of this bar plot -- confirm.

# Create a pandas series with feature importances
#importance = pd.Series(rf_best.feature_importances_,index=train.columns).sort_values(ascending=False)

#sns.barplot(x=importance, y=importance.index)
# Add labels to your graph
#plt.xlabel('Importance')
#plt.ylabel('Feature')
#plt.title("Important Features")
#plt.show()
# Display in jupyter notebook
from IPython.display import Image
Image("fimp.png")

In [None]:
# Final model, trained on the full training set.
# max_features is spelled 'sqrt': for classifiers the former 'auto' value meant
# the same thing, and 'auto' is no longer accepted by current scikit-learn.
# Parameters left out keep their library defaults.
last_clf = RandomForestClassifier(
	criterion='gini',
	max_depth=4,
	max_features='sqrt',
	max_leaf_nodes=5,
	min_samples_split=15,
	n_estimators=350,
	oob_score=True,	# estimate accuracy on the out-of-bag samples
	random_state=34,
)

last_clf.fit(train, target)
# out-of-bag accuracy estimate
print("%.4f" % last_clf.oob_score_)

In [None]:
# Passenger ids are re-read from disk because PassengerId was dropped from test
submission_source = pd.read_csv("../data/test.csv")
ids = submission_source[["PassengerId"]].values






In [None]:
# Fill the missing Fare values in test with the mean Fare of the training set.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection may act on a temporary copy and leave test unchanged.
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())

In [None]:
# Make predictions.
# The model was fitted on a DataFrame, so the DataFrame is passed here as well:
# scikit-learn can then check that the test columns match the training columns
# instead of warning about missing feature names.
predictions = last_clf.predict(test)

In [None]:
# Passenger ids (flattened to one dimension) next to the predicted labels
df = dict(PassengerId = ids.ravel(), Survived = predictions)

# Build the submission table
submission = pd.DataFrame(df)

# Write it to disk without the index column
submission.to_csv("submission_last.csv", index=False)

In [None]:
# Extract a single tree from the forest
estimator = last_clf.estimators_[5]

from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot',
                rounded = True, proportion = False,
                precision = 2, filled = True)

# Convert to png using a system command (requires Graphviz).
# check_call raises if dot exits with an error, so a failed conversion is
# reported here rather than when tree.png is loaded afterwards.
from subprocess import check_call
check_call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

In [None]:


# Display the rendered tree image
Image(filename = 'tree.png')