In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [2]:
#train data
data_train = pd.read_csv("train.csv")
data_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
#y_train
y_train = data_train["Survived"].to_numpy()
y_train[:10]

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1], dtype=int64)

In [4]:
#Function to prepare data
def prepare(X):
    print(f"NaN values before: {X.isnull().sum().sum()}")
    
    #1 Deck
    deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
    X['Cabin'] = X['Cabin'].fillna("U0")
    X['Deck'] = X['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
    X['Deck'] = X['Deck'].map(deck)
    X['Deck'] = X['Deck'].fillna(0)
    X['Deck'] = X['Deck'].astype(int)
    
    #2 Age
    mean = X["Age"].mean()
    std = X["Age"].std()
    is_null = X["Age"].isnull().sum()
    rand_age = np.random.randint(mean - std, mean + std, size = is_null)
    age_slice = X["Age"].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    X["Age"] = age_slice
    X.loc[X['Age'] <= 11, 'Age'] = 0
    X.loc[(X['Age'] > 11) & (X['Age'] <= 18), 'Age'] = 1
    X.loc[(X['Age'] > 18) & (X['Age'] <= 22), 'Age'] = 2
    X.loc[(X['Age'] > 22) & (X['Age'] <= 27), 'Age'] = 3
    X.loc[(X['Age'] > 27) & (X['Age'] <= 33), 'Age'] = 4
    X.loc[(X['Age'] > 33) & (X['Age'] <= 40), 'Age'] = 5
    X.loc[(X['Age'] > 40) & (X['Age'] <= 66), 'Age'] = 6
    X.loc[X['Age'] > 66, 'Age'] = 6
    X["Age"] = X["Age"].astype(int)
    
    #3 Embarked
    common_value = 'S'
    X['Embarked'] = X['Embarked'].fillna(common_value)
    ports = {"S": 0, "C": 1, "Q": 2}
    X['Embarked'] = X['Embarked'].map(ports)
    
    #4 Fare
    X['Fare'] = X['Fare'].fillna(0) 
    X.loc[X['Fare'] <= 7.91, 'Fare'] = 0
    X.loc[(X['Fare'] > 7.91) & (X['Fare'] <= 14.454), 'Fare'] = 1
    X.loc[(X['Fare'] > 14.454) & (X['Fare'] <= 31), 'Fare']   = 2
    X.loc[(X['Fare'] > 31) & (X['Fare'] <= 99), 'Fare']   = 3
    X.loc[(X['Fare'] > 99) & (X['Fare'] <= 250), 'Fare']   = 4
    X.loc[X['Fare'] > 250, 'Fare'] = 5
    X['Fare'] = X['Fare'].astype(int)
    
    #5 Title
    titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    X['Title'] = X.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    X['Title'] = X['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr',\
                                        'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    X['Title'] = X['Title'].replace('Mlle', 'Miss')
    X['Title'] = X['Title'].replace('Ms', 'Miss')
    X['Title'] = X['Title'].replace('Mme', 'Mrs')
    X['Title'] = X['Title'].map(titles)
    X['Title'] = X['Title'].fillna(0)
    
    #6 Sex
    genders = {"male": 0, "female": 1}
    X['Sex'] = X['Sex'].map(genders)
    X['Sex'] = X['Sex'].astype(int)
    
    #7 Family
    X['Family'] = X.Parch + X.SibSp + 1
    
    #8 Is alone
    X['isAlone'] = X['Family'] == 1
    X['isAlone'] = X['isAlone'].astype(int)
    
    #9 Age class
    #X['Age_Class']= X['Age'] * X['Pclass']
    
    X = X.drop(['PassengerId','Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1)
    
    print(f"NaN values after: {X.isnull().sum().sum()}")
    return X

In [5]:
#X_train
X_train = data_train
X_train = prepare(X_train)
X_train = X_train.drop(['Survived'], axis = 1)
print(X_train.dtypes)
X_train

NaN values before: 866
NaN values after: 0
Pclass      int64
Sex         int32
Age         int32
Fare        int32
Embarked    int64
Deck        int32
Title       int64
Family      int64
isAlone     int32
dtype: object


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Deck,Title,Family,isAlone
0,3,0,2,0,0,8,1,2,0
1,1,1,5,3,1,3,3,2,0
2,3,1,3,1,0,8,2,1,1
3,1,1,5,3,0,3,3,2,0
4,3,0,5,1,0,8,1,1,1
...,...,...,...,...,...,...,...,...,...
886,2,0,3,1,0,8,5,1,1
887,1,1,2,2,0,2,2,1,1
888,3,1,3,2,0,8,2,4,0
889,1,0,3,2,1,3,1,1,1


In [6]:
#X_train to numpy array
X_train = X_train.to_numpy()
print(f"X_train.shape = {X_train.shape}")

X_train.shape = (891, 9)


In [7]:
#Test data
data_test = pd.read_csv("test.csv")
data_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [8]:
#saving passenger Ids
passengerId = data_test["PassengerId"].to_numpy()
print(f"passengerId.shape = {passengerId.shape}")
passengerId[:10]

passengerId.shape = (418,)


array([892, 893, 894, 895, 896, 897, 898, 899, 900, 901], dtype=int64)

In [9]:
#X_test
X_test = data_test
X_test = prepare(X_test)
print(X_test.dtypes)
X_test

NaN values before: 414
NaN values after: 0
Pclass      int64
Sex         int32
Age         int32
Fare        int32
Embarked    int64
Deck        int32
Title       int64
Family      int64
isAlone     int32
dtype: object


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Deck,Title,Family,isAlone
0,3,0,5,0,2,8,1,1,1
1,3,1,6,0,0,8,3,2,0
2,2,0,6,1,2,8,1,1,1
3,3,0,3,1,0,8,1,1,1
4,3,1,2,1,0,8,3,3,0
...,...,...,...,...,...,...,...,...,...
413,3,0,2,1,0,8,1,1,1
414,1,1,5,4,1,3,5,1,1
415,3,0,5,0,0,8,1,1,1
416,3,0,6,1,0,8,1,1,1


In [10]:
#X_test to numpy array
X_test = X_test.to_numpy()
print(f"X_train.shape = {X_test.shape}")

X_train.shape = (418, 9)


In [11]:
#Classification
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0).fit(X_train, y_train)
print("Train accuracy= {:.3%}".format(clf.score (X_train, y_train)))

Train accuracy= 83.838%


In [12]:
#Cross validation
scores = cross_val_score(clf, X_train, y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.83333333 0.8988764  0.76404494 0.86516854 0.84269663 0.80898876
 0.83146067 0.78651685 0.86516854 0.85393258]
Mean: 0.8350187265917602
Standard Deviation: 0.03794111012337763


In [13]:
#Prediction
y_train_scores = clf.predict(X_train)
r_a_score = roc_auc_score(y_train, y_train_scores)
print("ROC-AUC-Score:", r_a_score)

ROC-AUC-Score: 0.8219969322212636


In [14]:
#Prediction
y_predicted = clf.predict(X_test)
y_predicted[:10]

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0], dtype=int64)

In [15]:
res_array = np.column_stack((passengerId, y_predicted))
result = pd.DataFrame(res_array, columns = ["PassengerId", "Survived"])
result.to_csv("submission.csv", index=False)
display(result)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
