In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('dataset/dataset.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [3]:
numerical_values = ['Age', 'Fare']
df_numerical = df[numerical_values]
df_numerical

Unnamed: 0,Age,Fare
0,22.0,7.2500
1,38.0,71.2833
2,26.0,7.9250
3,35.0,53.1000
4,35.0,8.0500
...,...,...
1304,,8.0500
1305,39.0,108.9000
1306,38.5,7.2500
1307,,8.0500


In [4]:
categorical_values = ['PassengerId', 'Survived', 'Pclass', 'Sex', 'Cabin', 'Embarked', 'SibSp', 'Parch']
df_categorical = df[categorical_values]
df_categorical

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Cabin,Embarked,SibSp,Parch
0,1,0,3,male,,S,1,0
1,2,1,1,female,C85,C,1,0
2,3,1,3,female,,S,0,0
3,4,1,1,female,C123,S,1,0
4,5,0,3,male,,S,0,0
...,...,...,...,...,...,...,...,...
1304,1305,0,3,male,,S,0,0
1305,1306,1,1,female,C105,C,0,0
1306,1307,0,3,male,,S,0,0
1307,1308,0,3,male,,S,0,0


In [5]:
df.isna().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [6]:
df.describe().round(3)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.377,2.295,29.881,0.499,0.385,33.295
std,378.02,0.485,0.838,14.413,1.042,0.866,51.759
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.896
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.454
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.329


In [7]:
df.drop(['Name', 'Ticket', 'Embarked', 'SibSp', 'Parch', 'Fare', 'PassengerId'], axis=1, inplace=True)

In [8]:
df['Age'] = df['Age'].fillna(df['Age'].mean())
scaler = MinMaxScaler()
df[['Age']] = scaler.fit_transform(df[['Age']])

In [9]:
df['Cabin'] = df['Cabin'].apply(lambda x: x if pd.isna(x) else str(x)[0])

df['Cabin'] = df['Cabin'].ffill()
most_common = df['Cabin'].value_counts()
df['Cabin'] = df['Cabin'].fillna(most_common.index[0])

cabin_encoder = OneHotEncoder()
df[cabin_encoder.get_feature_names_out()] = cabin_encoder.fit_transform(df['Cabin'].to_numpy().reshape(-1, 1)).toarray()
df.drop(['Cabin'], axis=1, inplace=True)

In [10]:
sex_encoder = OneHotEncoder()
df[sex_encoder.get_feature_names_out()] = sex_encoder.fit_transform(df['Sex'].to_numpy().reshape(-1, 1)).toarray()
df.drop(['Sex'], axis=1, inplace=True)

In [11]:
pclass_encoder = OneHotEncoder()
df[pclass_encoder.get_feature_names_out()] = pclass_encoder.fit_transform(df['Pclass'].to_numpy().reshape(-1, 1)).toarray()
df.drop(['Pclass'], axis=1, inplace=True)

In [12]:
df

Unnamed: 0,Survived,Age,x0_A,x0_B,x0_C,x0_D,x0_E,x0_F,x0_G,x0_T,x0_female,x0_male,x0_1,x0_2,x0_3
0,0,0.273456,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1,0.473882,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1,0.323563,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1,0.436302,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0,0.436302,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,0,0.372180,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1305,1,0.486409,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1306,0,0.480145,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1307,0,0.372180,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [13]:
x, y = df.drop(['Survived'], axis=1).values, df['Survived'].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

In [14]:
lsvc = LinearSVC()
lsvc.fit(x, y)
score = lsvc.score(x_test, y_test)
matrix = confusion_matrix(y_test, lsvc.predict(x_test))
print(f'Accuracy: {score*100:.2f}%')
print(matrix)

Accuracy: 86.77%
[[208  22]
 [ 30 133]]


In [15]:
dtc = DecisionTreeClassifier()
dtc.fit(x, y)
score = dtc.score(x_test, y_test)
matrix = confusion_matrix(y_test, dtc.predict(x_test))
print(f'Accuracy: {score*100:.2f}%')
print(matrix)

Accuracy: 93.64%
[[225   5]
 [ 20 143]]


In [16]:
rfc = RandomForestClassifier()
rfc.fit(x, y)
score = rfc.score(x_test, y_test)
matrix = confusion_matrix(y_test, rfc.predict(x_test))
print(f'Accuracy: {score*100:.2f}%')
print(matrix)

Accuracy: 94.66%
[[223   7]
 [ 14 149]]
