Titanic dataset is one of the datasets available in sklearn.

You are given:
1. A Training dataset csv file with X train and Y train data
2. A X test File and you have to predict and submit predictions for this file.

Your task is to:
1. Use Logistic Regression and come with predictions.

In [99]:
import numpy as np
import pandas as pd

In [100]:
# load training data
train_data = pd.read_csv('training_titanic_x_y_train.csv')
train_data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S,0


In [101]:
# split data into features and response
X_train = train_data.iloc[:, :-1]
Y_train = train_data.iloc[:, -1]
Y_train = np.array(Y_train)

In [102]:
# check null values
X_train.isnull().sum()

Pclass        0
Name          0
Sex           0
Age         132
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       514
Embarked      1
dtype: int64

In [103]:
def del_col_fill_val(X_data):
    # delete columns
    del X_data['Name']
    del X_data['Ticket']
    del X_data['Cabin']
    
    # filling values
    common_value = 'S'
    data = [X_data]

    for dataset in data:
        dataset['Embarked'] = dataset['Embarked'].fillna(common_value)
    return X_data

In [104]:
x_train = del_col_fill_val(X_train)
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,2,female,29.0,1,0,26.0,S
1,3,male,,0,0,8.05,S
2,2,male,39.0,0,0,26.0,S
3,3,female,29.0,0,4,21.075,S
4,3,male,25.0,0,0,7.05,S


In [105]:
x_train['Embarked'].value_counts()

S    485
C    133
Q     50
Name: Embarked, dtype: int64

In [106]:
# check Age column null values
x_train['Age'].isnull().sum()

132

In [107]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [108]:
from sklearn.compose import ColumnTransformer

In [109]:
transformer = ColumnTransformer(transformers = [
    ('imputer', SimpleImputer(), ['Age']),
    ('ode', OrdinalEncoder(categories = [['S', 'C', 'Q']]), ['Embarked']),
    ('ohe', OneHotEncoder(sparse = False, drop = 'first'), ['Sex'])
], remainder = 'passthrough')


In [110]:
training_data = transformer.fit_transform(x_train)
training_data.shape

(668, 7)

In [125]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier()

In [126]:
clf.fit(training_data, Y_train)

GradientBoostingClassifier()

In [127]:
clf.score(training_data, Y_train)

0.9131736526946108

### load test data

In [128]:
testing = pd.read_csv('test_titanic_x_test.csv')
testing.tail()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
218,3,"Lindqvist, Mr. Eino William",male,20.0,1,0,STON/O 2. 3101285,7.925,,S
219,1,"Butt, Major. Archibald Willingham",male,45.0,0,0,113050,26.55,B38,S
220,1,"Penasco y Castellana, Mrs. Victor de Satode (M...",female,17.0,1,0,PC 17758,108.9,C65,C
221,3,"Holm, Mr. John Fredrik Alexander",male,43.0,0,0,C 7075,6.45,,S
222,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26.0,F2,S


In [129]:
testing_data = del_col_fill_val(testing)
testing_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,2,male,8.0,1,1,36.75,S
1,1,female,49.0,0,0,25.9292,S
2,3,male,,0,0,7.7375,Q
3,2,female,24.0,2,1,27.0,S
4,1,male,36.0,0,0,26.2875,S


In [130]:
test_data = transformer.transform(testing)
test_data.shape

(223, 7)

In [131]:
y_pred = clf.predict(test_data)

In [132]:
y_pred

array([1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 0], dtype=int64)

In [133]:
clf.score(test_data, y_pred)

1.0

In [134]:
# create CSV file
np.savetxt('Prediction_Titanic_dataset.csv', y_pred, delimiter = ',', fmt = '%.0f')