In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
%matplotlib inline

In [13]:
# Read the preprocessed Titanic training set from disk
df_train = pd.read_csv('titanic_train_preprocessed4.csv')

# Separate the target column (y) from the predictor columns (X)
y = df_train['Survived']  # label to be predicted
X = df_train.drop(columns='Survived')  # every remaining column is a feature

# Hold out 25% of the rows for evaluation; the other 75% is used for fitting.
# The fixed random_state keeps the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [14]:
# Build the logistic regression classifier (at most 1000 solver iterations, fixed seed)
lr = LogisticRegression(random_state=42, max_iter=1000)

# Train it on the training split, passed as a bare NumPy array (no column names)
lr.fit(X_train.to_numpy(), y_train)

In [15]:
# Predict labels for the held-out test split (bare array, matching how the model was fitted)
y_pred = lr.predict(X_test.to_numpy())

In [16]:
# Score the held-out predictions against the true labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8071748878923767


In [17]:
# The hold-out score is acceptable, so refit the same estimator on every labelled row
lr.fit(X.to_numpy(), y)

In [18]:
# Load the preprocessed test features that the final predictions are made on.
#
# Infinite entries are first mapped to NaN so that every unusable value can be
# imputed in one step. Missing values are then filled with the per-column
# median of the training features (X) instead of a sentinel such as 999: the
# feature values are on a small, standardized scale, so a sentinel that large
# acts as an extreme outlier and would dominate the linear model's decision
# for any affected row.
#
# fillna() with a Series matches on column name; a column of X_pred that does
# not exist in X is left unchanged.
#
# The frame is built in a single chained expression (no inplace mutation), so
# re-running the cell always yields the same result.
X_pred = (
    pd.read_csv('titanic_test_preprocessed4.csv')
    .replace([np.inf, -np.inf], np.nan)
    .fillna(X.median())
)
X_pred.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Deck,Relatives,Age_Class,FarePerPerson,Female,Male,...,EmbarkC,EmbarkQ,EmbarkS,adult,baby,child,early adult,late adult,senior,teenager
0,0.334993,-0.49947,-0.400248,-0.498407,-0.476399,-0.553443,1.369105,-0.393918,-0.755929,0.755929,...,-0.568142,2.843757,-1.350676,2.05548,-0.17192,-0.156556,-0.705838,-0.329778,-0.54971,-0.320784
1,1.32553,0.616992,-0.400248,-0.513274,-0.476399,0.105643,2.665836,-0.515658,1.322876,-1.322876,...,-0.568142,-0.351647,0.74037,-0.486504,-0.17192,-0.156556,-0.705838,3.032346,-0.54971,-0.320784
2,2.514175,-0.49947,-0.400248,-0.465088,-0.476399,-0.553443,2.077984,-0.341661,-0.755929,0.755929,...,-0.568142,2.843757,-1.350676,-0.486504,-0.17192,-0.156556,-0.705838,-0.329778,1.819142,-0.320784
3,-0.25933,-0.49947,-0.400248,-0.483466,-0.476399,-0.553443,0.591066,-0.370485,-0.755929,0.755929,...,-0.568142,-0.351647,0.74037,-0.486504,-0.17192,-0.156556,1.416755,-0.329778,-0.54971,-0.320784
4,-0.655545,0.616992,0.619896,-0.418471,-0.476399,0.764728,0.072374,-0.498903,1.322876,-1.322876,...,-0.568142,-0.351647,0.74037,-0.486504,-0.17192,-0.156556,1.416755,-0.329778,-0.54971,-0.320784


In [19]:
# Hand the model a plain ndarray (the form it was fitted on); nan_to_num also
# replaces any remaining NaN/inf entries with finite numbers
X_pred_convert = np.nan_to_num(X_pred.to_numpy())

In [20]:
# Predict labels for the unlabelled test features with the model refitted on all training rows
y_pred = lr.predict(X=X_pred_convert)
y_pred

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [21]:
# Wrap the predicted labels in a DataFrame with an explicitly named integer
# column (instead of the default column key 0 that then has to be renamed)
predicted_data = pd.DataFrame({'Survived': np.asarray(y_pred, dtype=int)})

# Load the raw test set; only its PassengerId column goes into the submission
df_test = pd.read_csv('titanic_test_data.csv')

# Fail loudly if the predictions do not line up one-to-one with the passengers.
# A label-aligned concat would otherwise pad the shorter side with NaN and
# silently turn 'Survived' into a float column.
if len(df_test) != len(predicted_data):
    raise ValueError(
        f"Row count mismatch: {len(df_test)} passengers vs "
        f"{len(predicted_data)} predictions"
    )

# Attach the predictions positionally (via a bare array) so the result does not
# depend on the two frames sharing the same index labels, and keep only the
# two columns requested for the submission
df_test = df_test[['PassengerId']].assign(
    Survived=predicted_data['Survived'].to_numpy()
)
df_test  # check if correct

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [22]:
# Write the submission file; the row index is omitted from the CSV
df_test.to_csv(path_or_buf='submission.csv', index=False)