In [2]:
#import all libraries and classes
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [3]:
#load the raw passenger data from the local CSV file
TITANIC_CSV = r"C:\Users\Williams\Desktop\DataScienceWithAiLead\data\titanic.csv"
titanic = pd.read_csv(TITANIC_CSV)

#work on a copy so that the raw frame stays available unchanged
titanic_new = titanic.copy()

In [4]:
#data cleaning:

def fill_nulls_with_random_observed(column, random_state=0):
    """Fill the nulls of `column` with one value drawn at random from its non-null entries.

    The drawn value keeps its original type (no round trip through a string),
    so a numeric column stays numeric after filling.

    Parameters
    ----------
    column : pd.Series
        The column whose nulls should be filled.
    random_state : int, default 0
        Seed passed to `Series.sample` so that re-running the notebook gives the same fill value.

    Returns
    -------
    pd.Series
        A new series with the nulls replaced; `column` itself is returned
        unchanged when it has no non-null value to draw from.
    """
    observed = column.dropna()
    if observed.empty:
        #nothing to sample from, so leave the column as it is instead of raising
        return column
    drawn = observed.sample(n=1, random_state=random_state).iloc[0]
    return column.fillna(drawn)

#keep rows with at least 70% of columns filled
#(thresh is a count of non-null cells, so round the 70% mark up to a whole number)
min_filled = int(np.ceil(len(titanic_new.columns) * 0.7))
titanic_new = titanic_new.dropna(thresh=min_filled)

#fill the nulls of "fare" column with the most occurring value
titanic_new["fare"] = titanic_new["fare"].fillna(value=titanic_new["fare"].mode()[0])

#forward fill the nulls of "embarked" column
#(.ffill() replaces the deprecated fillna(method="ffill"))
titanic_new["embarked"] = titanic_new["embarked"].ffill()

#fill the nulls of the rest of the columns with a randomly drawn observed value (one value per column)
for column in ["age", "cabin", "boat", "body", "home.dest"]:
    titanic_new[column] = fill_nulls_with_random_observed(titanic_new[column])

#make sure "age" is numeric
titanic_new["age"] = pd.to_numeric(titanic_new["age"])

In [6]:
#one-hot encode "sex" and "embarked"; drop_first=True leaves out the first level of each
sex = pd.get_dummies(data=titanic_new["sex"], drop_first=True)
embarked = pd.get_dummies(data=titanic_new["embarked"], drop_first=True)

#append the indicator columns to the right of the existing columns
titanic_new = pd.concat(objs=[titanic_new, sex, embarked], axis="columns")

In [7]:
#select features and target

#convert the numeric features to floats BEFORE building X:
#drop() returns a new frame, so conversions applied to titanic_new afterwards would never reach X
numeric_features = ["pclass", "age", "sibsp", "parch", "fare", "body"]
for feature in numeric_features:
    titanic_new[feature] = pd.to_numeric(titanic_new[feature], downcast="float")

#NOTE(review): "body" is kept as a feature although its nulls were filled during cleaning - confirm it is a meaningful predictor
#features: everything except the target and the text / already-encoded columns
X = titanic_new.drop(["survived", "name", "sex", "ticket", "cabin", "embarked", "boat", "home.dest"], axis=1)

#target: the "survived" column
y = titanic_new["survived"]

In [8]:
#split the data in two stages, both seeded for reproducibility

#stage 1: keep 90% for training/testing, hold the rest out for validation
holdout_split = train_test_split(X, y, train_size=0.9, random_state=0)
X_train_test, X_validate, y_train_test, y_validate = holdout_split

#stage 2: split the 90% portion into train and test using train_test_split's default proportions
inner_split = train_test_split(X_train_test, y_train_test, random_state=0)
X_train, X_test, y_train, y_test = inner_split

In [9]:
#scale the data: the scaler is fitted on the training set only
scaler = MinMaxScaler()
scaler.fit(X_train)

#apply the same fitted scaling to all three sets
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_validate_scaled = scaler.transform(X_validate)

In [10]:
#build the model: a logistic regression with default settings, fitted on the scaled training set
logreg1 = LogisticRegression()
logreg1.fit(X=X_train_scaled, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
#check the performance: print the model's score on each of the three sets
evaluation_sets = [
    ("Train Accuracy: \t{:.4f}", X_train_scaled, y_train),
    ("Test Accuracy: \t\t{:.4f}", X_test_scaled, y_test),
    ("Validation Accuracy: \t{:.4f}", X_validate_scaled, y_validate),
]
for template, features, labels in evaluation_sets:
    print(template.format(logreg1.score(features, labels)))

Train Accuracy: 	0.7786
Test Accuracy: 		0.8038
Validation Accuracy: 	0.7414


In [12]:
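#train accuracy (0.7786) is not higher than test accuracy (0.8038), so there is no sign of overfitting; validation accuracy (0.7414) is lower, though, and worth a closer look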