In [88]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from utilities import set_multiple_columns_datatype
from sklearn.svm import SVC

In [89]:
#Import data
test = pd.read_csv('data/test.csv')
train = pd.read_csv('data/train.csv')

In [90]:
columns = {"Pclass":'category', 'Embarked':'category', "Sex":'category'}
train = set_multiple_columns_datatype(train, columns)

In [91]:
#Inspect
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    int64   
 2   Pclass       891 non-null    category
 3   Name         891 non-null    object  
 4   Sex          891 non-null    category
 5   Age          714 non-null    float64 
 6   SibSp        891 non-null    int64   
 7   Parch        891 non-null    int64   
 8   Ticket       891 non-null    object  
 9   Fare         891 non-null    float64 
 10  Cabin        204 non-null    object  
 11  Embarked     889 non-null    category
dtypes: category(3), float64(2), int64(4), object(3)
memory usage: 65.8+ KB


In [92]:
train.describe()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,38.0,1.0,0.0,31.0
max,891.0,1.0,80.0,8.0,6.0,512.3292


In [93]:
train_dummies = pd.get_dummies(train.drop(['Cabin', 'Name', 'Ticket'], axis=1))

In [94]:
X = train_dummies.drop(['Survived'], axis=1)
y = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [95]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC()
}

pipelines = {}
for name, model in models.items():
    pipelines[name] = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])

In [96]:
results = {}
for name, pipeline in pipelines.items():
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    results[name] = scores
    print(f'{name}: {scores.mean():.2f} ± {scores.std():.2f}')


Logistic Regression: 0.80 ± 0.03
Random Forest: 0.80 ± 0.01
Gradient Boosting: 0.82 ± 0.02
SVM: 0.82 ± 0.03


In [97]:
for name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f'\n{name} Performance on Test Set:')
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')



Logistic Regression Performance on Test Set:
Accuracy: 0.79
Precision: 0.77
Recall: 0.72
F1 Score: 0.74

Random Forest Performance on Test Set:
Accuracy: 0.84
Precision: 0.84
Recall: 0.76
F1 Score: 0.79

Gradient Boosting Performance on Test Set:
Accuracy: 0.82
Precision: 0.82
Recall: 0.72
F1 Score: 0.76

SVM Performance on Test Set:
Accuracy: 0.81
Precision: 0.84
Recall: 0.66
F1 Score: 0.74
