In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from utilities import set_multiple_columns_datatype
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression

In [None]:
# Load the training and test splits from the local data/ directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

In [None]:
# Columns to be treated as categoricals; the same mapping is reused later
# for the test frame so both frames are typed identically.
columns = dict.fromkeys(['Pclass', 'Embarked', 'Sex'], 'category')
train = set_multiple_columns_datatype(train, columns)

In [None]:
# Inspect the training frame: column dtypes and non-null counts, to confirm
# the dtype conversion above and to see which columns have missing values.
train.info()

In [None]:
# Summary statistics of the training frame (pandas' default column selection).
train.describe()

In [None]:
# Discard the columns that are not used as features ('Cabin', 'Name',
# 'Ticket'), then one-hot encode whatever categorical/object columns remain.
train_dummies = (
    train
    .drop(columns=['Cabin', 'Name', 'Ticket'])
    .pipe(pd.get_dummies)
)

In [None]:
# Target vector and feature matrix. 'PassengerId' is excluded from the
# features along with the target column itself.
y = train['Survived']
X = train_dummies.drop(columns=['Survived', 'PassengerId'])

# Hold out 20% of the labelled rows for evaluation; the seed is fixed so the
# split is reproducible across runs.
(
    X_train_dummies,
    X_test_dummies,
    y_train_dummies,
    y_test_dummies,
) = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Candidate classifiers to compare on the binary 'Survived' target.
#
# LinearRegression is intentionally NOT part of this comparison: it is a
# regressor whose predict() returns continuous values, so the classification
# metrics used below (accuracy / precision / recall / F1) cannot score it and
# raise a ValueError about mixed binary and continuous targets.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(max_depth=6, min_samples_split=5, n_estimators=335, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(),
}

# Give every estimator the same preprocessing chain: mean-impute missing
# values, standardise the features, then fit the classifier. A fresh
# imputer/scaler is created per pipeline so no fitted state is shared.
pipelines = {
    name: Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', model),
    ])
    for name, model in models.items()
}

In [None]:
# 5-fold cross-validated accuracy of every pipeline on the training split.
# The per-fold score arrays are kept in `results`, keyed by model name.
results = {}
for model_name, model_pipeline in pipelines.items():
    fold_scores = cross_val_score(
        model_pipeline,
        X_train_dummies,
        y_train_dummies,
        cv=5,
        scoring='accuracy',
    )
    results[model_name] = fold_scores
    print(f'{model_name}: {fold_scores.mean():.2f} ± {fold_scores.std():.2f}')


In [None]:
# Fit each pipeline on the training split and report its metrics on the
# held-out split. The pipelines stay fitted afterwards, which the prediction
# cell further down relies on.
for name, pipeline in pipelines.items():
    # Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
    y_pred = pipeline.fit(X_train_dummies, y_train_dummies).predict(X_test_dummies)

    # Evaluate the four metrics in a fixed order against the held-out labels.
    accuracy, precision, recall, f1 = (
        metric(y_test_dummies, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    print(f'\n{name} Performance on Test Set:')
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 Score: {f1:.2f}')


In [None]:
# Prepare the unlabelled test set the same way as the training data: apply the
# categorical dtypes, drop the unused columns, and one-hot encode.
test_new = set_multiple_columns_datatype(test, columns)
X_true_test = pd.get_dummies(test_new.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1))

# Align the encoded test frame with the training feature matrix `X`:
#   * a dummy column seen only in training is added and filled with 0,
#   * a column present only here (an unseen category, or the 'Survived'
#     column that a later cell writes onto `test`) is dropped,
#   * the column order is made identical to training.
# Without this the fitted pipeline can reject the frame, and re-running this
# cell after predictions were attached to `test` would fail.
X_true_test = X_true_test.reindex(columns=X.columns, fill_value=0)

# Predict with the Random Forest pipeline fitted in the evaluation cell above.
y_true_pred = pipelines['Random Forest'].predict(X_true_test)
y_true_pred

In [None]:
# Attach the predictions to the test frame in place: this adds (or, on a
# re-run, overwrites) the 'Survived' column on `test`. The next cell reads
# the column back from `test` when building the output file.
# NOTE(review): `test` is mutated here, so any frame derived from `test`
# after this cell has run will also contain 'Survived'.
test['Survived'] = y_true_pred
# Display the frame with the new column.
test


In [None]:
# Keep only the prediction and the passenger identifier (in that column
# order) and write them out without the index.
# NOTE(review): this rebinds `results`, replacing the dict of
# cross-validation scores created earlier in the notebook.
results = test[['Survived', 'PassengerId']].copy()
results.to_csv('data/results.csv', index=False)

In [None]:
# Read the written file back and list the distinct passenger ids as a quick
# sanity check of what was saved.
sol = pd.read_csv('data/results.csv')
pd.unique(sol['PassengerId'])