In [25]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
! unzip titanic.zip

Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


## Data

In [94]:
# Read the training and test CSV files (relative paths) into DataFrames.
df_train, df_test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [96]:
# Target vector: the 'Survived' column of the training frame.
y_train = df_train['Survived']

# Feature frames: the training frame loses the target plus the three columns
# that are not fed to the model; the test frame loses the same three columns.
X_train = df_train.drop(columns=['Ticket', 'Survived', 'PassengerId', 'Name'])
X_test = df_test.drop(columns=['Ticket', 'Name', 'PassengerId'])

In [97]:
# Hold out 20% of the labelled rows as a validation set.
# random_state pins the shuffle so a rerun reproduces the same split (and the
# same downstream metrics); stratify keeps the Survived class ratio equal in
# both parts.
# NOTE: this rebinds X_train / y_train, so re-running this cell without first
# re-running the cell that builds them splits the already-reduced data again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

In [98]:
# Preview the first five rows of the test feature frame.
X_test.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,34.5,0,0,7.8292,,Q
1,3,female,47.0,1,0,7.0,,S
2,2,male,62.0,0,0,9.6875,,Q
3,3,male,27.0,0,0,8.6625,,S
4,3,female,22.0,1,1,12.2875,,S


In [99]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [105]:
# Column groups consumed by the ColumnTransformer branches.

# One-hot encoded columns (currently none).
cat_ohe_features = list()

# Columns passed through the ordinal-encoding branch.
cat_ord_features = [
    'Sex',
    'Embarked',
    'Pclass',
    'Cabin',
    'Parch',
    'SibSp',
]

# Columns passed through the scaling branch.
numeric_features = [
    'Age',
    'Fare',
]

In [106]:
# Preprocessing branches. Every branch first fills missing values with the
# column's most frequent value, then encodes or scales.

# Named `ord_pipe` rather than `ord` so the built-in ord() is not shadowed for
# the rest of the notebook session.
ord_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories not seen during fit are encoded as -1.
    # NOTE(review): the imputer above fills NaNs before this step, so
    # encoded_missing_value=-2 is presumably never produced -- confirm.
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-2)),
])
ohe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False)),
])
sc = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler()),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', ord_pipe, cat_ord_features),
        ('ohe', ohe, cat_ohe_features),
        ('scaler', sc, numeric_features),
    ]
)

# Full model: preprocessing followed by logistic regression
# (max_iter raised to 500).
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classification', LogisticRegression(max_iter=500)),
])


In [107]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X=X_train, y=y_train)

In [108]:
# Score the fitted pipeline on the held-out validation split (X_val / y_val).
# The label says "Validation" because this is not a score on X_test.
accuracy = pipeline.score(X_val, y_val)
print(f"Validation Accuracy: {accuracy:.2f}")

Test Accuracy: 0.83


In [109]:
# Predict labels for the test feature frame with the fitted pipeline.
y_pred = pipeline.predict(X=X_test)

In [111]:
# Wrap the predictions in a single-column DataFrame named 'Survived'.
y_pred_df = pd.DataFrame({'Survived': y_pred})

In [114]:
# Take the ids from the test file itself instead of a hard-coded numeric range,
# so the column always matches the rows that were actually predicted.
# .to_numpy() assigns by position rather than by index alignment.
y_pred_df['PassengerId'] = df_test['PassengerId'].to_numpy()

In [115]:
# Display the prediction frame (columns: Survived, PassengerId).
y_pred_df

Unnamed: 0,Survived,PassengerId
0,0,892
1,0,893
2,0,894
3,0,895
4,1,896
...,...,...
413,0,1305
414,1,1306
415,0,1307
416,0,1308


In [117]:
# Write the predictions to preds.csv without the DataFrame index column.
y_pred_df.to_csv(path_or_buf='preds.csv', index=False)

In [122]:
from sklearn.metrics import recall_score, precision_score

In [121]:
# Compute the validation predictions here so this cell does not rely on `y_`
# being created by a cell further down the notebook (NameError on a fresh
# top-to-bottom run otherwise).
y_ = pipeline.predict(X_val)

# Recall on the validation split.
recall_score(y_val, y_)

0.7424242424242424

In [120]:
# Predicted labels for the validation split, used by the metric cells.
y_ = pipeline.predict(X=X_val)

In [124]:
# Precision on the validation split.
precision_score(y_true=y_val, y_pred=y_)

0.7903225806451613