In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
# The generator flattens the (directory, files) pairs produced by os.walk so
# the paths are emitted in the same order as a nested loop would produce them.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import pandas as pd


# --- Load the raw train / test splits ---------------------------------------
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv')

# PassengerIds of the test rows are needed later to build the submission file.
test_ids = df_test['PassengerId']

# Stack the train features (label removed) on top of the test rows so the same
# feature engineering is applied to both in one pass.
merged = pd.concat([df_train.drop('Survived', axis=1), df_test], ignore_index=True, sort=False)

# Family size: siblings/spouses + parents/children + the passenger themself.
merged['FamCount'] = merged['SibSp'] + merged['Parch'] + 1

# Imputation statistics are computed from the TRAINING rows only. Deriving
# them from `merged` would let test-set values influence the training
# features (data leakage).
fill_values = {
    'Fare': df_train['Fare'].median(),
    'Embarked': df_train['Embarked'].mode()[0],
    'Age': df_train['Age'].median(),
}
for column, value in fill_values.items():
    merged[column] = merged[column].fillna(value)

# These columns are not used as model inputs.
merged = merged.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# --- Split back into the labelled and the to-be-predicted rows --------------
n_train = len(df_train)
X_data = merged.iloc[:n_train].copy()
X_submit = merged.iloc[n_train:].copy()
y_target = df_train['Survived']

X_data = X_data.drop(['PassengerId'], axis=1)
X_submit_final = X_submit.drop(['PassengerId'], axis=1)

# Sanity checks: row counts line up and the imputed columns are complete.
assert len(X_data) == len(y_target)
assert len(X_submit_final) == len(test_ids)
assert not merged[list(fill_values)].isna().any().any()


# Column groups routed to the numeric and categorical branches of the
# ColumnTransformer. Columns not listed here are dropped by the transformer
# (ColumnTransformer's default remainder behaviour).
num_cols = ['Age', 'Fare', 'FamCount']
cat_cols = ['Sex', 'Embarked', 'Pclass']

# Numeric branch: median-impute any remaining gaps, then standardise.
num_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

# Categorical branch: impute with the most frequent level before one-hot
# encoding, mirroring the numeric branch so the pipeline does not rely on
# missing values having been filled upstream. Levels not seen during fit are
# encoded as all zeros (handle_unknown='ignore').
cat_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

processor = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])

# Fixed random_state keeps the forest reproducible across runs.
rf_clf = RandomForestClassifier(
    n_estimators=150,
    max_depth=6,
    random_state=42
)

# Full model: preprocessing followed by the classifier, so the transformers
# are fitted only on whatever data is passed to ml_pipe.fit().
ml_pipe = Pipeline([
    ('prep', processor),
    ('rf', rf_clf)
])


# --- Hold-out evaluation ----------------------------------------------------
# Scores computed on the rows a model was trained on overstate how well it
# generalises, so first measure performance on a stratified 20% hold-out that
# the pipeline never sees during fitting.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_data, y_target, test_size=0.2, stratify=y_target, random_state=42
)
ml_pipe.fit(X_fit, y_fit)
val_preds = ml_pipe.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, val_preds))
print("Validation F1 Score:", f1_score(y_val, val_preds))

# --- Final fit --------------------------------------------------------------
# Refit on every labelled row; Pipeline.fit re-estimates all steps, so this
# replaces the hold-out fit and is the model used for the submission.
ml_pipe.fit(X_data, y_target)

# Training-set metrics are kept as a reference only (optimistic by nature).
train_preds = ml_pipe.predict(X_data)
print("Training Accuracy:", accuracy_score(y_target, train_preds))
print("Confusion Matrix:\n", confusion_matrix(y_target, train_preds))
print("F1 Score:", f1_score(y_target, train_preds))


# Predict the held-back test rows with the fitted pipeline.
pred_submit = ml_pipe.predict(X_submit_final)

# Pair each test PassengerId with its predicted label and write the result
# to the working directory without the DataFrame index.
submission_columns = {'PassengerId': test_ids, 'Survived': pred_submit}
submission_alt = pd.DataFrame(data=submission_columns)
submission_alt.to_csv('submission_ml_alt.csv', index=False)

# Confirmation banner (same two lines of stdout as separate print calls).
print(
    "\n--- Submission File ---",
    "'submission_ml_alt.csv' was successfully saved!",
    sep="\n",
)


Training Accuracy: 0.8731762065095399
Confusion Matrix:
 [[526  23]
 [ 90 252]]
F1 Score: 0.8168557536466775

--- Submission File ---
'submission_ml_alt.csv' was successfully saved!
