In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training set from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# Distinct values of SibSp present in the data. Only the last expression
# of a cell is rendered, so this cell holds just the one statement whose
# result is meant to be displayed.
df["SibSp"].unique()

array([1, 0, 3, 4, 2, 5, 8])

In [None]:
# Percentage of missing entries per column (mean of the null mask, scaled to 0-100).
df.isna().mean().mul(100)

In [5]:
# Columns routed to the numeric preprocessing branch.
num_features = [
    "Age",
    "Fare",
]
# Columns routed to the categorical (one-hot) preprocessing branch.
cat_features = [
    "Pclass",
    "Sex",
    "SibSp",
    "Parch",
    "Embarked",
]

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [7]:
# Numeric branch: fill missing values with the column mean, then apply StandardScaler.
num_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [8]:
# Route each group of columns through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    [
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ]
)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [10]:
# Feature frame: every column except the ones listed here (the target included).
X = df.drop(
    ["PassengerId", "Name", "Ticket", "Cabin", "Survived"],
    axis="columns",
)
# Target series.
y = df.loc[:, "Survived"]

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Full model: column preprocessing followed by a logistic-regression classifier.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

In [13]:
# Fit preprocessing and classifier together on the training split.
# The DataFrame is passed directly: the ColumnTransformer selects its
# columns by name, so a bare NumPy array would not work as input here.
pipeline.fit(X_train, y_train)

In [14]:
# Class predictions of the fitted pipeline on the held-out split.
y_pred = pipeline.predict(X=X_test)

In [15]:
# 10-fold cross-validation with shuffled folds and a fixed seed, scored on
# the full X / y; the pipeline's default scorer is used.
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
cross_val_scores = cross_val_score(estimator=pipeline, X=X, y=y, cv=kfold)

In [16]:
# Report the per-fold scores and their mean, one per line.
print(
    f'Cross-Validation Accuracy Scores: {cross_val_scores}',
    f'Mean Cross-Validation Accuracy: {cross_val_scores.mean()}',
    sep="\n",
)

Cross-Validation Accuracy Scores: [0.84444444 0.76404494 0.82022472 0.7752809  0.84269663 0.85393258
 0.78651685 0.75280899 0.75280899 0.88764045]
Mean Cross-Validation Accuracy: 0.8080399500624219


In [21]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [22]:
# Predict on the held-out split, then compute the three metrics in a fixed
# order: precision, recall, F1.
y_pred = pipeline.predict(X_test)
precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (precision_score, recall_score, f1_score)
)

In [23]:
# Report the held-out metrics, one per line.
print(
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
    sep="\n",
)

Precision: 0.7761194029850746
Recall: 0.7027027027027027
F1 Score: 0.7375886524822697
