In [144]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
import pandas as pd
import matplotlib.pyplot as plt

In [168]:
data = pd.read_csv("data/data.csv")

In [169]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [170]:
def drop_features(df: pd.DataFrame):
    return df.drop(columns=["PassengerId", "Name", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"])

In [171]:
data = drop_features(data)

In [192]:
data.describe()

Unnamed: 0,Survived,Pclass,Age,Fare
count,891.0,891.0,714.0,891.0
mean,0.383838,2.308642,29.699118,32.204208
std,0.486592,0.836071,14.526497,49.693429
min,0.0,1.0,0.42,0.0
25%,0.0,2.0,20.125,7.9104
50%,0.0,3.0,28.0,14.4542
75%,1.0,3.0,38.0,31.0
max,1.0,3.0,80.0,512.3292


In [191]:
data[["Pclass", "Fare"]].corr()

Unnamed: 0,Pclass,Fare
Pclass,1.0,-0.5495
Fare,-0.5495,1.0


In [173]:
X = data.drop("Survived", axis=1)
y = data["Survived"].to_numpy()

In [174]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y,  # This ensures balanced splits
    shuffle=True,
    random_state=42
)

In [175]:
X_train

Unnamed: 0,Pclass,Sex,Age,Fare
692,3,male,,56.4958
481,2,male,,0.0000
527,1,male,,221.7792
855,3,female,18.0,9.3500
801,2,female,31.0,26.2500
...,...,...,...,...
359,3,female,,7.8792
258,1,female,35.0,512.3292
736,3,female,48.0,34.3750
462,1,male,47.0,38.5000


In [176]:
class FeatureImputer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["Age"] = X["Age"].fillna(X["Age"].mean())
        X["Fare"] = X["Fare"].fillna(X["Fare"].mean())
        return X

In [177]:
class FeatureEncoder(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        encoder = OneHotEncoder()
        matrix = encoder.fit_transform(X[["Sex"]]).toarray()
        column_names = ["Male", "Female"]

        for i in range(len(matrix.T)):
            X[column_names[i]] = matrix.T[i]
        return X

In [178]:
class FeatureDropper(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop("Sex", axis=1)

In [179]:
pipeline = Pipeline([
    ("Feature Imputer", FeatureImputer()),
    ("Feature Encoder", FeatureEncoder()),
    ("Feature Dropper", FeatureDropper()),
    ("Scaler", StandardScaler()),
    ("Classifier", LogisticRegression(solver="liblinear")),
])

In [180]:
param_grid = {
    "Classifier__C": [0.1, 1.0, 10],
    "Classifier__penalty": ["l1", "l2"],
    "Scaler__with_mean": [True, False],
}

cv_pipeline = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

In [181]:
model = cv_pipeline.fit(X_train, y_train)

In [182]:
model.score(X_test, y_test)

0.7932960893854749

In [183]:
test_data = pd.read_csv("data/test.csv")

In [185]:
test_data = drop_features(test_data)

In [186]:
predictions = model.predict(test_data)

In [187]:
final_df = pd.DataFrame(test_data["PassengerId"])
final_df["Survived"] = predictions

KeyError: 'PassengerId'

In [106]:
final_df.to_csv("data/predictions.csv", index=False)

In [None]:
import pickle

with open("model_pipeline.pkl", "wb") as f:
    