In [1]:
import pandas as pd
import numpy as np


In [2]:
train_df = pd.read_csv(
    "/Users/rahulanil/garchomp/projects/kaggle/titanic/data/train.csv"
)
test_df = pd.read_csv("/Users/rahulanil/garchomp/projects/kaggle/titanic/data/test.csv")


In [3]:
import titanic_preprocessing as tp

test_df_passengeId = test_df["PassengerId"]
train_df, test_df = tp.generic_perprocessing(train_df, test_df)


In [4]:
X = train_df.loc[:, train_df.columns != "Survived"]
y = train_df["Survived"]


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Survived = y
# OneHotEncoding: Pclass, Sex, Embarked, Initials
# StandardScalar: Age, SibSp, Parch, Fare, Family_size

ct = ColumnTransformer(
    [
        (
            "onehot",
            OneHotEncoder(sparse=False),
            ["Pclass", "Sex", "Embarked", "Initials"],
        ),
        (
            "StandardScaler",
            StandardScaler(),
            ["Age", "SibSp", "Parch", "Fare", "Family_size"],
        ),
    ]
)

ct.fit(X)
X_ct = ct.transform(X)
print(f"X_ct shpae: {X_ct.shape}")

test = ct.transform(test_df)
print(f"X_test shpae: {test.shape}")


X_ct shpae: (889, 21)
X_test shpae: (418, 21)


In [6]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(penalty="none", C=0.01, max_iter=10000)
clf2 = LogisticRegression(penalty="l2", C=0.01, max_iter=10000)
clf.fit(X_ct, y)
print(f"clf score: {clf.score(X_ct, y)}")


clf score: 0.8312710911136107




In [7]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        "penalty": ["l2", "none"],
        "C": [0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10],
        "max_iter": [10000],
        "n_jobs": [-1],
    },
    {
        "penalty": ["l1"],
        "C": [0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10],
        "max_iter": [10000],
        "n_jobs": [-1],
        "solver": ["saga"],
    },
    {
        "penalty": ["elasticnet"],
        "C": [0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 10],
        "max_iter": [10000],
        "n_jobs": [-1],
        "solver": ["saga"],
        "l1_ratio": [0.2, 0.4, 0.6, 0.8],
    },
]

grid_search = GridSearchCV(
    LogisticRegression(), param_grid, cv=5, return_train_score=True
)


In [8]:
from sklearn.model_selection import train_test_split
import warnings

X_train, X_test, y_train, y_test = train_test_split(X_ct, y, random_state=0)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_search.fit(X_train, y_train)

print(f"test scores: {grid_search.score(X_test, y_test)}")
print(f"Best parametesrs: {grid_search.best_params_}")
print(f"best cross validation score: {grid_search.best_score_}")


test scores: 0.7713004484304933
Best parametesrs: {'C': 1, 'max_iter': 10000, 'n_jobs': -1, 'penalty': 'l2'}
best cross validation score: 0.8468970934799686


In [9]:
grid_search_results = pd.DataFrame(grid_search.cv_results_)
display(grid_search_results)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_max_iter,param_n_jobs,param_penalty,param_solver,param_l1_ratio,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.942032,1.406792,0.001515,0.002010,0.001,10000,-1,l2,,,...,0.627629,0.005837,56,0.627820,0.628518,0.634146,0.637899,0.628518,0.631380,0.003980
1,0.186262,0.137828,0.000425,0.000030,0.001,10000,-1,none,,,...,0.840893,0.032421,37,0.859023,0.844278,0.849906,0.870544,0.844278,0.853606,0.010040
2,0.004507,0.000304,0.000394,0.000039,0.01,10000,-1,l2,,,...,0.821344,0.029692,52,0.825188,0.827392,0.825516,0.836773,0.810507,0.825075,0.008422
3,0.013684,0.002271,0.000342,0.000028,0.01,10000,-1,none,,,...,0.840893,0.032421,37,0.859023,0.844278,0.849906,0.870544,0.844278,0.853606,0.010040
4,0.061972,0.114324,0.000338,0.000058,0.1,10000,-1,l2,,,...,0.836348,0.035332,46,0.832707,0.831144,0.846154,0.857411,0.825516,0.838586,0.011597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,0.133081,0.010358,0.000366,0.000029,5,10000,-1,elasticnet,saga,0.8,...,0.843890,0.034169,25,0.859023,0.844278,0.851782,0.870544,0.844278,0.853981,0.009929
59,0.111952,0.039497,0.000356,0.000010,10,10000,-1,elasticnet,saga,0.2,...,0.845393,0.032179,5,0.859023,0.844278,0.849906,0.870544,0.844278,0.853606,0.010040
60,0.120149,0.037937,0.000426,0.000152,10,10000,-1,elasticnet,saga,0.4,...,0.845393,0.032179,5,0.859023,0.844278,0.849906,0.870544,0.844278,0.853606,0.010040
61,0.141615,0.035870,0.000372,0.000038,10,10000,-1,elasticnet,saga,0.6,...,0.845393,0.032179,5,0.859023,0.844278,0.849906,0.870544,0.846154,0.853981,0.009714


In [13]:
grid_search_params = grid_search.best_params_
print(grid_search_params)

{'C': 1, 'max_iter': 10000, 'n_jobs': -1, 'penalty': 'l2'}


In [14]:
clf = LogisticRegression()
clf.set_params(**grid_search_params)
clf.fit(X_ct, y)
y_test = pd.Series(clf.predict(test), name="Survived").astype(int)
result = pd.concat([test_df_passengeId, y_test], axis=1)
result.to_csv("LogReg.csv", index=False)
display(result)

ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got {'C': 1, 'max_iter': 10000, 'n_jobs': -1, 'penalty': 'l2'}.