In [187]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve

In [188]:
from sklearn.metrics import accuracy_score, precision_score

In [189]:
# Load the preprocessed Titanic train/test tables.
train_data = pd.read_csv("../data/titanic/new_train_data.csv")
test_data = pd.read_csv("../data/titanic/new_test_data.csv")

# Keep the passenger ids aside (needed for the submission file) and remove
# the column from the feature table so it is not used as a predictor.
test_ids = test_data['PassengerId']
test_data = test_data.drop(columns='PassengerId')

In [190]:
# Raw (unprocessed) Kaggle test table.
data = pd.read_csv("../data/titanic/test.csv")

In [191]:
# Target is the 'Survived' column; every other training column is a feature.
y = train_data['Survived']
X = train_data.drop(columns='Survived')

# The test features are used as-is (same object as test_data, not a copy).
X_test = test_data
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [192]:
# Missing-value report for the test features: grand total, then the
# per-column counts restricted to columns that actually contain NaNs.
test_nan_counts = test_data.isna().sum()
print("NaN test_data:", test_nan_counts.sum())
print("Columns:")
print(test_nan_counts[test_nan_counts > 0])

NaN test_data: 1
Columns:
Fare    1
dtype: int64


In [193]:
# Missing-value report for the training table: grand total, then the
# per-column counts restricted to columns that actually contain NaNs.
train_nan_counts = train_data.isna().sum()
print("NaN train_data:", train_nan_counts.sum())
print("Columns:")
print(train_nan_counts[train_nan_counts > 0])

NaN train_data: 0
Columns:
Series([], dtype: int64)


In [194]:
# Impute missing test fares with the median fare of the TRAINING set, so no
# statistic is computed from the test data itself.
fare_median = train_data['Fare'].median()

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object, emits a
# FutureWarning in pandas 2.x and silently leaves the frame unchanged once
# Copy-on-Write is active. X_test is the same object as test_data, so it sees
# the filled column as well.
test_data['Fare'] = test_data['Fare'].fillna(fare_median)

# Sanity check: number of NaNs left in 'Fare' (expected 0).
print(test_data['Fare'].isnull().sum())

0


### Logistic Regression

model

In [195]:
# Baseline logistic regression with default regularization, trained on the
# full training set and applied to the test features.
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg = log_reg.fit(X, y)  # fit() returns the estimator itself
y_pred_log = log_reg.predict(X_test)

In [196]:
# Ten log-spaced candidate values for C, from 1e-3 up to 1e3.
C_values = np.logspace(start=-3, stop=3, num=10)

In [197]:
from sklearn.model_selection import GridSearchCV

In [198]:
# Hyperparameter grid for LogisticRegression.
# C is the inverse regularization strength and must be strictly positive, so
# the grid must not contain 0 (every fit with C=0 is rejected by scikit-learn).
# Candidates are log-spaced because the effect of C is multiplicative.
# Both listed solvers support the 'l1' and 'l2' penalties.
param_grid_log = {
    'C': np.logspace(-3, 3, 10),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

In [199]:
# 5-fold cross-validated grid search over param_grid_log, scored by accuracy
# and run on all available cores.
log_reg_base = LogisticRegression(random_state=42, max_iter=1000)
grid_log = GridSearchCV(
    estimator=log_reg_base,
    param_grid=param_grid_log,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the search on the full training set, then report the winning
# parameter combination and its mean cross-validated accuracy.
grid_log.fit(X, y)

best_log_params = grid_log.best_params_
best_log_score = grid_log.best_score_
print("Best params for LogisticRegression:", best_log_params)
print("Best accuracy: {:.4f}".format(best_log_score))

In [None]:
# Estimator that GridSearchCV refit with the best parameter combination.
best_log = grid_log.best_estimator_

# Test-set predictions of the tuned logistic regression.
y_pred_log_best = best_log.predict(X_test)

Regularization helps prevent overfitting. When a model starts to memorize noise in the training data and fit it, regularization adds a penalty term to the loss function $L$ that depends on the magnitude of the weights.

`C` is inversely proportional to the regularization coefficient: the higher `C`, the weaker the regularization, so the penalty is smaller and the model is freer to fit noise.

In [None]:
# Disabled experiment (not executed): logistic regression with a manually
# chosen C=1e3, kept for reference.
# log_reg = LogisticRegression(max_iter=1000, random_state=42, C=1e3)
# log_reg.fit(X, y)
# y_pred_log = log_reg.predict(X_test)

However, you should not expect good results from logistic regression until the features are normalized, i.e. brought to a comparable scale (for example, standardized to zero mean and unit variance).

metrics

In [None]:
# Disabled metrics (not executed).
# NOTE(review): y holds the training labels, while y_pred_log holds predictions
# for X_test, so these calls would compare labels and predictions that belong
# to different rows. Score on a held-out validation split before re-enabling.
# acc = accuracy_score(y, y_pred_log)
# prec = precision_score(y, y_pred_log)

### RandomForest

model

In [None]:
# Disabled baseline (not executed): random forest with fixed hyperparameters,
# kept for reference.
# rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
# rf.fit(X, y)
# y_pred_rf = rf.predict(X_test)

In [None]:
# Hyperparameter grid for RandomForestClassifier.
# Every candidate must be valid for the estimator: n_estimators >= 1,
# max_depth >= 1 or None (unlimited), min_samples_split >= 2 and
# min_samples_leaf >= 1. A value of 0 is rejected for all four parameters.
# The grid is deliberately small (3 * 4 * 3 * 3 = 108 candidates per fold);
# an exhaustive search multiplies the sizes of all lists, so wide ranges on
# four parameters at once quickly become impossible to evaluate.
param_grid_rf = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 4, 6, 8],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated grid search over param_grid_rf, scored by accuracy
# and run on all available cores.
rf_base = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Run the search on the full training set.
grid_rf.fit(X, y)

# Report the winning parameter combination and its mean CV accuracy.
best_rf_params = grid_rf.best_params_
best_rf_score = grid_rf.best_score_
print("Best params for RandomForest:", best_rf_params)
print("Best accuracy: {:.4f}".format(best_rf_score))

# Estimator refit with the best parameters, and its test-set predictions.
best_rf = grid_rf.best_estimator_
y_pred_rf_best = best_rf.predict(X_test)

metrics

In [None]:
# Disabled metrics (not executed).
# NOTE(review): y holds the training labels, while y_pred_rf (from the disabled
# baseline cell) holds predictions for X_test, so these calls would compare
# labels and predictions that belong to different rows. Score on a held-out
# validation split before re-enabling.
# acc = accuracy_score(y, y_pred_rf)
# prec = precision_score(y, y_pred_rf)

### Predictions

In [None]:
# Write the submission file for the tuned RandomForest.
# The ids are taken from test_ids, i.e. the PassengerId column of the same
# preprocessed table the predictions were computed on, instead of re-reading
# the raw test.csv. This guarantees ids and predictions refer to the same rows
# and avoids a redundant disk read.
passenger_ids = test_ids
assert len(passenger_ids) == len(y_pred_rf_best), \
    "number of passenger ids does not match number of predictions"
submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': y_pred_rf_best
})
submission.to_csv('../data/titanic/submission_rf.csv', index=False)

In [None]:
# Write the submission file for the tuned LogisticRegression.
# The ids are taken from test_ids, i.e. the PassengerId column of the same
# preprocessed table the predictions were computed on, instead of re-reading
# the raw test.csv. This guarantees ids and predictions refer to the same rows
# and avoids a redundant disk read.
passenger_ids = test_ids
assert len(passenger_ids) == len(y_pred_log_best), \
    "number of passenger ids does not match number of predictions"
submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': y_pred_log_best
})
submission.to_csv('../data/titanic/submission_log.csv', index=False)