In [1]:
import functions as f
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

## Import data and divide train/test groups

In [6]:
df, X, y = f.import_train_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.20)

## Scale data

In [7]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Logistic Regression

In [8]:
logreg = LogisticRegression()

In [9]:
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
logreg.score(X_test, y_test)

0.7972027972027972

In [11]:
logreg_scaled = LogisticRegression()
logreg_scaled.fit(X_train_scaled, y_train)

In [12]:
logreg_scaled.score(X_test_scaled, y_test)

0.7972027972027972

## Test

In [13]:
test = f.import_clean_test()

In [10]:
f.generate_results(logreg, test, "logistic_regression.csv")

In [12]:
f.generate_scaled_results(logreg_scaled, scaler, test, "logistic_regression_scaled.csv")

## Find, train, and use better Logistic Regression model

In [14]:
grid = [
    {'penalty':['l1','l2','elasticnet','none'],
    'C' : np.logspace(-4,4,20),
    'solver': ['lbfgs','newton-cg','liblinear','sag','saga']
}
]

In [15]:
logreg_better = LogisticRegression()

In [16]:
model = GridSearchCV(estimator = logreg_better, param_grid = grid, cv=5)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [20]:
logreg_better = LogisticRegression(C=0.23357214690901212, solver='newton-cg')

In [21]:
logreg_better.fit(X_train, y_train)

In [22]:
logreg_better.score(X_test, y_test)

0.8111888111888111

In [23]:
f.generate_results(logreg, test, "logistic_regression_better.csv")

## Use model with training data including cabin info

In [2]:
df, X, y = f.import_train_data_withcabins()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.20)

In [3]:
grid = [
    {'penalty':['l1','l2','elasticnet','none'],
    'C' : np.logspace(-4,4,20),
    'solver': ['lbfgs','newton-cg','liblinear','sag','saga']
}
]

In [4]:
logreg_check = LogisticRegression()

In [6]:
model = GridSearchCV(estimator = logreg_check, param_grid = grid, cv=5)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [7]:
logreg_better_withcabins = LogisticRegression(C=0.23357214690901212, solver='newton-cg')

In [8]:
logreg_better_withcabins.fit(X_train, y_train)

In [9]:
logreg_better_withcabins.score(X_test, y_test)

0.8111888111888111

In [10]:
test_withcabins = f.import_clean_test_withcabins()

In [11]:
f.generate_results(logreg_better_withcabins, test_withcabins, 'logistic_regression_better_result_withcabins.csv')