In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import arff
from tqdm import tqdm
from IPython import display
import time

# Load data

In [94]:
data, meta = arff.loadarff(open("datasets/hill_valley.arff"))

In [128]:
df = pd.DataFrame(data)
df = df.applymap(lambda x: int(x.decode('utf-8')) if isinstance(x, bytes) else x)
print(df.shape)

(1212, 101)


In [129]:
df.head(3)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V92,V93,V94,V95,V96,V97,V98,V99,V100,Class
0,39.02,36.49,38.2,38.85,39.38,39.74,37.02,39.53,38.81,38.79,...,36.62,36.92,38.8,38.52,38.07,36.73,39.46,37.5,39.1,0
1,1.83,1.71,1.77,1.77,1.68,1.78,1.8,1.7,1.75,1.78,...,1.8,1.79,1.77,1.74,1.74,1.8,1.78,1.75,1.69,1
2,68177.69,66138.42,72981.88,74304.33,67549.66,69367.34,69169.41,73268.61,74465.84,72503.37,...,73438.88,71053.35,71112.62,74916.48,72571.58,66348.97,71063.72,67404.27,74920.24,1


# Preprocessing

### Scale data

In [130]:
from sklearn.preprocessing import StandardScaler

In [131]:
scaler = StandardScaler()
scaled = scaler.fit_transform(df.drop(columns=['Class']))

In [132]:
df[list(df.columns)[:-1]] = scaled
df.head(3)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V92,V93,V94,V95,V96,V97,V98,V99,V100,Class
0,-0.452487,-0.453618,-0.451009,-0.452502,-0.45343,-0.453874,-0.452309,-0.451419,-0.451509,-0.451673,...,-0.454998,-0.455696,-0.453324,-0.454918,-0.453962,-0.454009,-0.456096,-0.451643,-0.455455,0
1,-0.454557,-0.455564,-0.453024,-0.454564,-0.455543,-0.455992,-0.454263,-0.453516,-0.453571,-0.453742,...,-0.456958,-0.457675,-0.455391,-0.456983,-0.45598,-0.455955,-0.458218,-0.453623,-0.457554,1
2,3.339835,3.244667,3.583381,3.676918,3.330907,3.414915,3.383665,3.607587,3.689084,3.598118,...,3.676649,3.545037,3.514595,3.749207,3.573661,3.240539,3.542787,3.279074,3.746168,1


# Logistic Regression

In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [204]:
X = df.drop(columns=['Class']).values.astype(np.float64)
Y = df['Class'].values.astype(np.float64)

In [102]:
param_grid = [{'solver': ['lbfgs'], 'penalty': ['l2'], 'C': np.logspace(-4, 4, 40), 'max_iter': [10_000]},
              {'solver': ['lbfgs'], 'penalty': [None], 'max_iter': [10_000]},
              {'solver': ['saga'], 'penalty': ['l1', 'elasticnet'], 'C': np.logspace(-4, 4, 40), 'max_iter': [10_000]}]

In [103]:
clf = GridSearchCV(LogisticRegression(), param_grid, n_jobs=1, verbose=10)
clf.fit(X, Y)

Fitting 5 folds for each of 121 candidates, totalling 605 fits
[CV 1/5; 1/121] START C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs........
[CV 1/5; 1/121] END C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs;, score=0.506 total time=   0.0s
[CV 2/5; 1/121] START C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs........
[CV 2/5; 1/121] END C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs;, score=0.506 total time=   0.0s
[CV 3/5; 1/121] START C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs........
[CV 3/5; 1/121] END C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs;, score=0.521 total time=   0.0s
[CV 4/5; 1/121] START C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs........
[CV 4/5; 1/121] END C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs;, score=0.533 total time=   0.0s
[CV 5/5; 1/121] START C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs........
[CV 5/5; 1/121] END C=0.0001, max_iter=10000, penalty=l2, solver=lbfgs;, score=0.492 total time=   0.0s
[CV 1/5; 2/1

[CV 4/5; 3/121] END C=0.00025719138090593444, max_iter=10000, penalty=l2, solver=lbfgs;, score=0.533 total time=   0.0s
[CV 5/5; 3/121] START C=0.00025719138090593444, max_iter=10000, penalty=l2, solver=lbfgs
[CV 5/5; 3/121] END C=0.00025719138090593444, max_iter=10000, penalty=l2, solver=lbfgs;, score=0.492 total time=   0.0s
[CV 1/5; 4/121] START C=0.0004124626382901352, max_iter=10000, penalty=l2, solver=lbfgs
[CV 1/5; 4/121] END C=0.0004124626382901352, max_iter=10000, penalty=l2, solver=lbfgs;, score=0.506 total time=   0.0s
[CV 2/5; 4/121] START C=0.0004124626382901352, max_iter=10000, penalty=l2, solver=lbfgs
[CV 2/5; 4/121] END C=0.0004124626382901352, max_iter=10000, penalty=l2, solver=lbfgs;, score=0.510 total time=   0.0s
[CV 3/5; 4/121] START C=0.0004124626382901352, max_iter=10000, penalty=l2, solver=lbfgs
[CV 3/5; 4/121] END C=0.0004124626382901352, max_iter=10000, penalty=l2, solver=lbfgs;, score=0.521 total time=   0.0s
[CV 4/5; 4/121] START C=0.0004124626382901352, max

200 fits failed out of a total of 605.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\trist\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\trist\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\trist\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1178, in fit
    raise ValueError("l1_ratio must be specified when penalty is elasticnet.")
ValueError: l1_ratio must be specified

#### Logistic Regression - CV results

In [140]:
from sklearn.metrics import log_loss

In [141]:
best_log_regr = clf.best_estimator_

In [144]:
y_pred_prob = clf.predict_proba(X)
loss = log_loss(Y, y_pred_prob)

In [148]:
print(f'Fitted Logistic Regression -> Log-Loss = {loss}, Accuracy = {round(100*best_log_regr.score(X, Y), 2)}%')

Fitted Logistic Regression -> Log-Loss = 0.09541390991649372, Accuracy = 98.35%


# Linear Regression

In [185]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [205]:
def sigmoid(x):
  return 1 / (1 + np.exp(-x))

In [219]:
def log_loss_and_acc(model, name):
    probas = sigmoid(model.predict(X))
    loss = log_loss(Y, probas)
    acc = np.mean(np.where(probas > 0.5, 1, 0) == Y)
    print(f'{name} -> log-loss = {loss}, accuracy = {round(100*acc, 5)}%')

In [224]:
reg = LinearRegression().fit(X, Y)
log_loss_and_acc(reg, 'Linear Regression')

Linear Regression -> log-loss = 0.684693518365348, accuracy = 52.9703%


In [228]:
ridge = Ridge().fit(X, Y)
log_loss_and_acc(ridge, 'Ridge Regression')

Ridge Regression -> log-loss = 0.6911119147182496, accuracy = 52.14521%


In [233]:
lasso = Lasso(alpha=1e-3).fit(X, Y)
log_loss_and_acc(lasso, 'Lasso Regression')

Lasso Regression -> log-loss = 0.6981788201306711, accuracy = 51.32013%


  model = cd_fast.enet_coordinate_descent(
