In [9]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,recall_score, f1_score, precision_score, balanced_accuracy_score
from sklearn.utils import class_weight

In [4]:
df = pd.read_csv('../data/data2.csv')
X = df.drop(['severity_class'], axis=1)
y = df['severity_class'] - 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

class_weights = class_weight.compute_class_weight('balanced',
                                                  classes=np.unique(y_train),
                                                  y=y_train)
weights = np.array([class_weights[i] for i in y_train])

In [10]:
logReg = LogisticRegression()
logReg.fit(X_train, y_train)
y_pred = logReg.predict(X_test)

balance = balanced_accuracy_score(y_test, y_pred)
print(balance)

0.11131991840670095


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Make this work with 5-fold Cross Validation

In [11]:
from sklearn.model_selection import KFold, GridSearchCV

In [21]:
X_train1, X_test1 = X.iloc[:int(0.8 * len(X))], X.iloc[int(0.8 * len(X)):]
y_train1, y_test1 = y.iloc[:int(0.8 * len(y))], y.iloc[int(0.8 * len(y)):]

X_train2 = np.concatenate([X.iloc[:int(0.6 * len(X))], X.iloc[int(0.8 * len(X)):]])
X_test2  = X.iloc[int(0.6 * len(X)):int(0.8 * len(X))]
y_train2 = np.concatenate([y.iloc[:int(0.6 * len(X))], y.iloc[int(0.8 * len(X)):]])
y_test2  = y.iloc[int(0.6 * len(X)):int(0.8 * len(X))]

In [29]:
logReg = LogisticRegression(
    penalty='l1',       # Penalty: adds another term to the loss function;
                        # for Lin/Log Regression - loss is the Sum of Least squares; 
                        # L1 = drops features/sets the weights to 0, L2 = makes the weights small. 
    solver='liblinear', # Solver: changes the computational algortihm/procedure to fit the model/achieve the smallest loss. 
    C = 10,             # C = 1/lambda: the coefficient of the penalty
)



logReg.fit(X_train2, y_train2)
y_pred2 = logReg.predict(X_test2)

balance = balanced_accuracy_score(y_test2, y_pred2)
print(balance*100)

11.074256666375096




In [31]:
cv = KFold(n_splits = 5, shuffle=False)  ### Specifies the k=5, number of splits in the cross-validation procedure

logReg = LogisticRegression() ### Specify the model

params = {
    'penalty':['l1','l2'],
    'solver': ['liblinear'],
    'C': [0.1, 1, 10, 100],
}

grid_search = GridSearchCV(
    estimator=logReg,       # Model
    param_grid=params,      # Parameter Grid: 
    scoring = 'balanced_accuracy',
    cv=cv,
    verbose=2,              # Print progression: 1 = very brief, 2 = detailed, 0 = none
    n_jobs=6,               # Number of processors to use from my CPU
)

grid_search.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   1.6s
[CV] END ................C=0.1, penalty=l1, solver=liblinear; total time=   2.3s
[CV] END ................C=0.1, penalty=l1, solver=liblinear; total time=   2.3s
[CV] END ................C=0.1, penalty=l1, solver=liblinear; total time=   2.4s
[CV] END ................C=0.1, penalty=l1, solver=liblinear; total time=   3.1s
[CV] END ................C=0.1, penalty=l1, solver=liblinear; total time=   3.1s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   2.1s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   2.0s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   2.1s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   2.1s
[CV] END ..................C=1, penalty=l2, solver=liblinear; total time=   3.8s
[CV] END ..................C=1, penalty=l2, solve

In [32]:
print('Best Hyperparameters:')
print()
best_params = grid_search.best_params_
for param, value in best_params.items():
    print(f"{param}: {value}"+',  ')

Best Hyperparameters:

C: 100,  
penalty: l1,  
solver: liblinear,  


In [41]:
best_model = LogisticRegression(
    C=100,  
    penalty='l1',  
    solver='liblinear',
    class_weight=weights,
    )
best_model.fit(X_train1, y_train1)
y_pred = best_model.predict(X_test1)

balance = balanced_accuracy_score(y_test1, y_pred)
print(balance*100)

InvalidParameterError: The 'class_weight' parameter of LogisticRegression must be an instance of 'dict', a str among {'balanced'} or None. Got array([3.98809524, 0.19304463, 0.60953421, ..., 0.19304463, 0.19304463,
       0.19304463]) instead.

In [40]:
best_params

{'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}