# Logistic Regression

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import pandas as pd

# # supress warnings
# import warnings
# warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('E:/my-projs/wall-street-sentiments/data/train_test_data/train_test_data_new.csv')

In [3]:
# Remove columns that are not needed for modelling

data = data.drop(
    ['name', 
     'ticker', 
     'timestamp',
    'opening_price',
   'closing_price',
   'rank_24h_ago',
   'mentions_24h_ago',
   'rank', 'dividend_exists'
    ], axis=1).sort_values(
    by=['mentions'], 
    ascending=False)

In [4]:
data.dropna(inplace=True)

## Initial model

In [5]:
x_train, x_test, y_train, y_test = train_test_split(data.drop('label', axis=1), data['label'], test_size=0.35, random_state=42)

In [6]:
model = LogisticRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.25      0.34        59
           1       0.58      0.81      0.67        74

    accuracy                           0.56       133
   macro avg       0.55      0.53      0.51       133
weighted avg       0.55      0.56      0.53       133



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# Training error

y_pred_train = model.predict(x_train)

print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.57      0.29      0.38       111
           1       0.59      0.82      0.69       136

    accuracy                           0.58       247
   macro avg       0.58      0.56      0.53       247
weighted avg       0.58      0.58      0.55       247



## Some initial preprocessing

In [8]:
# mean 0 and std 1

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

model = LogisticRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.46      0.29      0.35        59
           1       0.56      0.73      0.64        74

    accuracy                           0.53       133
   macro avg       0.51      0.51      0.49       133
weighted avg       0.52      0.53      0.51       133



## Hyper-parameter tuning

In [9]:
# random search cv
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 1000, 2500, 5000, 50000],
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(x_train, y_train)

grid_search.best_params_

grid_search.best_score_

y_pred = grid_search.predict(x_test)

print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 250 candidates, totalling 1250 fits


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        59
           1       0.55      0.96      0.70        74

    accuracy                           0.53       133
   macro avg       0.27      0.48      0.35       133
weighted avg       0.30      0.53      0.39       133



375 fits failed out of a total of 1250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
125 fits failed with the following error:
Traceback (most recent call last):
  File "e:\my-projs\wall-street-sentiments\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\my-projs\wall-street-sentiments\.venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "e:\my-projs\wall-street-sentiments\.venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' p

In [10]:
grid_search.best_params_

{'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}

In [11]:
# Training Error

y_pred_train = grid_search.predict(x_train)

print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.75      0.03      0.05       111
           1       0.56      0.99      0.71       136

    accuracy                           0.56       247
   macro avg       0.65      0.51      0.38       247
weighted avg       0.64      0.56      0.42       247

