In [2]:
import sys
sys.path.append('./')
import warnings
warnings.simplefilter("ignore")

from logRegression import LogRegression 

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score

In [3]:
def cross_val_score(model, X, y, n_splits=5):
    """Evaluate ``model`` with unshuffled K-fold cross-validation.

    For every fold the model is re-fitted on the training part and scored on
    the held-out part with accuracy, micro-averaged precision and ROC AUC.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X : array-like
        Feature matrix. Converted with ``np.asarray`` so that NumPy arrays and
        pandas DataFrames are both indexed positionally.
    y : array-like
        Target labels aligned with the rows of ``X``.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.

    Returns
    -------
    tuple
        ``(mean_accuracy, mean_precision, mean_roc_auc)`` averaged over folds.
    """
    # Positional fancy indexing below needs plain arrays; a DataFrame would
    # interpret the fold indices as column labels.
    X = np.asarray(X)
    y = np.asarray(y)

    fold_scores = []
    for train_index, test_index in KFold(n_splits=n_splits).split(X):
        model.fit(X[train_index], y[train_index])
        y_test = y[test_index]
        y_pred = model.predict(X[test_index])
        fold_scores.append((
            accuracy_score(y_test, y_pred),
            # ``pos_label`` is ignored unless average='binary', so it is not passed.
            precision_score(y_test, y_pred, average='micro'),
            # NOTE: the AUC is computed from one-hot encoded hard predictions,
            # not from probabilities, so it only reflects a single threshold.
            roc_auc_score(pd.get_dummies(y_test), pd.get_dummies(y_pred)),
        ))

    # Average each metric over the folds, keeping the original return order.
    return tuple(np.mean(metric_values) for metric_values in zip(*fold_scores))

# Data loading and splitting

In [5]:
# Read the dataset and discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier export — confirm against the CSV).
df = pd.read_csv('newRain.csv').drop(columns=['Unnamed: 0'])

# Features are every remaining column except the target 'RainTomorrow'.
X = df.drop(columns=['RainTomorrow'])
Y = df['RainTomorrow']

# Reproducible 70/30 train/test split.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

In [6]:
def print_error_validation(accuracy, precission, roc_auc):
    """Print the three cross-validation metrics in a fixed layout.

    Parameters
    ----------
    accuracy : float
        Mean accuracy over the folds.
    precission : float
        Mean precision over the folds. The misspelled name is kept so that
        existing keyword callers keep working.
    roc_auc : float
        Mean ROC AUC over the folds.
    """
    print('Ошибки на кросс валидации:')
    print(f'Accuracy  = {accuracy}')
    # Use the argument itself; the body previously read a global `precision`.
    print(f'Precision = {precission}')
    print(f'Roc auc   = {roc_auc}')
    
def retrain(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training split and print train/test metrics.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    x_train, x_test : array-like
        Feature matrices of the training and test splits.
    y_train, y_test : array-like
        True labels of the training and test splits.

    Returns
    -------
    None
        The metrics are only printed, side by side for train and test.
    """
    model.fit(x_train, y_train)

    y_pred_test = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    def _scores(y_true, y_pred):
        # scikit-learn metrics take (y_true, y_pred) in that order.
        return (
            accuracy_score(y_true, y_pred),
            # ``pos_label`` is ignored unless average='binary', so it is not passed.
            precision_score(y_true, y_pred, average='micro'),
            # NOTE: AUC of one-hot encoded hard predictions, not of probabilities.
            roc_auc_score(pd.get_dummies(y_true), pd.get_dummies(y_pred)),
        )

    # Regroup the (train, test) score triples into per-metric (train, test) pairs.
    acc, prec, roc = zip(_scores(y_train, y_pred_train), _scores(y_test, y_pred_test))

    print('Ошибки на выборках')
    print('            Train                   Test')
    print(f'Accuracy  = {acc[0]}  |  {acc[1]}')
    print(f'Precision = {prec[0]}  |  {prec[1]}')
    print(f'Roc auc   = {roc[0]}  |  {roc[1]}')
    
    
def search_parametrs_C(C_list):
    """Return the value of ``C`` with the highest cross-validated ROC AUC.

    Each candidate is evaluated with a fresh ``LogisticRegression(C=C)`` on the
    notebook-level ``X`` / ``Y`` frames through the notebook's own
    ``cross_val_score`` helper, which returns ``(accuracy, precision, roc_auc)``.

    Parameters
    ----------
    C_list : iterable
        Candidate values for the ``C`` hyper-parameter.

    Returns
    -------
    The best candidate, or ``None`` when ``C_list`` is empty. Ties keep the
    earliest candidate.
    """
    # NOTE: relies on the three-value helper defined in this notebook; a later
    # `from sklearn.model_selection import cross_val_score` rebinds that name.
    best_C = None
    best_roc = float('-inf')

    for C in C_list:
        model = LogisticRegression(C=C)
        _, _, roc_auc = cross_val_score(model, X.values, Y.values)
        if roc_auc > best_roc:
            # Remember the running best score; without this update every
            # candidate with a positive AUC would overwrite the previous one.
            best_roc = roc_auc
            best_C = C
    return best_C

# Logistic Regression implementation

# Without regularization

In [7]:
# Custom LogRegression built with its default constructor arguments.
# The section heading labels this the unregularized variant — confirm in logRegression.py.
model = LogRegression()

In [8]:
# 5-fold cross-validation of the current `model` on the full dataset (as NumPy arrays).
accuracy, precision, roc_auc = cross_val_score(model, X.values, Y.values)
print_error_validation(accuracy, precision, roc_auc)

Ошибки на кросс валидации:
Accuracy  = 0.4614
Precision = 0.4614
Roc auc   = 0.5


In [9]:
# Refit on the 70% training split and print train vs. hold-out test metrics.
retrain(model, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                   Test
Accuracy  = 0.7528571428571429  |  0.752
Precision = 0.7528571428571429  |  0.752
Roc auc   = 0.5  |  0.5


# With regularization

In [10]:
# Same custom model with C=0.9. How C maps to regularization strength is defined
# in logRegression.py, which is not shown here.
model = LogRegression(C=0.9)

In [11]:
# Cross-validated metrics for the model created with C=0.9.
accuracy, precision, roc_auc = cross_val_score(model, X.values, Y.values)
print_error_validation(accuracy, precision, roc_auc)

Ошибки на кросс валидации:
Accuracy  = 0.5558
Precision = 0.5558
Roc auc   = 0.5


In [12]:
# Refit the C=0.9 model on the training split and print train vs. test metrics.
retrain(model, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                   Test
Accuracy  = 0.7528571428571429  |  0.752
Precision = 0.7528571428571429  |  0.752
Roc auc   = 0.5  |  0.5


# Logistic Regression with Sklearn

In [13]:
# NOTE(review): these imports sit mid-notebook; LogisticRegression is already
# imported in the first cell.
from sklearn.linear_model import LogisticRegression
# NOTE(review): the next import rebinds `cross_val_score`, shadowing the custom
# helper defined earlier in this notebook. Any later call (for example inside
# search_parametrs_C) gets scikit-learn's function, which returns an array of
# per-fold scores rather than an (accuracy, precision, roc_auc) tuple.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
# Scorer names, and presumably the matching `test_<name>` result keys of
# cross_validate; neither list is used in the cells shown here.
scoring = ['precision_micro', 'roc_auc','accuracy']
scoring_test = ['test_accuracy','test_precision_micro','test_roc_auc']

In [14]:
# scikit-learn baseline. C is the inverse regularization strength, so C=1e5
# leaves the model almost unpenalized. `fit` returns the estimator itself.
logreg = LogisticRegression(C=1e5).fit(x_train, y_train)

# Accuracy on the hold-out test split.
y_pred = logreg.predict(x_test)
print(f'Ошибка на выборках:\nAccuracy = {accuracy_score(y_test, y_pred)}')

Ошибка на выборках:
Accuracy = 0.9986666666666667


In [15]:
# LogisticRegressionCV tunes C with 5-fold cross-validation on the training split.
# NOTE(review): the value printed below is accuracy on the hold-out test split,
# not a cross-validation score, even though the label says cross-validation.
logreg = LogisticRegressionCV(cv=5,  random_state=0).fit(x_train, y_train)  # multi_class='ovr' is commented out, so the library default is used
y_pred = logreg.predict(x_test)

print(f'Ошибки на кросс валидации:\nAccuracy = {accuracy_score(y_test, y_pred)} ')

Ошибки на кросс валидации:
Accuracy = 0.9986666666666667 


In [16]:
# Refits the LogisticRegressionCV instance on the training split (passed as
# NumPy arrays) and prints train vs. test metrics.
retrain(logreg, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                   Test
Accuracy  = 0.9997142857142857  |  0.9986666666666667
Precision = 0.9997142857142857  |  0.9986666666666667
Roc auc   = 0.999421965317919  |  0.9982126515671471
