In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp

from cleaning import clean_data

from sklearn import linear_model

from sklearn.metrics import cohen_kappa_score

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, VarianceThreshold

from sklearn import ensemble

In [2]:
# Borrow an optimized Rounding function for regression buckets from https://www.kaggle.com/code/abhishek/maybe-something-interesting-here/notebook
from functools import partial

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4

        ll = cohen_kappa_score(y, X_p, weights='quadratic')#quadratic_weighted_kappa(y, X_p)
        return -ll
    
    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        #print(loss_partial)
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        return X_p

    def coefficients(self):
        return self.coef_['x']

In [3]:
def temp_acc(l1, l2):
    count = 0
    for i, e in enumerate(l1):
        if l1[i] == l2[i]:
            count += 1
    count /= len(l1)
    
    return count

In [4]:
# Reduce collinearity of features
# Load Data
X_train, y_train, X_test, y_test = clean_data()

# PCA Decomposition
pca = PCA(svd_solver='full')
pcaX_train = pca.fit_transform(X_train)
pcaX_test = pca.transform(X_test)

# Select K Best
kb = SelectKBest()
kbX_train = kb.fit_transform(X_train, y_train)
kbX_test = kb.transform(X_test)

# Variance Threshold
vt = VarianceThreshold(threshold=(.8 * (1 - .8)))
vtX_train = vt.fit_transform(X_train, y_train)
vtX_test = vt.transform(X_test)

ols = linear_model.LinearRegression()
ridge = linear_model.Ridge()
lasso = linear_model.Lasso()
elastic = linear_model.ElasticNet()
log = linear_model.LogisticRegression( max_iter=1000)
CVlog = linear_model.LogisticRegressionCV(max_iter=1000)
SGD = linear_model.SGDClassifier()
GBReg = ensemble.GradientBoostingRegressor()

# Fit all 3 models on all 3 datasets using the above rounding functionality
datasets = [(X_train, X_test),(vtX_train, vtX_test), (kbX_train, kbX_test), (pcaX_train, pcaX_test)]
models = [ols, ridge, lasso, elastic, log, CVlog, SGD, GBReg]
print('Original Data, Variance Threshold, KBest, PCA')
for model in models:
    train_results = []
    test_results = []
    kappa_scores = []
    for X_train, X_test in datasets:
        optR = OptimizedRounder()
        model.fit(X_train, y_train)
        
        predictions = model.predict(X_train)
        optR.fit(predictions, y_train)
        coefs = optR.coefficients()
        valid_p = optR.predict(predictions, coefs)
        train_results.append(temp_acc(y_train, valid_p))
        
        predictions = model.predict(X_test)
        optR.fit(predictions, y_test)
        coefs = optR.coefficients()
        valid_p = optR.predict(predictions, coefs)
        test_results.append(temp_acc(y_test, valid_p))
        kappa_scores.append(cohen_kappa_score(y_test, valid_p, weights='quadratic'))
    print(type(model).__name__)
    print('Training Accuracy: ',train_results)
    print('Testing Accuracy: ', test_results)
    print('Kappa Scores: ', kappa_scores)

Original Data, Variance Threshold, KBest, PCA
LinearRegression
Training Accuracy:  [0.33187232015245355, 0.3149118627918056, 0.3204383039542639, 0.33120533587422585]
Testing Accuracy:  [0.26433970653623834, 0.33437083148065805, 0.25589150733659405, 0.32925744775455756]
Kappa Scores:  [0.20071679860245517, 0.2487597504151834, 0.16619604805307575, 0.2612912193601469]
Ridge
Training Accuracy:  [0.333968556455455, 0.3085278704144831, 0.3147212958551691, 0.333968556455455]
Testing Accuracy:  [0.33348154735437974, 0.2645620275678079, 0.2583370386838595, 0.33348154735437974]
Kappa Scores:  [0.27191564428320225, 0.17557598755677672, 0.16353638672920212, 0.27191564428320225]
Lasso
Training Accuracy:  [0.2191519771319676, 0.2191519771319676, 0.2191519771319676, 0.2191519771319676]
Testing Accuracy:  [0.21320586927523344, 0.21320586927523344, 0.21320586927523344, 0.21320586927523344]
Kappa Scores:  [0.0, 0.0, 0.0, 0.0]
ElasticNet
Training Accuracy:  [0.2191519771319676, 0.2191519771319676, 0.2191

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegressionCV
Training Accuracy:  [0.4042877560743211, 0.3612196283944736, 0.3513101476893759, 0.4038113387327299]
Testing Accuracy:  [0.36127167630057805, 0.35437972432192083, 0.35260115606936415, 0.3603823921742997]
Kappa Scores:  [0.2500331765434465, 0.2331388259025392, 0.2326076888260452, 0.24944887582657105]
SGDClassifier
Training Accuracy:  [0.3453072891853263, 0.31195807527394, 0.3036684135302525, 0.3494997617913292]
Testing Accuracy:  [0.316140506891952, 0.3156958648288128, 0.3159181858603824, 0.309693196976434]
Kappa Scores:  [0.1870296811344121, 0.14385386426366054, 0.10209141017479939, 0.11941216598226179]
GradientBoostingRegressor
Training Accuracy:  [0.3865650309671272, 0.3774178180085755, 0.3614101953311101, 0.4072415435921868]
Testing Accuracy:  [0.3523788350377946, 0.3485993775011116, 0.33770564695420185, 0.3470431302801245]
Kappa Scores:  [0.34718419646447674, 0.37686737310648055, 0.32702618165477926, 0.29641334290358967]
