In [26]:
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp

from cleaning import clean_data
from evaluate import qwk


from sklearn import linear_model

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, VarianceThreshold

from sklearn import ensemble

In [6]:
# Reduce collinearity of features
# Load Data
X_train, y_train, X_test, y_test = clean_data('')

# PCA Decomposition
pca = PCA(svd_solver='full')
pcaX_train = pca.fit_transform(X_train)
pcaX_test = pca.transform(X_test)

# Select K Best
kb = SelectKBest()
kbX_train = kb.fit_transform(X_train, y_train)
kbX_test = kb.transform(X_test)

# Variance Threshold
vt = VarianceThreshold(threshold=(.8 * (1 - .8)))
vtX_train = vt.fit_transform(X_train, y_train)
vtX_test = vt.transform(X_test)

print(f'Training = {X_train.shape}')
print(f'Testing = {X_test.shape}')
# Create dict to store outcomes

results = {}

Training = (10495, 335)
Testing = (4498, 335)


In [20]:
# Fit linear models with increasing l2 penalties
ols = linear_model.LinearRegression()
ridge = linear_model.Ridge()
lasso = linear_model.Lasso()
elastic = linear_model.ElasticNet()



logReg = ensemble.GradientBoostingRegressor()
logReg = ensemble.GradientBoostingRegressor()


ols.fit(kbX_train, y_train)
ridge.fit(kbX_train, y_train)
lasso.fit(kbX_train, y_train)
elastic.fit(kbX_train, y_train)
logReg.fit(kbX_train, y_train)
ols_pred = ols.predict(kbX_test)
ridge_pred = ridge.predict(kbX_test)
lasso_pred = lasso.predict(kbX_test)
elastic_pred = elastic.predict(kbX_test)

def roundGuess(guesses):
    for i, guess in enumerate(guesses):
        if guess < 0.5:
            guesses[i] = 0
        elif guess < 1.5:
            guesses[i] = 1
        elif guess < 2.5:
            guesses[i] = 2    
        elif guess < 3.5:
            guesses[i] = 3
        elif guess < 4.5:
            guesses[i] = 4
        else: guesses[i] = 5
    return guesses     

# roundedGuess = roundGuess(ols_pred)
# print(mean_squared_error(roundedGuess, y_test))

print("============= Ord. Least Squares ======================")
print("train acc: " + str(ols.score(kbX_train, y_train)))
print("test acc: " + str(ols.score(kbX_test, y_test)))
# print("qwk: " + str(qwk(y_test, ols_pred)))

print("============= Lasso ======================")
print("train acc: " + str(lasso.score(kbX_test, y_test)))
# print("qwk: " + str(qwk(y_test, lasso_pred)))

print("============= Ridge ======================")
print("test acc: " + str(ridge.score(kbX_test, y_test)))
# print("qwk: " + str(qwk(y_test, ridge_pred)))

print("============= Elastic ======================")
print("train acc: " + str(elastic.score(kbX_train, y_train)))
print("test acc: " + str(elastic.score(kbX_test, y_test)))# print("qwk: " + str(qwk(y_test, elastic_pred)))

print("============= LogReg ======================")
print("train acc: " + str(logReg.score(kbX_train, y_train)))
print("test acc: " + str(logReg.score(kbX_test, y_test)))# print("qwk: " + str(qwk(y_test, elastic_pred)))



train acc: 0.07043568746413342
test acc: 0.06251794577351222
train acc: -5.849490605447372e-06
test acc: 0.06251845952326063
train acc: 0.0
test acc: -5.849490605447372e-06
train acc: 0.1700048895582429
test acc: 0.140889442175558


In [32]:
# Fit classification models
log = linear_model.LogisticRegression( max_iter=1000)
CVlog = linear_model.LogisticRegressionCV(max_iter=1000)
SGD = linear_model.SGDClassifier()
GBClass = ensemble.GradientBoostingClassifier()

log.fit(vtX_train, y_train)
CVlog.fit(vtX_train, y_train)
SGD.fit(vtX_train, y_train)
GBClass.fit(vtX_train, y_train)



print("============= Logistic Regression ======================")
print("test acc: " + str(log.score(vtX_train, y_train)))
print("train acc: " + str(log.score(vtX_test, y_test)))

print("============= Cross Validated Logistic Regression ======================")
print("test acc: " + str(CVlog.score(vtX_train, y_train)))
print("train acc: " + str(CVlog.score(vtX_test, y_test)))


print("============= SGD Classifier ======================")
print("test acc: " + str(SGD.score(vtX_train, y_train)))
print("train acc: " + str(SGD.score(vtX_test, y_test)))

print("============= Gradient Boosted Regressor Classifier ======================")
print("test acc: " + str(GBClass.score(vtX_train, y_train)))
print("train acc: " + str(GBClass.score(vtX_test, y_test)))

test acc: 0.3609337779895188
train acc: 0.3554913294797688
test acc: 0.3613149118627918
train acc: 0.35437972432192083
test acc: 0.29909480705097663
train acc: 0.29702089817696753
test acc: 0.4479275845640781
train acc: 0.3930635838150289


In [33]:
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat

def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings

def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa
    axquadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.
    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.
    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.
    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = y
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

In [34]:

# Borrow an optimized Rounding function for regression buckets from https://www.kaggle.com/code/abhishek/maybe-something-interesting-here/notebook
from functools import partial

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4

        ll = quadratic_weighted_kappa(y, X_p)
        return -ll
    
    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        print(loss_partial)
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        return X_p

    def coefficients(self):
        return self.coef_['x']

In [31]:
# Fit all 3 models on all 3 datasets using the above rounding functionality

results = {}

datasets = [(vtX_train, vtX_test), (kbX_train, kbX_test), (pcaX_train, pcaX_test)]
models = [ols, ridge, lasso, elastic, logReg]
i = 0
for model in models:
    for X_train, X_test in datasets:
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        
        optR = OptimizedRounder()
        optR.fit(predictions, y_test)

        coefs = optR.coefficients()
        valid_p = optR.predict(predictions, coefs)
        
        results[i] = model.score(X_test, y_test)
        i += 1

print(results)

functools.partial(<bound method OptimizedRounder._kappa_loss of <__main__.OptimizedRounder object at 0x000001EA50A56430>>, X=array([2.14977434, 2.67040106, 3.0850211 , ..., 2.53606072, 2.41704703,
       2.60152644]), y=array([2, 1, 4, ..., 4, 2, 1], dtype=int64))
functools.partial(<bound method OptimizedRounder._kappa_loss of <__main__.OptimizedRounder object at 0x000001EA524359A0>>, X=array([2.14960777, 2.63195443, 3.15550627, ..., 2.63080459, 2.49138859,
       2.58038141]), y=array([2, 1, 4, ..., 4, 2, 1], dtype=int64))
functools.partial(<bound method OptimizedRounder._kappa_loss of <__main__.OptimizedRounder object at 0x000001EA50A56430>>, X=array([2.18238178, 2.73312203, 3.13962531, ..., 2.53461321, 2.31685351,
       2.72429918]), y=array([2, 1, 4, ..., 4, 2, 1], dtype=int64))
functools.partial(<bound method OptimizedRounder._kappa_loss of <__main__.OptimizedRounder object at 0x000001EA52435AC0>>, X=array([2.1498433 , 2.67022555, 3.08464681, ..., 2.53605723, 2.4169789 ,
       2

KeyboardInterrupt: 

In [35]:
# NOTES
# Problem can be treated as both linear and classifier problem. Not good in linear models. Can bucket predictions into classes.
import sklearn.ensemble.GradientBoostingRegressor

lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'learning_rate': 0.005,
    'subsample': .8,
    'colsample_bytree': 0.8,
    'min_split_gain': 0.006,
    'min_child_samples': 150,
    'min_child_weight': 0.1,
    'max_depth': 17,
    'n_estimators': 10000,
    'num_leaves': 80,
    'silent': -1,
    'verbose': -1,
    'max_depth': 11,
    'random_state': 2018
    }

lgbmodel = lgb.LGBMRegressor(**lgb_params)
lgbmodel.fit(
    vtX_train, y_train,
    eval_set=[(vtX_test, y_test)],
    eval_metric='rmse',
    verbose=100,
    early_stopping_rounds=100
)

#model.fit(xtrain, ytrain)
valid_preds = lgbmodel.predict(vtX_test, num_iteration=model.best_iteration_)
print(lgbmodel.score(vtX_test, y_tst))

ModuleNotFoundError: No module named 'sklearn.ensemble.GradientBoostingRegressor'

In [None]:


logReg = GradientBoostingRegressor()

logReg.fit(X_train, Y_train)

Y_pred = logReg.predict(X_test)

max = np.where(Y_pred == np. amax(Y_pred))

wrongValues = []
k = 0
for i, value in enumerate(Y_pred):
    if value > 10: 
        Y_pred[i] = 5
        k+=1
    if value < -5:
        Y_pred[i] = 0
        k+=1
print(k)
# for i in range(0,5):
#     print(Y_pred[np.random.randint(0, len(Y_pred))])
# print(len(Y_pred))
# print(len(Y_test))

results = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})

  

print(mean_squared_error(Y_test, roundGuess(Y_pred)))
print(r2_score(Y_test, Y_pred))



# NOTES: I tried rounding the guesses into discrete quantities, and using different regression models. Best R^2 I could get was 0.11 with gradient Boosted regression
# Need to remove collinearity and clean data more
# Need to try other models and maybe look at images
# Try to get an R^2 of at least 0.3 if you can. 
# Look into this https://www.kaggle.com/competitions/petfinder-adoption-prediction/discussion/87733


NameError: name 'Y_train' is not defined