In [12]:
# Importing required libraries
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import pandas as pd

from cleaning import clean_data
from evaluate import qwk


from sklearn import linear_model

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, VarianceThreshold

from sklearn import ensemble


In [2]:
# Generating 3 datasets that reduce collinearity in the features
# Load Data
X_train, y_train, X_test, y_test = clean_data('')

# PCA Decomposition
pca = PCA(svd_solver='full')
pcaX_train = pca.fit_transform(X_train)
pcaX_test = pca.transform(X_test)

# Select K Best
kb = SelectKBest()
kbX_train = kb.fit_transform(X_train, y_train)
kbX_test = kb.transform(X_test)

# Variance Threshold
vt = VarianceThreshold(threshold=(.8 * (1 - .8)))
vtX_train = vt.fit_transform(X_train, y_train)
vtX_test = vt.transform(X_test)

print(f'Training = {X_train.shape}')
print(f'Testing = {X_test.shape}')
# Create dict to store outcomes


Training = (10495, 335)
Testing = (4498, 335)


In [6]:
# Testing which dataset produces the best result on the linear regressor
controlLogReg = ensemble.GradientBoostingRegressor()
kbLogReg = ensemble.GradientBoostingRegressor()
pcaLogReg = ensemble.GradientBoostingRegressor()
vtLogReg = ensemble.GradientBoostingRegressor()

controlLogReg.fit(X_train, y_train)
kbLogReg.fit(kbX_train, y_train)
pcaLogReg.fit(pcaX_train, y_train)
vtLogReg.fit(vtX_train, y_train)

print("============= control ======================")
print("train acc: " + str(controlLogReg.score(X_train, y_train)))
print("test acc: " + str(controlLogReg.score(X_test, y_test)))

print("============= K best ======================")
print("train acc: " + str(kbLogReg.score(kbX_train, y_train)))
print("test acc: " + str(kbLogReg.score(kbX_test, y_test)))

print("============= PCA Decomposition ======================")
print("train acc: " + str(pcaLogReg.score(pcaX_train, y_train)))
print("test acc: " + str(pcaLogReg.score(pcaX_test, y_test)))

print("============= Variance Threshold ======================")
print("train acc: " + str(vtLogReg.score(vtX_train, y_train)))
print("test acc: " + str(vtLogReg.score(vtX_test, y_test)))


train acc: 0.2004869603016015
test acc: 0.15395712010749807
train acc: 0.170004889558243
test acc: 0.14088230357018072
train acc: 0.2501164710083923
test acc: 0.11180560345203883
train acc: 0.1894034186214375
test acc: 0.1488303067115362


In [10]:
# Trying a rounding function to sort the results into buckets
# Using the best test result from the test above

def roundGuess(guesses):
    for i, guess in enumerate(guesses):
        if guess < 0.5:
            guesses[i] = 0
        elif guess < 1.5:
            guesses[i] = 1
        elif guess < 2.5:
            guesses[i] = 2    
        elif guess < 3.5:
            guesses[i] = 3
        elif guess < 4.5:
            guesses[i] = 4
        else: guesses[i] = 5
    return guesses     

preds = controlLogReg.predict(X_test)
roundedPreds = roundGuess(preds)

print("Acc without rounding: " + str(mean_squared_error(preds, y_test)))
print("Acc with rounding: " + str(mean_squared_error(roundedPreds, y_test)))


[2. 3. 3. ... 2. 2. 3.]
[2. 3. 3. ... 2. 2. 3.]
Acc without rounding: 1.227879057358826
Acc with rounding: 1.227879057358826


In [19]:
# Trying different learning rates and max depths on the regressor
results = pd.DataFrame(columns=range(1,20))
lrs = []
for depth in range(1,20):
    print("\t" + str(depth))
    for lr in [0.001, 0.01, 0.1, 0.2, 0.5, 1]:
        reg = ensemble.GradientBoostingRegressor(learning_rate=lr, max_depth=depth)
        reg.fit(X_train, y_train)
        print(reg.score(X_test, y_test))
        lrs.append(reg.score(X_test, y_test))
    # results[depth] = lrs
print(results)


	1
0.007463090457218158
0.04682182383238076
0.11792963211745855
0.13399694160461328
0.14634336116149993
0.14176281702237448
	2
0.015511288142164514
0.07871995326100567
0.14443142805999398
0.1520415732004542
0.15387764774714752
0.13282311323364648
	3
0.018493198360860386
0.09665569868957635
0.15401421483429723
0.15830683198204787
0.14715817185513635
0.024335937319792555
	4
0.021455815662935862
0.11085707019025626
0.16009899389545446
0.1604066554363962
0.12783467234801416
0.0014532542909005963
	5
0.02353279661233132
0.11805149999664277
0.1618756872672782
0.1620170367629966
0.11945888933713589
-0.2403040138126824
	6
0.02491662628418756
0.12155136666258715
0.15978943104323018
0.15489349528717355
0.051888428598280134
-0.30878954341207954
	7
0.02640631671233229
0.12492258989843896
0.1549744243961595
0.14708540560277494
-0.03011852454628161
