# HW 2
By Steven Jia

## 1. Gradient Boosting
Create your own class that implements the Gradient Boosting concept, based on the locally weighted regression method (Lowess class), and that allows a user-prescribed number of boosting steps. The class you develop should have all the mainstream useful options, including the “fit”, “is_fitted”, and “predict” methods. Show applications with real data for regression, use 10-fold cross-validation, and compare the effect of different scalers, such as the “StandardScaler”, “MinMaxScaler”, and the “QuantileScaler” (scikit-learn's QuantileTransformer). In the case of the “Concrete” data set, determine a choice of hyperparameters that yields lower MSEs for your method when compared to the eXtreme Gradient Boosting (XGBoost) library.

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, RegressorMixin
from scipy.spatial.distance import cdist
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.metrics import mean_squared_error as mse

### Functions and classes from class notes

In [2]:
# Gaussian Kernel
def Gaussian(x):
    """Evaluate the truncated Gaussian kernel elementwise.

    Computes ``exp(-x**2 / 2) / sqrt(2*pi)`` and forces the result to 0
    wherever ``|x| > 4``.

    Parameters
    ----------
    x : scalar or array-like
        Scaled distances. Plain Python lists are accepted, matching the
        other kernels in this file.

    Returns
    -------
    numpy.ndarray
        Kernel values with the same shape as ``x``.
    """
    # Coerce first: ``x**2`` is undefined for a plain list, whereas the
    # sibling kernels already tolerate lists because they go through np.abs.
    x = np.asarray(x)
    norm = 1 / np.sqrt(2 * np.pi)
    return np.where(np.abs(x) > 4, 0, norm * np.exp(-1 / 2 * x**2))

In [3]:
# this is the correct vectorized version
def Tricubic(x):
    """Evaluate the tricube kernel ``(1 - |x|**3)**3`` elementwise.

    Values with ``|x| > 1`` fall outside the kernel's support and map to 0.
    """
    radius = np.abs(x)
    inside = (1 - radius**3)**3
    return np.where(radius > 1, 0, inside)

In [4]:
# Epanechnikov Kernel
def Epanechnikov(x):
    """Evaluate the Epanechnikov kernel ``3/4 * (1 - x**2)`` elementwise.

    Values with ``|x| > 1`` fall outside the kernel's support and map to 0.
    """
    radius = np.abs(x)
    inside = 3 / 4 * (1 - radius**2)
    return np.where(radius > 1, 0, inside)

In [5]:
# Quartic Kernel
def Quartic(x):
    """Evaluate the quartic (biweight) kernel ``15/16 * (1 - x**2)**2`` elementwise.

    Values with ``|x| > 1`` fall outside the kernel's support and map to 0.
    """
    radius = np.abs(x)
    inside = 15 / 16 * (1 - radius**2)**2
    return np.where(radius > 1, 0, inside)

In [6]:
def weight_function(u,v,kern=Gaussian,tau=0.5):
    """Return kernel weights between every row of ``u`` and every row of ``v``.

    Pairwise Euclidean distances are divided by ``2 * tau`` and then passed
    through ``kern``. With the elementwise kernels defined in this file the
    result has one row per row of ``u`` and one column per row of ``v``.

    Parameters
    ----------
    u, v : 2-D array-like
        Point sets with the same number of columns (a ``cdist`` requirement).
    kern : callable
        Kernel applied to the scaled distance matrix.
    tau : float
        Bandwidth; larger values give distant points more weight.
    """
    distances = cdist(u, v, metric='euclidean')
    scaled = distances / (2 * tau)
    return kern(scaled)

In [7]:
class Lowess:
    """Locally weighted ridge regression.

    The model is lazy: ``fit`` only stores the training data, and ``predict``
    fits one small ridge regression per query point, using kernel weights
    that depend on the distance between the query and every training row.

    Parameters
    ----------
    kernel : callable
        Kernel applied to scaled distances (see ``weight_function``).
    tau : float
        Kernel bandwidth forwarded to ``weight_function``.
    """

    def __init__(self, kernel = Gaussian, tau=0.05):
        self.kernel = kernel
        self.tau = tau

    def fit(self, x, y):
        """Store the training features ``x`` and targets ``y``.

        Returns ``self`` so calls can be chained. The trailing-underscore
        attributes set here are what ``check_is_fitted`` looks for.
        """
        self.xtrain_ = np.asarray(x)
        self.yhat_ = np.asarray(y)
        return self

    @staticmethod
    def _scale_rows(weights, values):
        """Multiply row ``i`` of ``values`` by ``weights[i]``.

        Gives the same product as ``np.diag(weights) @ values`` without
        materialising the n-by-n diagonal matrix.
        """
        values = np.asarray(values)
        column_shape = (-1,) + (1,) * (values.ndim - 1)
        return weights.reshape(column_shape) * values

    def predict(self, x_new):
        """Predict the target at each query point.

        Parameters
        ----------
        x_new : scalar or 2-D array-like
            Either a single number (for one-feature training data) or an
            array with one query point per row.

        Returns
        -------
        float or numpy.ndarray
            A single prediction for scalar input, otherwise a flat array with
            one prediction per query row.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        """
        check_is_fitted(self)
        x = self.xtrain_
        y = self.yhat_

        # A scalar query is treated as one point of a single-feature problem,
        # so both the training data and the query become 2-D for cdist.
        scalar_query = np.isscalar(x_new)
        if scalar_query:
            x = x.reshape(-1, 1)
            queries = np.array([[x_new]], dtype=float)
        else:
            queries = np.asarray(x_new)

        # Column i holds the weight of every training row for query i.
        w = weight_function(x, queries, self.kernel, self.tau)
        lm = linear_model.Ridge(alpha=0.001)

        yest = []
        for i, query in enumerate(queries):
            w_i = w[:, i]
            # Training rows and targets are both scaled by the kernel weights
            # before the ridge fit.
            lm.fit(self._scale_rows(w_i, x), self._scale_rows(w_i, y))
            yest.append(lm.predict(query.reshape(1, -1)))

        yest = np.array(yest).flatten()
        return yest[0] if scalar_query else yest

### My gradient boost class

In [20]:
class Gradient_Boosting:
    """Gradient boosting with ``Lowess`` as the base learner.

    A first ``Lowess`` model is fit to the targets; each of the
    ``boost_rounds`` additional models is fit to the residuals left by the
    models before it. The prediction is the sum of all stage predictions.

    Parameters
    ----------
    boost_rounds : int
        Number of residual-fitting stages added on top of the base model.
    kernel : callable
        Kernel used by every ``Lowess`` stage.
    tau : float
        Bandwidth used by every ``Lowess`` stage.
    """

    def __init__(self, boost_rounds = 1, kernel = Gaussian, tau=0.05):
        self.boosts = boost_rounds
        self.fitted = False
        self.kernel = kernel
        self.tau = tau

    def _new_stage(self):
        """Build one base learner with this booster's hyperparameters."""
        return Lowess(kernel=self.kernel, tau=self.tau)

    def fit(self, x, y):
        """Fit the base model and all boosting stages; return ``self``.

        Every stage uses the configured ``kernel`` and ``tau`` so that the
        hyperparameters chosen by the user apply to the whole ensemble.
        """
        models = [self._new_stage()]
        models.extend(self._new_stage() for _ in range(self.boosts))

        residual = np.asarray(y)
        for model in models:
            model.fit(x, residual)
            # The next stage learns whatever this stage failed to explain.
            residual = residual - model.predict(x)

        # Publish the result only after every stage fitted without error.
        self.models = models
        self.fitted = True
        return self

    def is_fitted(self):
        """Return True once ``fit`` has completed successfully."""
        return self.fitted

    def __sklearn_is_fitted__(self):
        """Hook consulted by ``sklearn.utils.validation.check_is_fitted``."""
        return self.fitted

    def predict(self, x_new):
        """Return the sum of all stage predictions for ``x_new``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        """
        check_is_fitted(self)
        base, *stages = self.models
        output = base.predict(x_new)
        for stage in stages:
            output += stage.predict(x_new)
        return output

### Comparing to XGBRegressor

In [9]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Load the concrete data set from the mounted Drive folder.
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DATA 441/concrete.csv')

In [11]:
# Display the loaded DataFrame as the cell output.
data

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [12]:
# Features: every column except 'strength'; target: the 'strength' column.
x = data.drop(columns=['strength']).values
y = data['strength'].values

### Testing Scalers

In [13]:
# One instance of each scaler compared in the cross-validation cells.
StandardScale = StandardScaler()
MinMaxScale = MinMaxScaler()
QuantileScale = QuantileTransformer()

In [25]:
# Per-fold test MSEs, one list per scaler.
mseStandard, mseMinMax, mseQuantile = [], [], []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)
model_gb = Gradient_Boosting()

# Each scaler is paired with the list that collects its fold scores.
scaler_runs = (
    (StandardScale, mseStandard),
    (MinMaxScale, mseMinMax),
    (QuantileScale, mseQuantile),
)

for idxtrain, idxtest in kf.split(x):
    xtrain, xtest = x[idxtrain], x[idxtest]
    ytrain, ytest = y[idxtrain].ravel(), y[idxtest].ravel()

    for scaler, fold_scores in scaler_runs:
        # The scaler is fit on the training fold only, then applied to the
        # held-out fold.
        xtrain_scaled = scaler.fit_transform(xtrain)
        xtest_scaled = scaler.transform(xtest)

        model_gb.fit(xtrain_scaled, ytrain)
        yhat = model_gb.predict(xtest_scaled)
        fold_scores.append(mse(ytest, yhat))

# Report the mean MSE across folds for each scaler.
for label, fold_scores in (
    ('The Cross-validated Mean Squared Error for Standard Scaling is : ', mseStandard),
    ('The Cross-validated Mean Squared Error for the Min Max Scaling is: ', mseMinMax),
    ('The Cross-validated Mean Squared Error for the Quantile Scaling is: ', mseQuantile),
):
    print(label+str(np.mean(fold_scores)))



The Cross-validated Mean Squared Error for Standard Scaling is : 825.4143660465567
The Cross-validated Mean Squared Error for the Min Max Scaling is: 85.5176139670593
The Cross-validated Mean Squared Error for the Quantile Scaling is: 137.75533404779452


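It is interesting that the choice of scaler changes the MSE so much. The kernel bandwidth `tau` is applied to Euclidean distances between the scaled features, so each scaler changes how many training points receive a meaningful weight. Changing the scaler therefore also requires re-tuning the hyperparameters (in particular `tau`) for that specific scaler.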

In [70]:
# Per-fold test MSEs for the Lowess booster and for XGBoost.
mse_gb, mse_rf = [], []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)
model_rf = XGBRegressor(objective ='reg:squarederror',n_estimators=100,reg_lambda=20,alpha=1,gamma=10,max_depth=3)
model_gb = Gradient_Boosting(boost_rounds = 5, kernel = Gaussian, tau = 0.05)

# Each model is paired with the list that collects its fold scores.
contenders = ((model_gb, mse_gb), (model_rf, mse_rf))

for idxtrain, idxtest in kf.split(x):
    ytrain, ytest = y[idxtrain].ravel(), y[idxtest].ravel()
    # Min-max scaling is fit on the training fold and reused on the test fold.
    xtrain = MinMaxScale.fit_transform(x[idxtrain])
    xtest = MinMaxScale.transform(x[idxtest])

    fold_predictions = []
    for model, _ in contenders:
        model.fit(xtrain,ytrain)
        fold_predictions.append(model.predict(xtest))

    for (_, fold_scores), yhat in zip(contenders, fold_predictions):
        fold_scores.append(mse(ytest,yhat))

for label, fold_scores in (
    ('The Cross-validated Mean Squared Error for Gradient Boosting is : ', mse_gb),
    ('The Cross-validated Mean Squared Error for the eXtream Gradient Boosting is: ', mse_rf),
):
    print(label+str(np.mean(fold_scores)))

The Cross-validated Mean Squared Error for Gradient Boosting is : 77.14154812451821
The Cross-validated Mean Squared Error for the eXtream Gradient Boosting is: 23.056159469445554


Despite my best efforts, I could not find a choice of hyperparameters that beats eXtreme Gradient Boosting (XGBoost).

## 2. K-nearest neighbors

Based on the Usearch library, create your own class that computes the k-Nearest Neighbors for Regression.

In [26]:
# !pip install usearch

Collecting usearch
  Downloading usearch-2.9.0-cp310-cp310-manylinux_2_28_x86_64.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: usearch
Successfully installed usearch-2.9.0


In [28]:
from usearch.index import search, MetricKind, Matches, BatchMatches
from collections import Counter

My implementation assumes that the last column of the matrix passed in holds the class label of each vector.

In [51]:
class Nearest_Neighbors:
    """Exact k-nearest-neighbour majority-vote predictor built on usearch.

    Parameters
    ----------
    vectors : 2-D numpy.ndarray
        Training matrix whose last column is the label of each row; all
        other columns are the features.
    k : int
        Number of neighbours that vote on each prediction.
    """

    def __init__(self, vectors, k=10):
        self.classification = vectors[:, -1]
        self.vectors = np.delete(vectors, -1, axis=1)
        self.k = k

    @staticmethod
    def _vote(labels):
        """Return the most frequent label; ties go to the label seen first."""
        counts = Counter(labels)
        return max(counts, key=counts.get)

    def predict(self, x_new):
        """Predict the label of one query vector or of a batch of them.

        Parameters
        ----------
        x_new : array-like
            A single feature vector (1-D) or one query per row (2-D).

        Returns
        -------
        label or numpy.ndarray
            The winning label for a single query, or an array with one
            winning label per row for a batch.
        """
        # Cast the query to the dataset's dtype so both sides of the distance
        # computation use the same element type.
        query = np.asarray(x_new, dtype=self.vectors.dtype)
        matches = search(self.vectors, query, self.k, MetricKind.L2sq, exact=True)
        keys = np.asarray(matches.keys)

        if keys.ndim == 1:
            return self._vote(self.classification[keys])
        # NOTE(review): for a 2-D query usearch is expected to return
        # BatchMatches with one row of keys per query; confirm against the
        # installed usearch version.
        return np.array([self._vote(self.classification[row]) for row in keys])

### A test of my implementation

In [65]:
# 10,000 random 5-feature vectors, uniform on [0, 1), stored as float32.
vectors = np.random.rand(10000, 5).astype(np.float32)

For determining a class, I round the sum of all the features.

In [66]:
# Synthetic label for each vector: the sum of its features, rounded.
classification = np.round(np.sum(vectors, axis = 1))

In [67]:
# Append the label as the last column, the layout Nearest_Neighbors reads.
vectors = np.hstack((vectors, classification[:, np.newaxis]))
vectors

array([[0.38437238, 0.7012594 , 0.9304328 , 0.40830556, 0.2553192 ,
        3.        ],
       [0.5399805 , 0.26744857, 0.35217568, 0.9855328 , 0.07068601,
        2.        ],
       [0.01432674, 0.68896276, 0.9213818 , 0.526632  , 0.72499824,
        3.        ],
       ...,
       [0.06500737, 0.46909127, 0.90405923, 0.62479186, 0.74778247,
        3.        ],
       [0.9587981 , 0.5755314 , 0.8257162 , 0.7197127 , 0.597727  ,
        4.        ],
       [0.13823645, 0.77575755, 0.8459041 , 0.8726179 , 0.19502705,
        3.        ]], dtype=float32)

In [68]:
# Build the predictor with the default k=10.
nnModel = Nearest_Neighbors(vectors)

In [69]:
# Predict the label of one random 5-feature query vector.
nnModel.predict(np.random.rand(5).astype(np.float32))

2.0