In [82]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

def get_classifier_data():
    data=pd.read_csv("WineQT.csv")

    values=[[] for i in data]
    keys=[key for key in data]

    for i in range(len(data)):
        for key in range(len(keys)):
            values[key].append(data[keys[key]][i])

    X,Y=[],[]

    for i in range(len(data)):
        xvals=[]
        for key in keys:
            if key == "quality":
                if data[key][i]<=5:
                    yvals=[ 0 for i in range(6)]
                else:
                    yvals=[ 1 for i in range(6)]
                    
            else:
                xvals.append(data[key][i])
        X.append(xvals)
        Y.append(yvals)
    # Y=np.array(Y)
    # print(y)
    imputer = SimpleImputer(strategy='mean')
    X = imputer.fit_transform(X)
    scaler = MinMaxScaler()
    X= scaler.fit_transform(X)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    x_train,x_oth,y_train,y_oth=train_test_split(X,Y,test_size=0.3,random_state=42)
    x_val,x_test,y_val,y_test=train_test_split(x_oth,y_oth,test_size=0.1,random_state=42)
    y_train=np.array(y_train)
    return x_train,y_train,x_test,y_test,x_val,y_val

In [83]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

def get_regressor_data():
    data=pd.read_csv("HousingData.csv")

    keys=[key for key in data]
    column_means = data.mean()

    data = data.fillna(column_means)

    imputer = SimpleImputer(strategy='mean')
    data = imputer.fit_transform(data)
    values=[[] for i in range(len(data[0]))]
    # print(keys)

    for i in data:
        for j in range(len(i)):
            values[j].append(i[j])

    X=data[:,:-1]
    Y=data[:,-1]
    Y=Y.reshape(-1,1)

    scaler = MinMaxScaler()
    X= scaler.fit_transform(X)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    x_train,x_oth,y_train,y_oth=train_test_split(X,Y,test_size=0.3,random_state=42)
    x_val,x_test,y_val,y_test=train_test_split(x_oth,y_oth,test_size=0.1,random_state=42)

    y_train=y_train.reshape(-1,1)
    y_test=y_test.reshape(-1,1)
    y_val=y_val.reshape(-1,1)
    
    return x_train,y_train,x_test,y_test,x_val,y_val

# Random Forest

In [84]:

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import numpy as np
from sklearn.metrics import mean_squared_error, accuracy_score
from collections import Counter

class RandomForestRegressor:
    def __init__(self, n_estimators=100, max_features='sqrt', bootstrap=True, fraction_samples=1.0):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.fraction_samples = fraction_samples
        self.estimators = []
        self.feature_indices = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.max_features=n_features
        n_subsamples = int(n_samples * self.fraction_samples)

        for _ in range(self.n_estimators):
            if self.bootstrap:
                sample_indices = np.random.choice(n_samples, n_subsamples, replace=True)
            else:
                sample_indices = np.random.choice(n_samples, n_subsamples, replace=False)

            feature_indices = np.random.choice(n_features, self.max_features, replace=False)
            self.feature_indices.append(feature_indices)
            X_subset = X[sample_indices][:, feature_indices]
            y_subset = y[sample_indices]
            estimator = DecisionTreeRegressor()
            estimator.fit(X_subset, y_subset)
            self.estimators.append(estimator)

    def predict(self, X):
        predictions = []
        for i, estimator in enumerate(self.estimators):
            feature_indices = self.feature_indices[i]
            X_subset = X[:, feature_indices]
            preds = estimator.predict(X_subset)
            predictions.append(preds)

        final_predictions = np.mean(np.array(predictions),axis=0)
        return final_predictions

class RandomForestClassifier:
    def __init__(self, n_estimators, bootstrap, fraction_samples):
        self.n_estimators = n_estimators
        self.bootstrap = bootstrap
        self.fraction_samples = fraction_samples
        self.estimators = []
        self.feature_indices = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.max_features=n_features
        n_subsamples = int(n_samples * self.fraction_samples)

        for _ in range(self.n_estimators):
            if self.bootstrap:
                sample_indices = np.random.choice(n_samples, n_subsamples, replace=True)
            else:
                sample_indices = np.random.choice(n_samples, n_subsamples, replace=False)

            feature_indices = np.random.choice(n_features, self.max_features, replace=False)
            self.feature_indices.append(feature_indices)
            X_subset = X[sample_indices][:, feature_indices]
            y_subset = y[sample_indices]
            estimator = DecisionTreeClassifier()
            estimator.fit(X_subset, y_subset)
            self.estimators.append(estimator)

    def predict(self, X):
        predictions = []
        for i, estimator in enumerate(self.estimators):
            feature_indices = self.feature_indices[i]
            X_subset = X[:, feature_indices]
            preds = estimator.predict(X_subset)
            predictions.append(preds)
        final_predictions = (np.mean(np.array(predictions),axis=0)>=0.5).astype(int)
        return final_predictions


## Classifier

In [85]:
x_train,y_train,x_test,y_test,x_val,y_val = get_classifier_data()

results = []

for fraction in [0.15, 0.25, 0.5, 0.75, 1.0]:
    for num_estimators in [10, 50, 100, 200]:
        for bootstrap in [True, False]:
            rf_classifier = RandomForestClassifier(fraction_samples=fraction, bootstrap=bootstrap, n_estimators=num_estimators)
            rf_classifier.fit(x_train, y_train)
            preds=rf_classifier.predict(x_test)
            # print(len(preds))
            # print(len(y_test))
            score = accuracy_score(y_test, preds)
            results.append([score, fraction, num_estimators, bootstrap])

results.sort(reverse=True)
# print(results[:10])
for i in results[:10]:
    print("Accuracy:",i[0],"\t Number of Estimators:", i[2],"\t Bootstrap:",i[3],"\t Fraction:", i[1])

Accuracy: 0.8571428571428571 	 Number of Estimators: 50 	 Bootstrap: True 	 Fraction: 0.5
Accuracy: 0.8285714285714286 	 Number of Estimators: 200 	 Bootstrap: True 	 Fraction: 0.5
Accuracy: 0.8285714285714286 	 Number of Estimators: 200 	 Bootstrap: False 	 Fraction: 0.5
Accuracy: 0.8285714285714286 	 Number of Estimators: 200 	 Bootstrap: False 	 Fraction: 0.25
Accuracy: 0.8285714285714286 	 Number of Estimators: 50 	 Bootstrap: True 	 Fraction: 0.25
Accuracy: 0.8285714285714286 	 Number of Estimators: 50 	 Bootstrap: False 	 Fraction: 0.25
Accuracy: 0.8 	 Number of Estimators: 100 	 Bootstrap: True 	 Fraction: 1.0
Accuracy: 0.8 	 Number of Estimators: 200 	 Bootstrap: True 	 Fraction: 0.75
Accuracy: 0.8 	 Number of Estimators: 100 	 Bootstrap: False 	 Fraction: 0.75
Accuracy: 0.8 	 Number of Estimators: 50 	 Bootstrap: True 	 Fraction: 0.75


## Regressor

In [86]:
x_train,y_train,x_test,y_test,x_val,y_val = get_regressor_data()

results= []

for fraction in [0.15, 0.25, 0.5, 0.75, 1.0]:
    for num_estimators in [10, 50, 100, 200]:
        for bootstrap in [True, False]:
            rf_regressor = RandomForestRegressor(fraction_samples=fraction, bootstrap=bootstrap, n_estimators=num_estimators)
            rf_regressor.fit(x_train, y_train)
            preds=rf_regressor.predict(x_test)
            score = mean_squared_error(y_test, preds)
            results.append([score, fraction, num_estimators, bootstrap])

results.sort()
for i in results[:10]:
    print("Mean Squared Error:",i[0],"\t Number of Estimators:", i[2],"\t Bootstrap:",i[3],"\t Fraction:", i[1])

Mean Squared Error: 8.758999999999991 	 Number of Estimators: 10 	 Bootstrap: True 	 Fraction: 1.0
Mean Squared Error: 9.075852500000003 	 Number of Estimators: 50 	 Bootstrap: False 	 Fraction: 0.75
Mean Squared Error: 11.166206249999995 	 Number of Estimators: 10 	 Bootstrap: False 	 Fraction: 0.75
Mean Squared Error: 11.297455999999979 	 Number of Estimators: 100 	 Bootstrap: False 	 Fraction: 0.75
Mean Squared Error: 11.997474999999994 	 Number of Estimators: 10 	 Bootstrap: False 	 Fraction: 0.5
Mean Squared Error: 14.058440218749965 	 Number of Estimators: 200 	 Bootstrap: False 	 Fraction: 0.75
Mean Squared Error: 15.364267515625013 	 Number of Estimators: 200 	 Bootstrap: True 	 Fraction: 1.0
Mean Squared Error: 15.849339249999998 	 Number of Estimators: 50 	 Bootstrap: False 	 Fraction: 0.5
Mean Squared Error: 16.93525731249996 	 Number of Estimators: 100 	 Bootstrap: True 	 Fraction: 1.0
Mean Squared Error: 17.394356499999972 	 Number of Estimators: 50 	 Bootstrap: True 	 Fra

# Ada Boost

## Classifier

In [None]:
class AdaBoostClassifier:
    
    def __init__(self):
        self.alphas = []
        self.classifier = []
        self.num_of_iter = None
        self.training_errors = []
        self.prediction_errors = []
        
    def fit(self, X, y, num_of_iter):
        self.alphas = [] 
        self.training_errors = []
        self.num_of_iter = num_of_iter
        for m in range(0, num_of_iter):
            
            if m == 0:
                weights_i = np.ones(len(y)) * 1 / len(y)
            else:
                weights_i = self.update_weights(weights_i, alpha_m, y, y_pred)
            
            classifier = DecisionTreeClassifier(max_depth = 10)
            classifier.fit(X, y, sample_weight = weights_i)
            self.classifier.append(classifier)
            y_pred = classifier.predict(X)
            
            error_m = self.compute_error(y, y_pred, weights_i)
            self.training_errors.append(error_m)

            alpha_m = self.compute_alpha(error_m)
            self.alphas.append(alpha_m)

        # assert len(self.classifier) == len(self.alphas)
        
    def predict(self, X):
        weak_preds = pd.DataFrame(index = range(len(X)), columns = range(self.num_of_iter)) 

        for m in range(self.num_of_iter):
            y_pred_m = self.classifier[m].predict(X) * self.alphas[m]
            weak_preds.iloc[:,m] = y_pred_m

        y_pred = (1 * np.sign(weak_preds.T.sum())).astype(int)

        return y_pred

    def compute_error(self, y, y_pred, weights_i):
        return (sum(weights_i * (np.not_equal(y, y_pred)).astype(int)))/sum(weights_i)

    def compute_alpha(self, error):
        return np.log((1 - error) / error)

    def update_weights(self, weights_i, alpha, y, y_pred):
        return weights_i * np.exp(alpha * (np.not_equal(y, y_pred)).astype(int))
    

In [None]:
import time
import matplotlib.pyplot as plt

x_train,y_train,x_test,y_test,x_val,y_val = get_classifier_data()
y_train=y_train.tolist()
for i in range(len(y_test)):
    y_test[i]=y_test[i][0]
for i in range(len(y_train)):
    y_train[i]=y_train[i][0]
for i in range(len(y_val)):
    y_val[i]=y_val[i][0]

training_times = []
accuracy_values = []

num_estimators_list = [10, 50, 100, 200, 500]
for m in num_estimators_list:
    start_time = time.time()
    classifier = AdaBoostClassifier()
    classifier.fit(x_train, y_train, M = m)
    
    end_time = time.time()
    training_time = end_time - start_time
    training_times.append(training_time)

    y_pred = classifier.predict(x_test)
    score = accuracy_score(y_test, y_pred)
    accuracy_values.append(score)

plt.figure(figsize=(5, 3))
plt.plot(num_estimators_list, training_times, marker='o')
plt.title('Training Time vs Number of Estimators')
plt.xlabel('Number of Estimators')
plt.ylabel('Training Time (seconds)')

plt.figure(figsize=(5, 3))
plt.plot(num_estimators_list, accuracy_values, marker='o', color='r')
plt.title('Accuracy vs Number of Estimators')
plt.xlabel('Number of Estimators')
plt.ylabel('Accuracy')


## Regressor

In [None]:

class AdaBoostRegressor:
    
    def fit(self, X_train, y_train, total_iter, depth, random_state = None):
        
        self.y_train = y_train
        self.X_train = X_train
        self.total_iter = total_iter
        self.depth = depth
        self.N, self.D = X_train.shape
        self.weights = np.repeat(1/self.N, self.N)
        np.random.seed(random_state)
        
        self.trees = []    
        self.weak_weights = []
        self.fitted_values = np.empty((self.N, self.total_iter))
        for t in range(self.total_iter):
            
            bootstrap_indices = np.random.choice(np.arange(self.N), size = self.N, replace = True, p = self.weights)
            bootstrap_X = self.X_train[bootstrap_indices]
            bootstrap_y = self.y_train[bootstrap_indices]   
            
            tree = DecisionTreeRegressor(max_depth = depth)
            tree.fit(bootstrap_X, bootstrap_y)
            y_ = tree.predict(X_train)

            self.trees.append(tree)
            self.fitted_values[:,t] = y_
            
            abs_errors_t = np.abs(self.y_train - y_)
            max_abs_err = np.max(abs_errors_t)
            norm_errors = abs_errors_t/max_abs_err
            weighted_norm_err = np.sum(self.weights*norm_errors)

            if weighted_norm_err >= 0.5:
                self.T = t - 1
                self.fitted_values = self.fitted_values[:,:t-1]
                self.trees = self.trees[:t-1]
                continue
            
            weak_learner_weight = weighted_norm_err/(1 - weighted_norm_err)
            self.weak_weights.append(weak_learner_weight)

            norm_factor = np.sum(self.weights*weak_learner_weight**(1-norm_errors))

            self.weights *= weak_learner_weight**(1-norm_errors)/norm_factor
            
        self.model_weights = np.log(1/np.array(self.weak_weights))
        self.y_train_hat = np.array([self.weighted_median(self.fitted_values[n], self.model_weights) for n in range(self.N)])

    def weighted_median(self, values, weights):    
        sorted_indices = values.argsort()
        values = values[sorted_indices]
        weights = weights[sorted_indices]
        weights_cumulative_sum = weights.cumsum()
        median_weight = np.argmax(weights_cumulative_sum >= sum(weights)/2)
        return values[median_weight]

    def predict(self, X_test):
        N_test = len(X_test)
        fitted_values = np.empty((N_test, self.T))
        for t, tree in enumerate(self.trees):
            fitted_values[:,t] = tree.predict(X_test)
        return np.array([self.weighted_median(fitted_values[n], self.model_weights) for n in range(N_test)]) 
        
        

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

x_train,y_train,x_test,y_test,x_val,y_val = get_regressor_data()
x_train,y_train,x_test,y_test,x_val,y_val=np.array(x_train),np.array(y_train),np.array(x_test),np.array(y_test),np.array(x_val),np.array(y_val)

training_times=[]
accuracy_values=[]
num_estimators_list = [10, 50, 100, 200, 500]
for m in num_estimators_list:
    start_time = time.time()
    booster = AdaBoostRegressor()
    y_train=y_train.reshape(-1)
    booster.fit(x_train, y_train, T = m, stub_depth = 4, random_state = 123)

    end_time = time.time()
    training_time = end_time - start_time
    training_times.append(training_time)

    score=mean_squared_error(y_test, booster.predict(x_test))
    accuracy_values.append(score)

plt.figure(figsize=(5, 3))
plt.plot(num_estimators_list, training_times, marker='o')
plt.title('Training Time vs Number of Estimators')
plt.xlabel('Number of Estimators')
plt.ylabel('Training Time (seconds)')

plt.figure(figsize=(5, 3))
plt.plot(num_estimators_list, accuracy_values, marker='o', color='r')
plt.title('Mean Squared Error (MSE) vs Number of Estimators')
plt.xlabel('Number of Estimators')
plt.ylabel('Mean Squared Error')


# Gradient Boosted Decision Trees

## Classification

In [None]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np

class GradientBoostingClassifier:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=10):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        class_prob = np.bincount(y) / len(y)
        self.initial_prediction = np.log(class_prob[1] / class_prob[0])

        prediction = np.full(y.shape, self.initial_prediction)

        for i in range(self.n_estimators):
            class_prob = 1 / (1 + np.exp(-prediction))
            residuals = y - class_prob

            tree = DecisionTreeClassifier(max_depth=self.max_depth)
            threshold = 0.5
            class_labels = np.where(residuals > threshold, 1, 0)
            tree.fit(X, class_labels)
            tree_pred = tree.predict(X)
            prediction = prediction + self.learning_rate * tree_pred

            self.trees.append(tree)

    def predict(self, X):
        prediction = np.full(X.shape[0], self.initial_prediction)
        
        for tree in self.trees:
            tree_pred = tree.predict(X)
            prediction += self.learning_rate * tree_pred
        
        prob = 1 / (1 + np.exp(-prediction))
        return np.where(prob >= 0.5, 1, 0)


In [None]:
x_train,y_train,x_test,y_test,x_val,y_val = get_classifier_data()

# print(y_train)
y_train=y_train.tolist()
for i in range(len(y_test)):
    y_test[i]=y_test[i][0]
for i in range(len(y_train)):
    y_train[i]=y_train[i][0]
for i in range(len(y_val)):
    y_val[i]=y_val[i][0]
y_train=np.array(y_train)

training_times=[]
num_estimators_list = [10, 50, 100, 200, 500]
for num_estimators in num_estimators_list:
    start_time = time.time()
    gb_classifier = GradientBoostingClassifier(n_estimators=num_estimators, learning_rate=0.1)
    gb_classifier.fit(x_train, y_train)

    
    end_time = time.time()
    training_time = end_time - start_time
    training_times.append(training_time)

    predictions = gb_classifier.predict(x_test)
    score=accuracy_score(y_test, predictions)
    accuracy_values.append(score)

plt.figure(figsize=(5, 3))
plt.plot(num_estimators_list, training_times, marker='o')
plt.title('Training Time vs Number of Estimators')
plt.xlabel('Number of Estimators')
plt.ylabel('Training Time (seconds)')

plt.figure(figsize=(5, 3))
plt.plot(num_estimators_list, accuracy_values, marker='o', color='r')
plt.title('Accuracy vs Number of Estimators')
plt.xlabel('Number of Estimators')
plt.ylabel('Accuracy')


## Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np

class GradientBoostingRegressor:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        self.initial_prediction = np.mean(y)
        prediction = np.full(y.shape, self.initial_prediction)

        for i in range(self.n_estimators):
            residuals = y - prediction

            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residuals)

            tree_pred = tree.predict(X)

            tree_pred = np.reshape(tree_pred, prediction.shape)

            prediction = prediction + self.learning_rate * tree_pred

            self.trees.append(tree)

    def predict(self, X):
        prediction = np.full(X.shape[0], self.initial_prediction)
        for tree in self.trees:
            tree_pred = tree.predict(X)
            prediction += self.learning_rate * tree_pred
        
        return prediction


In [None]:
import time
import matplotlib.pyplot as plt

x_train, y_train, x_test, y_test, x_val, y_val = get_regressor_data()

training_times = []
mse_values = []

num_estimators_list = [10, 50, 100, 200, 500]
for num_estimators in num_estimators_list:
    start_time = time.time()
    gb_regressor = GradientBoostingRegressor(n_estimators=num_estimators, learning_rate=0.1, max_depth=3)
    
    gb_regressor.fit(x_train, y_train)
    
    end_time = time.time()
    training_time = end_time - start_time
    training_times.append(training_time)

    predictions = gb_regressor.predict(x_test)

    mse = mean_squared_error(y_test, predictions)
    mse_values.append(mse)

plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(num_estimators_list, training_times, marker='o')
plt.title('Training Time vs Number of Estimators')
plt.xlabel('Number of Estimators')
plt.ylabel('Training Time (seconds)')

plt.subplot(1, 2, 2)
plt.plot(num_estimators_list, mse_values, marker='o', color='r')
plt.title('Mean Squared Error (MSE) vs Number of Estimators')
plt.xlabel('Number of Estimators')
plt.ylabel('Mean Squared Error')