In [4]:


import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
pd.set_option('display.max_rows', 500)


In [5]:
fields = ["fixed acidity","volatile acidity","citric acid","residual sugar","chlorides","free sulfur dioxide","total sulfur dioxide","density","pH","sulphates","alcohol","quality"]
data_wine = pd.read_csv(r"winequality-red.csv", sep=";")
print(len(data_wine.index))
data_wine.head()

data_cars = pd.read_csv("auto-mpg.csv")
data_cars.head()

def prepare_data(which='Wine'):
    if which == "Wine":
        ## split in test and train set
        mask = np.random.rand(len(data_wine)) < 0.8
        X_train = data_wine[mask]
        X_test = data_wine[~mask]

        y_train = pd.Series(X_train['fixed acidity'])
        y_test = pd.Series(X_test['fixed acidity'])

        X_train = X_train.drop(columns=['fixed acidity'])
        X_test = X_test.drop(columns=['fixed acidity'])

        print("X train:",len(X_train)," Y train:",len(y_train)," X Test: ",len(X_test)," Y test: ",len(y_test))

        return X_train,y_train,X_test,y_test
    
    elif which == "Cars":
        ## split in test and train set
        mask = np.random.rand(len(data_cars)) < 0.8
        X_train = data_cars[mask]
        X_test = data_cars[~mask]

        y_train = pd.Series(X_train['mpg'])
        y_test = pd.Series(X_test['mpg'])

        ## drop everything but horsepower and weight

        X_train = X_train.drop(columns=["cylinders", "displacement","acceleration","model","year","origin","car name"])
        X_test = X_test.drop(columns=["cylinders", "displacement","acceleration","model","year","origin","car name"])

        return X_train,y_train,X_test,y_test
    

1599


In [6]:


def prepare_dataset():
    boston = load_boston()
    print(type(boston))
    X_y = np.column_stack([boston['data'], boston['target']])
    np.random.seed(1)
    np.random.shuffle(X_y)
    X, y = X_y[:,:-1], X_y[:,-1]
    X_train, y_train, X_test, y_test = X[:400], y[:400], X[400:], y[400:]
    X_train = pd.DataFrame(X_train, columns=boston['feature_names'])
    X_test = pd.DataFrame(X_test, columns=boston['feature_names'])
    y_train = pd.Series(y_train, name='House Price')
    y_test = pd.Series(y_test, name='House Price')
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = prepare_data()
X_train.head()
# print(y_train)



X train: 1255  Y train: 1255  X Test:  344  Y test:  344


Unnamed: 0,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5


In [7]:
def rss(y_left, y_right):
    def squared_residual_sum(y):
        return np.sum((y - np.mean(y)) ** 2)
    
    return squared_residual_sum(y_left) + squared_residual_sum(y_right) 


def compute_rss_by_threshold(feature):
    features_rss = []
    thresholds = X_train[feature].unique().tolist()
    thresholds.sort()
    thresholds = thresholds[1:]
    for t in thresholds:
        y_left_ix = X_train[feature] < t
        y_left, y_right = y_train[y_left_ix], y_train[~y_left_ix]
        features_rss.append(rss(y_left, y_right))
    return thresholds, features_rss




def find_best_rule(X_train, y_train):
    best_feature, best_threshold, min_rss = None, None, np.inf
    for feature in X_train.columns:
        thresholds = X_train[feature].unique().tolist()
        thresholds.sort()
        thresholds = thresholds[1:]
        for t in thresholds:
            y_left_ix = X_train[feature] < t
            y_left, y_right = y_train[y_left_ix], y_train[~y_left_ix]
            t_rss = rss(y_left, y_right)
            if t_rss < min_rss:
                min_rss = t_rss
                best_threshold = t
                best_feature = feature
    
    return {'feature': best_feature, 'threshold': best_threshold}




def split(X_train, y_train, depth, max_depth):
    if depth == max_depth or len(X_train) < 2:
        return {'prediction': np.mean(y_train)}
    
    rule = find_best_rule(X_train, y_train)
    left_ix = X_train[rule['feature']] < rule['threshold']
    rule['left'] = split(X_train[left_ix], y_train[left_ix], depth + 1, max_depth)
    rule['right'] = split(X_train[~left_ix], y_train[~left_ix], depth + 1, max_depth)
    return rule




def predict(sample, rules):
    prediction = None
    while prediction is None:
        feature, threshold = rules['feature'], rules['threshold']
        if sample[feature] < threshold:
            rules = rules['left']
        else:
            rules = rules['right']
        prediction = rules.get('prediction', None)
    return prediction


def evaluate(X, y,rules):
    preds = X.apply(predict, axis='columns', rules=rules.copy())
    return r2_score(preds, y)

In [9]:


X_train, y_train, X_test, y_test = prepare_data()
for max_depth in range(3, 5):
    rules = split(X_train, y_train, 0, max_depth)
    train_r2 = evaluate(X_train, y_train,rules)
    test_r2 = evaluate(X_test, y_test,rules)
    print('Max Depth', max_depth, 'Training R2:', train_r2, 'Test R2:',test_r2)



X train: 1260  Y train: 1260  X Test:  339  Y test:  339
Max Depth 3 Training R2: 0.5789557561712936 Test R2: 0.5010734025981229
Max Depth 4 Training R2: 0.7031712001344493 Test R2: 0.6148629727585371
