In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load the Boston housing data through scikit-learn's dataset helpers.
# NOTE(review): load_boston is not available in recent scikit-learn releases —
# confirm the pinned version before relying on a fresh environment.
b = datasets.load_boston()

# Build the feature table with its column names in a single step.
bos = pd.DataFrame(b.data, columns=b.feature_names)

# Take the feature-only frame *before* PRICE is attached, so X never
# contains the target column.
X = bos[list(bos.columns)]
bos["PRICE"] = b.target
y = bos["PRICE"]

bos.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Linear Regression

In [4]:
from sklearn.linear_model import LinearRegression

# Baseline: linear regression on all features. fit() returns the estimator,
# so construction and fitting can be chained.
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [5]:
# Score the current predictions against the held-out targets.
r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

print(f"R^2: {r2}", f"MSE: {mse}", sep="\n")

R^2: 0.635463843320211
MSE: 29.78224509230252


# Support Vector Regression (linear kernel)

In [6]:
from sklearn.svm import SVR

# Support vector regressor with a linear kernel, trained on all features.
svr = SVR(kernel="linear").fit(X_train, y_train)
y_pred = svr.predict(X_test)

In [7]:
# Same two metrics as for the previous model, computed on the new y_pred.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for line in (f"R^2: {r2}", f"MSE: {mse}"):
    print(line)

R^2: 0.5635479105806482
MSE: 35.657705991432444


# XGBoost

In [8]:
import xgboost as xgb

# Wrap both splits in XGBoost's DMatrix container (labels attached).
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# An empty parameter dict leaves every booster setting at the library default.
bst = xgb.train(params={}, dtrain=dtrain)

# Predict on the held-out split.
y_pred = bst.predict(dtest)

In [9]:
# Evaluate the booster's predictions on the held-out targets.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("\n".join([f"R^2: {r2}", f"MSE: {mse}"]))

R^2: 0.7235488865082431
MSE: 22.585783789023868


# Combinations: exhaustive search over feature subsets

In [10]:
reg = LinearRegression()


def linear_regression(X_train, X_test, y_train, y_test):
    """Fit the shared ``reg`` estimator and score it on the test split.

    The module-level ``reg`` is refit in place on every call, so after the
    call it holds the model for the most recently supplied training data.

    Returns:
        tuple: ``(r2, mse)`` computed from the predictions on ``X_test``.
    """
    predictions = reg.fit(X_train, y_train).predict(X_test)
    return r2_score(y_test, predictions), mean_squared_error(y_test, predictions)

In [11]:
svr = SVR(kernel="linear")


def svr_(X_train, X_test, y_train, y_test):
    """Fit the shared linear-kernel ``svr`` estimator and score it on the test split.

    The module-level ``svr`` is refit in place on every call.

    Returns:
        tuple: ``(r2, mse)`` computed from the predictions on ``X_test``.
    """
    predictions = svr.fit(X_train, y_train).predict(X_test)
    score = r2_score(y_test, predictions)
    error = mean_squared_error(y_test, predictions)
    return score, error

In [12]:
import xgboost as xgb

def xgb_(X_train, X_test, y_train, y_test):
    """Train an XGBoost booster with an empty parameter dict and score it.

    A new booster is trained on every call; nothing is shared between calls.

    Returns:
        tuple: ``(r2, mse)`` computed from the predictions on ``X_test``.
    """
    booster = xgb.train({}, xgb.DMatrix(X_train, label=y_train))
    predictions = booster.predict(xgb.DMatrix(X_test, label=y_test))
    return r2_score(y_test, predictions), mean_squared_error(y_test, predictions)

In [13]:
from itertools import combinations

# Parallel lists: entry i of each list belongs to the same feature subset.
all_combs = []
all_X_train = []
all_X_test = []
all_y_train = []
all_y_test = []

# Enumerate every non-empty subset of the feature columns, smallest first.
# The upper bound is derived from X instead of being hard-coded, so the
# search still covers all subsets if the feature set changes.
for n in range(1, len(X.columns) + 1):
    # combinations() is already lazy; there is no need to materialise each
    # size-n batch into a list before looping over it.
    for combs in combinations(X.columns, n):
        lst_combs = list(combs)
        all_combs.append(lst_combs)
        X_feat = bos[lst_combs]
        # Same test_size and random_state as the full-feature split, so every
        # subset is evaluated on the same rows.
        X_train, X_test, y_train, y_test = train_test_split(X_feat, y, test_size=0.25, random_state=0)
        all_X_train.append(X_train)
        all_X_test.append(X_test)
        all_y_train.append(y_train)
        all_y_test.append(y_test)

# Materialise the zip as a list: a bare zip() is a single-pass iterator, and
# zip_data is looped over by more than one later cell. Left as an iterator,
# the second loop would silently see no data.
zip_data = list(zip(all_combs, all_X_train, all_X_test, all_y_train, all_y_test))
print("zip done")

zip done


In [14]:
# Each tracker is [best_score, feature_list].
# R^2 is not bounded below by 0 (a model can do worse than predicting the
# mean), so the "best so far" sentinel must be -inf; starting at 0 would
# leave the feature list empty whenever every subset scores negative.
max_r2_features_lr = [float("-inf"), []]
min_mse_features_lr = [float("inf"), []]

max_r2_features_xgb = [float("-inf"), []]
min_mse_features_xgb = [float("inf"), []]

# Size of the subsets currently being processed; used only for progress output.
curr_len = 0

for combs, X_train, X_test, y_train, y_test in zip_data:
    # Subsets arrive ordered by size, so a longer one marks the next stage.
    if len(combs) > curr_len:
        curr_len = len(combs)
        print(f"working on n={curr_len}")

    # linear regression
    r2_lr, mse_lr = linear_regression(X_train, X_test, y_train, y_test)
    if r2_lr > max_r2_features_lr[0]:
        max_r2_features_lr[0] = r2_lr
        max_r2_features_lr[1] = combs

    if mse_lr < min_mse_features_lr[0]:
        min_mse_features_lr[0] = mse_lr
        min_mse_features_lr[1] = combs

    # xgboost
    r2_xgb, mse_xgb = xgb_(X_train, X_test, y_train, y_test)
    if r2_xgb > max_r2_features_xgb[0]:
        max_r2_features_xgb[0] = r2_xgb
        max_r2_features_xgb[1] = combs

    if mse_xgb < min_mse_features_xgb[0]:
        min_mse_features_xgb[0] = mse_xgb
        min_mse_features_xgb[1] = combs

working on n=1
working on n=2
working on n=3
working on n=4
working on n=5
working on n=6
working on n=7
working on n=8
working on n=9
working on n=10
working on n=11
working on n=12
working on n=13


In [15]:
# Report the best linear-regression subsets; the empty string yields the
# blank separator line between the two results.
print(
    "LINEAR REGRESSION",
    f"MAX R^2: {max_r2_features_lr[0]} \n{max_r2_features_lr[1]}",
    "",
    f"MIN MSE: {min_mse_features_lr[0]} \n{min_mse_features_lr[1]}",
    sep="\n",
)

LINEAR REGRESSION
MAX R^2: 0.6369666437456338 
['CRIM', 'ZN', 'CHAS', 'NOX', 'RM', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

MIN MSE: 29.65946777714562 
['CRIM', 'ZN', 'CHAS', 'NOX', 'RM', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [16]:
# Report the best XGBoost subsets; the empty string yields the blank
# separator line between the two results.
print(
    "XGBOOST",
    f"MAX R^2: {max_r2_features_xgb[0]} \n{max_r2_features_xgb[1]}",
    "",
    f"MIN MSE: {min_mse_features_xgb[0]} \n{min_mse_features_xgb[1]}",
    sep="\n",
)

XGBOOST
MAX R^2: 0.774975818716418 
['CRIM', 'NOX', 'RM', 'TAX', 'LSTAT']

MIN MSE: 18.38425406061762 
['CRIM', 'NOX', 'RM', 'TAX', 'LSTAT']


In [None]:
# Each tracker is [best_score, feature_list].
# R^2 can be negative, so the running maximum starts at -inf rather than 0;
# otherwise a search where every subset scores below 0 reports no features.
max_r2_features_svr = [float("-inf"), []]
min_mse_features_svr = [float("inf"), []]

# Size of the subsets currently being processed; used only for progress output.
curr_len = 0

# Build a fresh iterator from the stored lists rather than reusing `zip_data`:
# a plain zip() iterator is exhausted after one pass, so if an earlier cell
# has already looped over it this loop would silently do nothing.
svr_inputs = zip(all_combs, all_X_train, all_X_test, all_y_train, all_y_test)

for combs, X_train, X_test, y_train, y_test in svr_inputs:
    # Subsets arrive ordered by size, so a longer one marks the next stage.
    if len(combs) > curr_len:
        curr_len = len(combs)
        print(f"working on n={curr_len}")

    # svr
    r2_svr, mse_svr = svr_(X_train, X_test, y_train, y_test)
    if r2_svr > max_r2_features_svr[0]:
        max_r2_features_svr[0] = r2_svr
        max_r2_features_svr[1] = combs

    if mse_svr < min_mse_features_svr[0]:
        min_mse_features_svr[0] = mse_svr
        min_mse_features_svr[1] = combs

working on n=1
working on n=2
working on n=3


In [None]:
# Report the best SVR subsets; the empty string yields the blank separator
# line between the two results.
print(
    "SVR",
    f"MAX R^2: {max_r2_features_svr[0]} \n{max_r2_features_svr[1]}",
    "",
    f"MIN MSE: {min_mse_features_svr[0]} \n{min_mse_features_svr[1]}",
    sep="\n",
)