In [1]:
# Libraries
import pandas            as pd
import numpy             as np
import matplotlib.pyplot as plt
import warnings

from sklearn.metrics         import mean_squared_error
from sklearn.ensemble        import RandomForestRegressor
from sklearn.ensemble        import AdaBoostRegressor
from sklearn.tree            import DecisionTreeRegressor
from sklearn.linear_model    import LinearRegression 


warnings.filterwarnings('ignore')

In [2]:
# local file paths

dir_name = 'selezione'
region_names = np.array(['A', 'B', 'C'])

fp_Xtrain = []
fp_Xval   = []
fp_Xtest  = []
fp_ytrain = []
fp_yval   = []
fp_ytest  = []

for i in range(3):
    fp_Xtrain.append(dir_name + f'/X_train{region_names[i]}.csv')
    fp_Xval  .append(dir_name + f'/X_val{  region_names[i]}.csv')
    fp_Xtest .append(dir_name + f'/X_test{ region_names[i]}.csv')
    fp_ytrain.append(dir_name + f'/y_train{region_names[i]}.csv')
    fp_yval  .append(dir_name + f'/y_val{  region_names[i]}.csv')
    fp_ytest .append(dir_name + f'/y_test{ region_names[i]}.csv')

In [3]:
# Lettura dei dati

X_train = []
X_val   = []
X_test  = []
y_train = []
y_val   = []
y_test  = []

for i in range(3):
    X_train.append(pd.read_csv(fp_Xtrain[i], low_memory=False))
    X_val  .append(pd.read_csv(fp_Xval  [i], low_memory=False))        
    X_test .append(pd.read_csv(fp_Xtest [i], low_memory=False))
    y_train.append(pd.read_csv(fp_ytrain[i], low_memory=False))
    y_val  .append(pd.read_csv(fp_yval  [i], low_memory=False))        
    y_test .append(pd.read_csv(fp_ytest [i], low_memory=False))
    
X_train = np.array(X_train, dtype=object)
X_val   = np.array(X_val,   dtype=object)
X_test  = np.array(X_test,  dtype=object)
y_train = np.array(y_train, dtype=object)
y_val   = np.array(y_val,   dtype=object)
y_test  = np.array(y_test,  dtype=object)

In [4]:
def dimensionality(y=False):
    for i in range(3):
        print(f'X_train{region_names[i]}: {X_train[i].shape}')
        print(f'X_val{region_names[i]}:   {X_val  [i].shape}')
        print(f'X_test{region_names[i]}:  {X_test [i].shape}')
        if y:
            print(f'y_train{region_names[i]}: {y_train[i].shape}')
            print(f'y_val{region_names[i]}:   {y_val  [i].shape}')
            print(f'y_test{region_names[i]}:  {y_test [i].shape}')
            print()

In [5]:
dimensionality(y=True)

X_trainA: (26819, 41)
X_valA:   (9006, 41)
X_testA:  (9085, 41)
y_trainA: (26819, 1)
y_valA:   (9006, 1)
y_testA:  (9085, 1)

X_trainB: (8119, 34)
X_valB:   (2658, 34)
X_testB:  (2606, 34)
y_trainB: (8119, 1)
y_valB:   (2658, 1)
y_testB:  (2606, 1)

X_trainC: (64771, 48)
X_valC:   (21908, 48)
X_testC:  (21876, 48)
y_trainC: (64771, 1)
y_valC:   (21908, 1)
y_testC:  (21876, 1)



In [6]:
dt_model = np.array([
    DecisionTreeRegressor(max_leaf_nodes = 2),
    DecisionTreeRegressor(max_leaf_nodes = 2),
    DecisionTreeRegressor(max_leaf_nodes = 2)
])

In [7]:
boost_model = np.array([
    AdaBoostRegressor(dt_model[0], n_estimators = 45),
    AdaBoostRegressor(dt_model[1], n_estimators = 54),
    AdaBoostRegressor(dt_model[1], n_estimators = 37)
])

In [8]:
rf_model = np.array([
    RandomForestRegressor(n_estimators = 45, n_jobs = -1),
    RandomForestRegressor(n_estimators = 38, n_jobs = -1),
    RandomForestRegressor(n_estimators = 51, n_jobs = -1)
])

In [9]:
models = np.array([
    dt_model,
    boost_model,
    rf_model
])

In [29]:
def gen_Y(X, y, models, col_names, index):
    Y = pd.DataFrame()
    for i in range(len(col_names)):
        models[i][index].fit(X[index], y[index])
        Y[col_names[i]] = models[i][index].predict(X[index])
    return Y

In [104]:
def gen_arrY(X, y, models, col_names):
    Y = []
    for i in range(len(X)):
        Y.append(gen_Y(X, y, models, col_names, i))
    Y = np.array(Y, dtype=object)
    return Y

In [105]:
col_names = ['DecisionTree', 'Boosting', 'RandomForest']

In [106]:
Y_train = gen_arrY(X_train, y_train, models, col_names)
Y_test  = gen_arrY(X_test,  y_test,  models, col_names)

In [109]:
def dimensionalityY():
    for i in range(3):
        print(region_names[i])
        print(f'X_train{region_names[i]}: {X_train[i].shape}')
        print(f'Y_train{region_names[i]}: {Y_train[i].shape}')
        print(f'y_train{region_names[i]}: {y_train[i].shape}')
        print()
        print(f'X_test{region_names[i]}: {X_test[i].shape}')
        print(f'Y_test{region_names[i]}: {Y_test[i].shape}')
        print(f'y_test{region_names[i]}: {y_test[i].shape}')
        print()

In [110]:
dimensionalityY()

A
X_trainA: (26819, 41)
Y_trainA: (26819, 3)
y_trainA: (26819, 1)

X_testA: (9085, 41)
Y_testA: (9085, 3)
y_testA: (9085, 1)

B
X_trainB: (8119, 34)
Y_trainB: (8119, 3)
y_trainB: (8119, 1)

X_testB: (2606, 34)
Y_testB: (2606, 3)
y_testB: (2606, 1)

C
X_trainC: (64771, 48)
Y_trainC: (64771, 3)
y_trainC: (64771, 1)

X_testC: (21876, 48)
Y_testC: (21876, 3)
y_testC: (21876, 1)



In [154]:
def linear_regression_preds(Y_train, y_train, Y_test):
    preds = []
    for Y_trn, y_trn, Y_tst in zip(Y_train, y_train, Y_test):
        preds.append(
            LinearRegression(fit_intercept=True).fit(Y_trn, y_trn).predict(Y_tst)
        )
    preds = np.array(preds, dtype = object)
    return preds

In [155]:
y_preds = linear_regression_preds(Y_train, y_train, Y_test)

In [160]:
for i in range(3):
    print(region_names[i])
    print(y_preds[i].shape)
    print(y_test[i].shape)
    print()

A
(9085, 1)
(9085, 1)

B
(2606, 1)
(2606, 1)

C
(21876, 1)
(21876, 1)



In [164]:
def get_mse(y_trues, y_preds):
    mse = []
    for y_true, y_pred in zip(y_trues, y_preds):
        mse.append(mean_squared_error(y_true, y_pred))
    mse = np.array(mse, dtype = object)
    return mse

In [166]:
mse = get_mse(y_test, y_preds)

In [167]:
mse

array([0.0031581082324556573, 0.002673794167959957, 0.0018657396656370543],
      dtype=object)

In [169]:
for i in range(3):
    print(region_names[i])
    print(mse[i])    
    print()    

A
0.0031581082324556573

B
0.002673794167959957

C
0.0018657396656370543



In [254]:
def best_prediction(X, y_trues, y_preds, index, nrow=10):
    err  = abs(y_trues[index] - y_preds[index])
    err  = err.values.ravel()
    args = err.argsort()
    for i in range(nrow):
        print(f'Row: {args[i]}, error: {err[args[i]]}')
    return X[index].iloc[args[:nrow]].transpose()

In [255]:
best_prediction(X_test, y_test, y_preds, 0, nrow=5)

Row: 1608, error: 5.44889575913865e-06
Row: 3890, error: 7.6305282379098e-06
Row: 387, error: 1.0708820749050219e-05
Row: 7390, error: 1.6395960858187064e-05
Row: 5022, error: 2.0358383663155767e-05


Unnamed: 0,1608,3890,387,7390,5022
bathroomcnt,2.0,3.0,2.5,2.5,1.0
bedroomcnt,4.0,3.0,3.0,2.0,2.0
calculatedbathnbr,2.0,3.0,2.5,2.5,1.0
calculatedfinishedsquarefeet,2049.0,2152.0,1250.0,1322.0,952.0
finishedsquarefeet12,2049.0,2152.0,1250.0,1322.0,952.0
fireplacecnt,0.0,0.0,0.0,0.0,0.0
latitude,33780710.0,33695910.0,33858360.0,33653900.0,33534120.0
longitude,117837200.0,117745400.0,117728400.0,117947000.0,117666900.0
lotsizesquarefeet,15000.0,7205.0,7205.0,7205.0,3300.0
rawcensustractandblock,60590760.0,60590520.0,60590220.0,60590640.0,60590320.0


In [256]:
best_prediction(X_test, y_test, y_preds, 1, nrow=5)

Row: 1892, error: 1.2318134620020738e-05
Row: 1238, error: 2.119314641759948e-05
Row: 694, error: 4.127586140760531e-05
Row: 1237, error: 5.0205249543973374e-05
Row: 670, error: 5.541832156980009e-05


Unnamed: 0,1892,1238,694,1237,670
bathroomcnt,2.5,2.0,2.5,1.5,2.0
bedroomcnt,3.0,3.0,3.0,3.0,3.0
calculatedbathnbr,2.5,2.0,2.5,1.5,2.0
calculatedfinishedsquarefeet,1547.0,1486.0,1803.0,1403.0,1334.0
finishedsquarefeet12,1547.0,1486.0,1803.0,1403.0,1334.0
fireplacecnt,1.0,1.0,1.0,0.0,1.0
latitude,34265360.0,34218780.0,34271420.0,34164630.0,34303740.0
longitude,118898900.0,118844900.0,118893300.0,118846400.0,118845200.0
lotsizesquarefeet,7205.0,10446.0,5224.0,23900.0,5874.0
rawcensustractandblock,61110080.0,61110080.0,61110080.0,61110060.0,61110080.0


In [257]:
best_prediction(X_test, y_test, y_preds, 2, nrow=5)

Row: 14256, error: 4.7254731469015206e-06
Row: 14079, error: 1.0073447930969426e-05
Row: 1982, error: 1.0692663301468952e-05
Row: 8856, error: 1.1020492149863004e-05
Row: 5023, error: 1.2599463038345804e-05


Unnamed: 0,14256,14079,1982,8856,5023
bathroomcnt,3.0,2.0,2.0,3.0,2.0
bedroomcnt,4.0,3.0,3.0,3.0,3.0
buildingqualitytypeid,8.0,6.0,4.0,8.0,6.0
calculatedbathnbr,3.0,2.0,2.0,3.0,2.0
calculatedfinishedsquarefeet,1884.0,1204.0,1070.0,1374.0,1287.0
finishedsquarefeet12,1884.0,1204.0,1070.0,1374.0,1287.0
latitude,34232310.0,34197250.0,33799000.0,33819470.0,34146180.0
longitude,118605400.0,118570400.0,118302000.0,118321200.0,118377700.0
lotsizesquarefeet,12315.0,8086.0,7205.0,40504.0,6500.0
rawcensustractandblock,60371130.0,60371350.0,60372930.0,60376510.0,60371440.0


In [260]:
def worst_prediction(X, y_trues, y_preds, index, nrow=10):
    err  = abs(y_trues[index] - y_preds[index])
    err  = err.values.ravel()
    args = err.argsort()[::-1]
    for i in range(nrow):
        print(f'Row: {args[i]}, error: {err[args[i]]}')
    return X[index].iloc[args[:nrow]].transpose()

In [261]:
worst_prediction(X_test, y_test, y_preds, 0, nrow=5)

Row: 5541, error: 1.04798431783384
Row: 8089, error: 0.9993323882959884
Row: 4406, error: 0.8565590125797439
Row: 2918, error: 0.8271780562889401
Row: 5097, error: 0.8135126414412008


Unnamed: 0,5541,8089,4406,2918,5097
bathroomcnt,2.0,1.5,2.0,1.0,1.0
bedroomcnt,3.0,3.0,3.0,3.0,4.0
calculatedbathnbr,2.0,1.5,2.0,1.0,1.0
calculatedfinishedsquarefeet,1250.0,1902.0,1260.0,1674.0,1256.0
finishedsquarefeet12,1524.0,1902.0,1260.0,1524.0,1524.0
fireplacecnt,0.0,0.0,0.0,0.0,0.0
latitude,33835660.0,33800820.0,33601900.0,33769070.0,33828550.0
longitude,117909400.0,117850800.0,117901000.0,117865800.0,117948400.0
lotsizesquarefeet,7307.0,6650.0,7205.0,12688.0,12544.0
rawcensustractandblock,60590870.0,60590760.0,60590630.0,60590760.0,60590870.0


In [262]:
worst_prediction(X_test, y_test, y_preds, 1, nrow=5)

Row: 2221, error: 0.46231316012917323
Row: 1393, error: 0.4594084165262182
Row: 298, error: 0.410917146456766
Row: 1109, error: 0.3590955388180627
Row: 2001, error: 0.32434329707233567


Unnamed: 0,2221,1393,298,1109,2001
bathroomcnt,2.5,2.0,2.0,1.0,4.0
bedroomcnt,3.0,5.0,3.0,1.0,4.0
calculatedbathnbr,2.5,2.0,2.0,1.0,4.0
calculatedfinishedsquarefeet,1605.0,1860.0,1120.0,600.0,3758.0
finishedsquarefeet12,1605.0,1860.0,1120.0,600.0,3758.0
fireplacecnt,1.0,1.0,0.0,1.0,2.0
latitude,34253480.0,34228590.0,34276520.0,34160700.0,34136560.0
longitude,118802600.0,119191900.0,118746700.0,119225500.0,118885600.0
lotsizesquarefeet,7205.0,6534.0,7300.0,2625.0,7205.0
rawcensustractandblock,61110080.0,61110030.0,61110080.0,61110040.0,61110070.0


In [263]:
worst_prediction(X_test, y_test, y_preds, 2, nrow=5)

Row: 18590, error: 0.9289787791126285
Row: 4291, error: 0.7267326722937435
Row: 6614, error: 0.5566632198702617
Row: 5840, error: 0.5299377073740877
Row: 20980, error: 0.4795486018804156


Unnamed: 0,18590,4291,6614,5840,20980
bathroomcnt,2.0,2.0,1.0,3.0,3.0
bedroomcnt,3.0,2.0,1.0,3.0,4.0
buildingqualitytypeid,7.0,4.0,7.0,4.0,4.0
calculatedbathnbr,2.0,2.0,1.0,3.0,3.0
calculatedfinishedsquarefeet,1417.0,1824.0,640.0,1782.0,3098.0
finishedsquarefeet12,1417.0,1524.0,640.0,1782.0,3098.0
latitude,34018710.0,33948900.0,34183500.0,34061190.0,34118260.0
longitude,118419200.0,118254000.0,118596000.0,118740700.0,118101000.0
lotsizesquarefeet,6344.0,2491.0,134581.0,10123.0,10462.0
rawcensustractandblock,60372720.0,60372410.0,60371350.0,60378000.0,60374640.0
