In [15]:
import os
import json
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import os
import pandas as pd

import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.metrics import r2_score
from utils import MLPRegressor

In [16]:
def RMSE(a,b):
    a = np.array(a)
    b = np.array(b)
    if a.shape != b.shape:
        raise ValueError('RMSE input error')
    return np.mean((a-b)**2)**0.5


def RMSE_woo(a,b,threshold=20):
    a = np.array(a)
    b = np.array(b)
    if a.shape != b.shape:
        raise ValueError('RMSE input error')
    std = RMSE(a,b)
    outlier_flag = (np.abs(a-b) > std*threshold)
    num_outlier = np.sum(outlier_flag)
    
    return RMSE(a[~outlier_flag],b[~outlier_flag]), num_outlier
    

class LoadData(torch.utils.data.Dataset):

    def __init__(self, X, y, scale_data=True):
        if not torch.is_tensor(X) and not torch.is_tensor(y):
          # Apply scaling if necessary
            if scale_data:
                X = StandardScaler().fit_transform(X)
            self.X = torch.from_numpy(X)
            self.y = torch.from_numpy(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]
    
class MLP(torch.nn.Module):
    def __init__(self, num_features, seed = 123):
        torch.manual_seed(seed)

        super().__init__()
        self.linear1 = torch.nn.Linear(num_features, 64)
        self.linear2 = torch.nn.Linear(64, 32)
        self.linear3 = torch.nn.Linear(32, 1)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        out = self.linear1(x.float())
        out = self.relu(out)
        out = self.linear2(out)
        out = self.relu(out)
        out = self.linear3(out)
        return out
    
def ann(data_json, X_train, y_train, X_test, y_test, p, lr = 1e-4, num_epochs = 20):
    traindata = LoadData(X_train, y_train)
    trainloader = DataLoader(traindata, batch_size=10, shuffle=True)
    # Initialize the MLP
    mlp = MLP(p)

    # Define the loss function and optimizer
    loss_function = nn.MSELoss()
    optimizer = torch.optim.Adam(mlp.parameters(), lr=lr)
        
    mlp.train()
    # Run the training loop
    for epoch in range(0, num_epochs): # 5 epochs at maximum

        # Print epoch
        # print(f'Starting epoch {epoch+1}')

        # Set current loss value
        current_loss = 0.0

        # Iterate over the DataLoader for training data
        for i, data in enumerate(trainloader, 0):

            # Get and prepare inputs
            inputs, targets = data
            inputs, targets = inputs.float(), targets.float()
            targets = targets.reshape((targets.shape[0], 1))

            # Zero the gradients
            optimizer.zero_grad()

            # Perform forward pass
            outputs = mlp(inputs)

            # Compute loss
            loss = loss_function(outputs, targets)

            # Perform backward pass
            loss.backward()

            # Perform optimization
            optimizer.step()

            # Print statistics
            current_loss += loss.item()
            if i % 10 == 0:
                # print('Loss after mini-batch %5d: %.3f' % (i + 1, current_loss / 500))
                current_loss = 0.0

        # Process is complete.
    #print('Training process has finished.')

    mlp.eval()
    testdata = LoadData(X_test, y_test)
    testloader = DataLoader(testdata, batch_size=10, shuffle=True)
    inputs, targets = torch.tensor(X_test), torch.tensor(y_test)
    targets = targets.reshape((targets.shape[0], 1))
    # Perform forward pass
    outputs = mlp(inputs)

    # Compute loss
    loss = loss_function(outputs, targets)

    # data_json['ann_test_y'] = outputs.detach().numpy().reshape(-1).tolist()
    # if 'grid_x' in data_json.keys():
    #     inputs = torch.tensor(data_json['grid_x'])
    #     data_json['ann_grid_y'] = mlp(inputs).detach().numpy().reshape(-1).tolist()
        
    return loss.item() ** .5
    

## Medical Insurance

In [25]:
data_foler = './group_1/medical_insurance'
data_prefix = "insurance_"
testing_df = pd.read_csv(os.path.join(data_foler,data_prefix+"test.csv"), sep=",")
y_test = np.array(testing_df.iloc[:, -1])
print('size:',len(y_test))
print("denominator:",np.sum(y_test**2)**0.5)

size: 201
denominator: 269232.84699705313


In [19]:
data_foler = './group_1/medical_insurance'
Knn_k = 5

task_name = 'medical_insurace_regression'
data_prefix = "insurance_"
training_csv_list = [os.path.join(data_foler,data_prefix+"train_20_num.csv"),os.path.join(data_foler,data_prefix+"train_40_num.csv"),
                    os.path.join(data_foler,data_prefix+"train_60_num.csv"),os.path.join(data_foler,data_prefix+"train_80_num.csv"),
                    os.path.join(data_foler,data_prefix+"train_full_num.csv")]
testing_df = pd.read_csv(os.path.join(data_foler,data_prefix+"test_num.csv"), sep=",")

baseline_models = ['linear','poly','knn','nn']

all_RAE = []
all_R2 = []
all_RMSE = []

for training_file in training_csv_list:
    print('-------------------------------------------')
    print("file path: ",training_file)
    training_df = pd.read_csv(training_file, sep=",")
    num_cols = training_df.shape[1]
    X_train = np.array(training_df.iloc[:, 0:num_cols-1])

    y_train = np.array(training_df.iloc[:, -1])

    X_test= np.array(testing_df.iloc[:, 0:num_cols-1])
    y_test = np.array(testing_df.iloc[:, -1])

    model_RAE = []
    model_R2 = []
    model_RMSE = []
    for model_name in baseline_models:
        if model_name == "linear":
            model = linear_model.LinearRegression()
        elif model_name == "poly":
            model = linear_model.LinearRegression()
        elif model_name == "knn":
            model = KNeighborsRegressor(n_neighbors=Knn_k)
        elif model_name == "nn":
            model = MLPRegressor(random_state=1, max_iter=1000, hidden_layer_sizes = (50,50,50))
        model.fit(X_train, y_train)
        pred_y = model.predict(X_test)
        rmse = RMSE(y_test,pred_y)
        r2 = r2_score(y_test,pred_y)
        rae = np.sum((y_test-pred_y)**2)**0.5 / np.sum((y_test)**2)**0.5
        model_RAE.append(rae)
        model_R2.append(r2)
        model_RMSE.append(rmse)
        print(model_name,"RAE",rae)
        print(model_name,"R2 Score",r2)
        print(model_name,"RMSE",rmse)
    all_RAE.append(model_RAE)
    all_R2.append(model_R2)
    all_RMSE.append(model_RMSE)

all_RAE = pd.DataFrame(all_RAE,training_csv_list,baseline_models)
all_RAE.to_csv(os.path.join(data_foler,"baselines_RAE.csv"))
all_R2 = pd.DataFrame(all_R2,training_csv_list,baseline_models)
all_R2.to_csv(os.path.join(data_foler,"baselines_R2.csv"))
all_RMSE = pd.DataFrame(all_RMSE,training_csv_list,baseline_models)
all_RMSE.to_csv(os.path.join(data_foler,"baselines_RMSE.csv"))
    # # linear regression
    # linear_reg = linear_model.LinearRegression()
    # linear_reg.fit(X_train, y_train)
    # lr_test_y = linear_reg.predict(X_test)
    # lr_rmse = RMSE(y_test,lr_test_y)
    # lr_r2_score = r2_score(y_test,lr_test_y)
    # lr_rae = np.sum((y_test-lr_test_y)**2)**0.5 / np.sum((y_test)**2)**0.5
    # print("Linear Regression RAE",lr_rae)
    # print("Linear Regression R2 Score",lr_r2_score)
    

    # # lr_mse_test = RMSE(y_test,lr_test_y) 
    # print("Linear Regression RMSE",lr_rmse)

    # # poly regression
    # poly = PolynomialFeatures(2)
    # X_poly_train = poly.fit_transform(X_train)

    # poly_reg = linear_model.LinearRegression()
    # poly_reg.fit(X_poly_train, y_train)
    # poly_test_y = poly_reg.predict(poly.fit_transform(X_test))
    # poly_rmse = RMSE(y_test,poly_test_y)
    # poly_r2_score = r2_score(y_test,poly_test_y)
    # ploy_rae = np.sum((y_test-lr_test_y)**2)**0.5 / np.sum((y_test)**2)**0.5
    # print("Poly Regression RAE",ploy_rae)
    # print("Poly Regression R2 Score",poly_r2_score)
    # print("Poly Regression RMSE",poly_rmse)

    # # KNN regression
    # knn_reg = KNeighborsRegressor(n_neighbors=Knn_k)
    # knn_reg.fit(X_train, y_train)
    # knn_test_y = knn_reg.predict(X_test)
    # knn_rmse = RMSE(y_test,knn_test_y)
    # knn_r2_score = r2_score(y_test,knn_test_y)
    # knn_rae = np.sum((y_test-lr_test_y)**2)**0.5 / np.sum((y_test)**2)**0.5
    # print("Poly Regression RAE",ploy_rae)
    # print("KNN Regression R2 Score",knn_r2_score)
    # print("KNN RMSE",knn_rmse)
        
    # # ann 
    # # ann_rmse = ann(None, X_train, y_train, X_test, y_test, num_cols-1, lr = 1e-4, num_epochs = 20)
    # nn_reg = MLPRegressor(random_state=1, max_iter=1000, hidden_layer_sizes = (50,50,50))
    # nn_reg.fit(X_train, y_train)
    # nn_test_y = nn_reg.predict(X_test)

    # nn_r2_score = r2_score(y_test,nn_test_y)
    # nn_rmse = RMSE(y_test,nn_test_y)
    # print("NN Regression R2 Score",nn_r2_score)
    # print("NN RMSE",nn_rmse)

-------------------------------------------
file path:  ./group_1/medical_insurance\insurance_train_20_num.csv
linear RAE 0.33603434949547883
linear R2 Score 0.7534510961468066
linear RMSE 6143.833772307208
poly RAE 0.33603434949547883
poly R2 Score 0.7534510961468066
poly RMSE 6143.833772307208
knn RAE 0.6909987491549627
knn R2 Score -0.042534058435718114
knn RMSE 12633.772285643721




nn RAE 0.9184083591587624
nn R2 Score -0.8416512207343847
nn RMSE 16791.58187917563
-------------------------------------------
file path:  ./group_1/medical_insurance\insurance_train_40_num.csv
linear RAE 0.3395259695024923
linear R2 Score 0.7483008636502857
linear RMSE 6207.672284505026
poly RAE 0.3395259695024923
poly R2 Score 0.7483008636502857
poly RMSE 6207.672284505026
knn RAE 0.6912283312664629
knn R2 Score -0.04322693069316608
knn RMSE 12637.969815843448




nn RAE 0.4138510654225409
nn R2 Score 0.6260410906049609
nn RMSE 7566.584059831468
-------------------------------------------
file path:  ./group_1/medical_insurance\insurance_train_60_num.csv
linear RAE 0.32891581424454897
linear R2 Score 0.7637862147933486
linear RMSE 6013.68309768217
poly RAE 0.32891581424454897
poly R2 Score 0.7637862147933486
poly RMSE 6013.68309768217
knn RAE 0.6310653469417801
knn R2 Score 0.1304703509632168
knn RMSE 11537.988889810917
nn RAE 0.3190410899089323
nn R2 Score 0.7777565501599809
nn RMSE 5833.140052137947
-------------------------------------------
file path:  ./group_1/medical_insurance\insurance_train_80_num.csv
linear RAE 0.3284780272289139
linear R2 Score 0.7644145975183659
linear RMSE 6005.678884256446
poly RAE 0.3284780272289139
poly R2 Score 0.7644145975183659
poly RMSE 6005.678884256446
knn RAE 0.5882820291680475
knn R2 Score 0.24437400008777055
knn RMSE 10755.766497890994
nn RAE 0.3151358334674348
nn R2 Score 0.7831640401169073
nn RMSE 5761

## Servo

In [26]:
data_foler = './group_1/servo'
data_prefix = "servo_"
testing_df = pd.read_csv(os.path.join(data_foler,data_prefix+"test.csv"), sep=",")
y_test = np.array(testing_df.iloc[:, -1])
print('size:',len(y_test))
print("denominator:",np.sum(y_test**2)**0.5)

size: 26
denominator: 11.450392934182057


In [21]:
data_foler = './group_1/servo'
Knn_k = 5

task_name = 'servo_regression'
data_prefix = "servo_"
training_csv_list = [os.path.join(data_foler,data_prefix+"train_20_num.csv"),os.path.join(data_foler,data_prefix+"train_40_num.csv"),
                    os.path.join(data_foler,data_prefix+"train_60_num.csv"),os.path.join(data_foler,data_prefix+"train_80_num.csv"),
                    os.path.join(data_foler,data_prefix+"train_full_num.csv")]
testing_df = pd.read_csv(os.path.join(data_foler,data_prefix+"test_num.csv"), sep=",")

baseline_models = ['linear','poly','knn','nn']

all_RAE = []
all_R2 = []
all_RMSE = []

for training_file in training_csv_list:
    print('-------------------------------------------')
    print("file path: ",training_file)
    training_df = pd.read_csv(training_file, sep=",")
    num_cols = training_df.shape[1]
    X_train = np.array(training_df.iloc[:, 0:num_cols-1])

    y_train = np.array(training_df.iloc[:, -1])

    X_test= np.array(testing_df.iloc[:, 0:num_cols-1])
    y_test = np.array(testing_df.iloc[:, -1])

    model_RAE = []
    model_R2 = []
    model_RMSE = []
    for model_name in baseline_models:
        if model_name == "linear":
            model = linear_model.LinearRegression()
        elif model_name == "poly":
            model = linear_model.LinearRegression()
        elif model_name == "knn":
            model = KNeighborsRegressor(n_neighbors=Knn_k)
        elif model_name == "nn":
            model = MLPRegressor(random_state=1, max_iter=1000, hidden_layer_sizes = (50,50,50))
        model.fit(X_train, y_train)
        pred_y = model.predict(X_test)
        rmse = RMSE(y_test,pred_y)
        r2 = r2_score(y_test,pred_y)
        rae = np.sum((y_test-pred_y)**2)**0.5 / np.sum((y_test)**2)**0.5
        model_RAE.append(rae)
        model_R2.append(r2)
        model_RMSE.append(rmse)
        print(model_name,"RAE",rae)
        print(model_name,"R2 Score",r2)
        print(model_name,"RMSE",rmse)
    all_RAE.append(model_RAE)
    all_R2.append(model_R2)
    all_RMSE.append(model_RMSE)

all_RAE = pd.DataFrame(all_RAE,training_csv_list,baseline_models)
all_RAE.to_csv(os.path.join(data_foler,"baselines_RAE.csv"))
all_R2 = pd.DataFrame(all_R2,training_csv_list,baseline_models)
all_R2.to_csv(os.path.join(data_foler,"baselines_R2.csv"))
all_RMSE = pd.DataFrame(all_RMSE,training_csv_list,baseline_models)
all_RMSE.to_csv(os.path.join(data_foler,"baselines_RMSE.csv"))

-------------------------------------------
file path:  ./group_1/servo\servo_train_20_num.csv
linear RAE 0.603575949799394
linear R2 Score 0.3813294404888924
linear RMSE 1.355394262054033
poly RAE 0.603575949799394
poly R2 Score 0.3813294404888924
poly RMSE 1.355394262054033
knn RAE 0.8547560981638355
knn R2 Score -0.2407377977492029
knn RMSE 1.9194461132720884
nn RAE 0.7801814511491897
nn R2 Score -0.03368172539309833
nn RMSE 1.7519807782269292
-------------------------------------------
file path:  ./group_1/servo\servo_train_40_num.csv
linear RAE 0.6303691598095078
linear R2 Score 0.3251837811557752
linear RMSE 1.415561276862669
poly RAE 0.6303691598095078
poly R2 Score 0.3251837811557752
poly RMSE 1.415561276862669
knn RAE 0.8998348900182699
knn R2 Score -0.3750587492152486
knn RMSE 2.0206753551597676
nn RAE 0.7400322418678704
nn R2 Score 0.06997016651080767
nn RMSE 1.6618214405263578
-------------------------------------------
file path:  ./group_1/servo\servo_train_60_num.csv
li

## ccpp

In [24]:
data_foler = './group_1/CCPP'
data_prefix = "ccpp_"
testing_df = pd.read_csv(os.path.join(data_foler,data_prefix+"test.csv"), sep=",")
y_test = np.array(testing_df.iloc[:, -1])
print('size:',len(y_test))
print("denominator:",np.sum(y_test**2)**0.5)

size: 1436
denominator: 17239.39922523694


In [22]:
data_foler = './group_1/CCPP'
Knn_k = 5

task_name = 'ccpp_regression'
data_prefix = "ccpp_"
training_csv_list = [os.path.join(data_foler,data_prefix+"train_20.csv"),os.path.join(data_foler,data_prefix+"train_40.csv"),
                    os.path.join(data_foler,data_prefix+"train_60.csv"),os.path.join(data_foler,data_prefix+"train_80.csv"),
                    os.path.join(data_foler,data_prefix+"train_full.csv")]
testing_df = pd.read_csv(os.path.join(data_foler,data_prefix+"test.csv"), sep=",")

baseline_models = ['linear','poly','knn','nn']

all_RAE = []
all_R2 = []
all_RMSE = []

for training_file in training_csv_list:
    print('-------------------------------------------')
    print("file path: ",training_file)
    training_df = pd.read_csv(training_file, sep=",")
    num_cols = training_df.shape[1]
    X_train = np.array(training_df.iloc[:, 0:num_cols-1])

    y_train = np.array(training_df.iloc[:, -1])

    X_test= np.array(testing_df.iloc[:, 0:num_cols-1])
    y_test = np.array(testing_df.iloc[:, -1])

    model_RAE = []
    model_R2 = []
    model_RMSE = []
    for model_name in baseline_models:
        if model_name == "linear":
            model = linear_model.LinearRegression()
        elif model_name == "poly":
            model = linear_model.LinearRegression()
        elif model_name == "knn":
            model = KNeighborsRegressor(n_neighbors=Knn_k)
        elif model_name == "nn":
            model = MLPRegressor(random_state=1, max_iter=1000, hidden_layer_sizes = (50,50,50))
        model.fit(X_train, y_train)
        pred_y = model.predict(X_test)
        rmse = RMSE(y_test,pred_y)
        r2 = r2_score(y_test,pred_y)
        rae = np.sum((y_test-pred_y)**2)**0.5 / np.sum((y_test)**2)**0.5
        model_RAE.append(rae)
        model_R2.append(r2)
        model_RMSE.append(rmse)
        print(model_name,"RAE",rae)
        print(model_name,"R2 Score",r2)
        print(model_name,"RMSE",rmse)
    all_RAE.append(model_RAE)
    all_R2.append(model_R2)
    all_RMSE.append(model_RMSE)

all_RAE = pd.DataFrame(all_RAE,training_csv_list,baseline_models)
all_RAE.to_csv(os.path.join(data_foler,"baselines_RAE.csv"))
all_R2 = pd.DataFrame(all_R2,training_csv_list,baseline_models)
all_R2.to_csv(os.path.join(data_foler,"baselines_R2.csv"))
all_RMSE = pd.DataFrame(all_RMSE,training_csv_list,baseline_models)
all_RMSE.to_csv(os.path.join(data_foler,"baselines_RMSE.csv"))

-------------------------------------------
file path:  ./group_1/CCPP\ccpp_train_20.csv
linear RAE 0.010209385392908096
linear R2 Score 0.9252270701412878
linear RMSE 4.644559227220214
poly RAE 0.010209385392908096
poly R2 Score 0.9252270701412878
poly RMSE 4.644559227220214
knn RAE 0.027764764560993468
knn R2 Score 0.4469897362193166
knn RMSE 12.631033942840192
nn RAE 0.014052176321352047
nn R2 Score 0.8583448453261087
nn RMSE 6.392761433141501
-------------------------------------------
file path:  ./group_1/CCPP\ccpp_train_40.csv
linear RAE 0.010199256597223323
linear R2 Score 0.9253753619354507
linear RMSE 4.639951330696785
poly RAE 0.010199256597223323
poly R2 Score 0.9253753619354507
poly RMSE 4.639951330696785
knn RAE 0.02221820768075518
knn R2 Score 0.6458695710352578
knn RMSE 10.107736903303662
nn RAE 0.012746319256613249
nn R2 Score 0.8834493269816347
nn RMSE 5.798687427112068
-------------------------------------------
file path:  ./group_1/CCPP\ccpp_train_60.csv
linear RAE