In [5]:
import csv
import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import BayesianRidge
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
import random as r
r.seed(1)

def load_data():
    """Read the train/test CSV files from the working directory.

    Reads 'train_t0.csv', 'train_t1.csv' and 'test_t0.csv' (in that order).
    For every file the first column is discarded (presumably an ID/index
    column -- confirm against the data files) and the remaining columns are
    converted to a float32 matrix.

    Returns
    ----------
    X_train, y_train, X_test: float32 arrays built from the three files
    """

    def _as_matrix(csv_name):
        # Whole table -> ndarray, drop column 0, then cast to float32.
        table = pd.read_csv(csv_name)
        return np.asarray(table)[:, 1:].astype(np.float32)

    X_train, y_train, X_test = (
        _as_matrix(csv_name)
        for csv_name in ('train_t0.csv', 'train_t1.csv', 'test_t0.csv')
    )
    return X_train, y_train, X_test

def write_output(predictions, path='submission.csv'):
    """Write predictions to a two-column CSV file.

    The file starts with the header row ``ID,Predicted``; every following row
    holds the position of a prediction and the prediction itself.

    Parameters
    ----------
    predictions: iterable of predicted values, written in iteration order
    path: destination file (default 'submission.csv'); an existing file is overwritten
    """
    # The csv module requires files to be opened with newline=''. Without it,
    # platforms that translate '\n' turn the writer's '\r\n' row terminator
    # into '\r\r\n', which shows up as an empty row after every record.
    with open(path, mode='w', newline='') as output_file:
        output_writer = csv.writer(output_file, delimiter=',')
        output_writer.writerow(["ID", "Predicted"])
        # enumerate yields (row id, prediction) pairs, one CSV row each.
        output_writer.writerows(enumerate(predictions))

class CustomMultiOutputBayesianRidge(BaseEstimator, RegressorMixin):
    """Multi-output regressor made of independent single-feature BayesianRidge models.

    Output column ``i`` is modelled from input column ``i`` only, so ``X`` needs
    at least as many columns as ``y``. All constructor arguments are stored
    unchanged and forwarded to every BayesianRidge created during ``fit``.

    NOTE(review): ``n_iter`` and ``normalize`` are passed straight through to
    BayesianRidge; newer scikit-learn releases are understood to have
    renamed/removed these arguments -- confirm against the installed version.
    """

    def __init__(
        self,
        alpha_1=1e-06,
        alpha_2=1e-06,
        alpha_init=None,
        compute_score=False,
        copy_X=True,
        fit_intercept=True,
        lambda_1=1e-06,
        lambda_2=1e-06,
        lambda_init=None,
        n_iter=300,
        normalize=False,
        tol=0.001,
        verbose=False,
    ):
        self.alpha_1 = alpha_1
        self.alpha_2 = alpha_2
        self.alpha_init = alpha_init
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.lambda_init = lambda_init
        self.n_iter = n_iter
        self.tol = tol
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.copy_X = copy_X
        self.compute_score = compute_score
        self.verbose = verbose

    def _new_ridge(self):
        """Return an unfitted BayesianRidge configured with this object's settings."""
        return BayesianRidge(
            alpha_1=self.alpha_1,
            alpha_2=self.alpha_2,
            alpha_init=self.alpha_init,
            compute_score=self.compute_score,
            copy_X=self.copy_X,
            fit_intercept=self.fit_intercept,
            lambda_1=self.lambda_1,
            lambda_2=self.lambda_2,
            lambda_init=self.lambda_init,
            n_iter=self.n_iter,
            normalize=self.normalize,
            tol=self.tol,
            verbose=self.verbose,
        )

    def fit(self, X, y):
        """
        Train one Bayesian Ridge regressor per output column.

        The regressor for column i sees only column i of X as its single
        input feature and column i of y as its target.

        Parameters
        ----------
        X: train dataset with k features
        y: targets for X, one column per feature (k columns)

        Returns
        ----------
        self, with the fitted regressors stored in ``self.model``
        """
        self.model = []
        for col in range(y.shape[1]):
            ridge = self._new_ridge()
            # X[:, [col]] keeps the column two-dimensional: shape (n_samples, 1).
            ridge.fit(X[:, [col]], y[:, col])
            self.model.append(ridge)
        return self

    def predict(self, X):
        """
        Predict every output column with the regressor fitted for it.

        Parameters
        ----------
        X: test dataset with k features

        Returns
        ----------
        array of shape (number of rows in X, number of fitted regressors)
        """
        predictions = np.zeros((X.shape[0], len(self.model)))
        for col, ridge in enumerate(self.model):
            predictions[:, col] = ridge.predict(X[:, [col]])
        return predictions
    

In [6]:
def test(model,X, y):
    """
        Scores a model by the mean squared error of its predictions on X.
        Predictions and ground truth are both flattened first, so the error
        is averaged over every individual value.

    Parameters
    ----------
    model: object exposing predict(X)
    X: Test dataset with k features
    y: Ground truth of X with k features

    Returns
    ----------
    mse between the flattened prediction and the flattened ground truth
    """

    flat_prediction = model.predict(X).flatten()
    flat_truth = y.flatten()
    # The prediction is passed first, as before; the squared error does not
    # depend on the argument order.
    return mse(flat_prediction, flat_truth)

In [7]:
def remove_outliers(X_train,y_train):
    """
        Removes every sample that LocalOutlierFactor (n_neighbors=2) labels
        as an outlier in the inputs, in the targets, or in both.

    Parameters
    ----------
    X_train: train inputs, one row per sample
    y_train: train targets, rows aligned with X_train

    Returns
    ----------
    (X_train, y_train) with the flagged rows deleted from both arrays
    """
    detector = LocalOutlierFactor(n_neighbors=2)
    # The same detector is fitted twice: once on the inputs, once on the targets.
    input_labels = detector.fit_predict(X_train)
    target_labels = detector.fit_predict(y_train)
    # A row is kept only when the two labels add up to 2, i.e. both are 1.
    flagged_rows = np.flatnonzero((input_labels + target_labels) != 2)
    return tuple(
        np.delete(samples, flagged_rows, axis=0) for samples in (X_train, y_train)
    )

In [8]:
# *************** MAIN PROGRAM NOTEBOOK *************** #

# Load the data and drop the samples flagged as outliers.
X_train,y_train,X_test = load_data()
X_train,y_train = remove_outliers(X_train,y_train)

reg = CustomMultiOutputBayesianRidge(alpha_2=7, lambda_1=0.19,lambda_2=1.75,alpha_init=0.5,lambda_init=0.1,n_iter=10,normalize=True)

# Parameter Estimation
# The grid is currently empty, so the search evaluates `reg` exactly as
# configured above. NOTE: the commented candidates use a `clf__` prefix, which
# addresses a Pipeline step named `clf`; `reg` is passed to GridSearchCV
# directly, so the prefix has to be dropped before enabling any of them.
param_grid = {
#        'clf__alpha_1':np.arange(1,6,0.5),
#        'clf__gamma': [0.0001,0.001,0.01,0.1,1],
#        'clf__tol':[0.0001,0.001,0.01,0.1,1],
#        'clf__alpha_1':[0.000001, 0.00001,0.0001,0.001,0.01,0.1,1],
#        'clf__alpha_2':[0.000001, 0.00001,0.0001,0.001,0.01,0.1,1],
#        'clf__lambda_1':[0.000001, 0.00001,0.0001,0.001,0.01,0.1,1],
#        'clf__lambda_2':[0.000001, 0.00001,0.0001,0.001,0.01,0.1,1],
}


def neg_mse_scorer(model, X, y):
    """Return the negated MSE of `model` on (X, y).

    GridSearchCV keeps the candidate with the HIGHEST score. `test` returns an
    error (lower is better), so it is negated here; otherwise a non-empty grid
    would select the worst candidate.
    """
    return -test(model, X, y)


search = GridSearchCV(reg, param_grid,cv=5,scoring=neg_mse_scorer,return_train_score=True)
search.fit(X_train, y_train)

# best_score_ is the mean negated MSE over the folds; flip the sign back so the
# printed value is the MSE itself.
print("5-Fold Cross Validation MSE ",-search.best_score_)

# The search refits the best candidate on the full training set (default
# refit=True), so it can predict the test set directly.
write_output(search.predict(X_test).flatten())

# *************** MAIN PROGRAM NOTEBOOK *************** #

5-Fold Cross Validation MSE  0.0018956366281850357


In [9]:
# *************** MAIN PROGRAM SCRIPT *************** #
# def main():
#     args = sys.argv[1:]
#     t0 = args[0]
#     t1 = args[1]
#     args = sys.argv[1:]
#     X = load_data_submit(t0)
#     y = load_data_submit(t1)

#     X,y = remove_outliers(X,y)

#     reg = CustomMultiOutputBayesianRidge(alpha_2=7, lambda_1=0.19,lambda_2=1.75,alpha_init=0.5,lambda_init=0.1,n_iter=10,normalize=True)

#Parameter Estimation
##    param_grid = {
#        'clf__alpha_1':np.arange(1,6,0.5),
#        'clf__gamma': [0.0001,0.001,0.01,0.1,1],
#        'clf__tol':[0.0001,0.001,0.01,0.1,1],
#        'clf__alpha_1':[0.000001, 0.00001,0.0001,0.001,0.01,0.1,1],
#        'clf__alpha_2':[0.000001, 0.00001,0.0001,0.001,0.01,0.1,1],
#        'clf__lambda_1':[0.000001, 0.00001,0.0001,0.001,0.01,0.1,1],
#        'clf__lambda_2':[0.000001, 0.00001,0.0001,0.001,0.01,0.1,1],
##    }
    
#     search = GridSearchCV(reg, param_grid,cv=5,scoring=test)
#     search.fit(X, y)
#     write_output("predictions.csv",search.predict(X).flatten())
#     print("MSE Score ->",test(search,X,y))
#     return test(search,X,y)

# if __name__ == "__main__":
#     main()

# *************** MAIN PROGRAM SCRIPT *************** #