In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file under it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [0]:
import numpy as np
import sys

# Hyperparameters, parsed from strings the same way the command-line variant
# (kept for reference at the end of this block) parses sys.argv.
lambda_input = int('0')
sigma2_input = float('3')

# Training covariates / targets and the candidate pool, read from the Kaggle
# dataset directory. The two X files are comma-separated.
X_train = np.genfromtxt(
    "../input/hw1-regression-test/X_train.csv",
    delimiter=",",
)
y_train = np.genfromtxt("../input/hw1-regression-test/y_train.csv")
X_test = np.genfromtxt(
    "../input/hw1-regression-test/X_test.csv",
    delimiter=",",
)

# Command-line variant of the assignments above (disabled in the notebook):
#   lambda_input = int(sys.argv[1])
#   sigma2_input = float(sys.argv[2])
#   X_train = np.genfromtxt(sys.argv[3], delimiter = ",")
#   y_train = np.genfromtxt(sys.argv[4])
#   X_test = np.genfromtxt(sys.argv[5], delimiter = ",")

## Solution for Part 1: Ridge regression
def part1(X_train, y_train, lambda_input):
    '''Return the Ridge Regression coefficients for the given observations.

    The coefficients solve the regularised normal equations
        (lambda * I + X^T X) wRR = X^T y
    The system is solved directly with np.linalg.solve instead of forming the
    explicit inverse and multiplying, which is cheaper and numerically more
    accurate for the same result.

    ## Input : Arguments to the function
        X_train = Covariates, 2-D array-like. Each row is a single vector xi.
            The assignment states the last dimension has already been set
            equal to 1 for all data; this function does not depend on that.
        y_train = Measured outputs. The i-th entry combined with the i-th row
            of X_train constitutes the training pair (xi, yi).
        lambda_input = hyperparameter lambda value of the Ridge Regression
    ## Return : wRR, numpy array of Ridge Regression coefficients (one per
        column of X_train) to write in the csv file
    ## Raises : numpy.linalg.LinAlgError when lambda * I + X^T X is singular
        (possible when lambda_input is 0 and X_train is rank deficient)'''
    X = np.asarray(X_train, dtype=float)
    y = np.asarray(y_train, dtype=float)

    n_features = np.shape(X)[1]
    system_matrix = lambda_input * np.eye(n_features) + np.dot(X.T, X)
    rhs = np.dot(X.T, y)

    wRR = np.linalg.solve(system_matrix, rhs)
    return wRR

# Fit the ridge model on the loaded training data.
wRR = part1(X_train, y_train, lambda_input)

# Persist the coefficients to a CSV whose name carries the lambda value.
np.savetxt(
    "wRR_" + str(lambda_input) + ".csv",
    wRR,
    delimiter="\n",
)

## Solution for Part 2: Active learning
def part2(X_train, X_test, lambda_input, sigma2_input, n_select=10):
    '''Greedy active-learning selection of rows from X_test.

    At every step the posterior covariance
        Sigma = (lambda * I + (1 / sigma2) * X^T X)^-1
    is computed from the current training covariates, the predictive
    variance sigma2 + x^T Sigma x is evaluated for every remaining row x of
    X_test, and the row with the largest variance is selected. The selected
    row is then treated as part of the training covariates for the next step
    and removed from the candidate pool. Ties go to the earliest remaining row.

    ## Input : Arguments to the function
        X_train = Covariates, 2-D array-like. Each row is a single vector xi.
            The assignment states the last dimension has already been set
            equal to 1 for all data; this function does not depend on that.
        X_test = same structure as X_train (same number of columns)
        lambda_input = hyperparameter lambda value of the Ridge Regression
        sigma2_input = noise variance sigma^2
        n_select = maximum number of rows to select (default 10). Fewer are
            returned when X_test has fewer rows than n_select.
    ## Return : active, list of 1-based row numbers of X_test (Python ints),
        in the order they were selected
    ## Raises : numpy.linalg.LinAlgError when lambda * I + X^T X / sigma2 is
        singular'''
    X_train = np.asarray(X_train, dtype=float)
    X_test = np.asarray(X_test, dtype=float)
    n_features = np.shape(X_train)[1]

    # X^T X of the training covariates. Adding a row x to the training set
    # changes it by exactly outer(x, x), so it is updated in place of
    # re-stacking X_train and recomputing the full product each round.
    gram = np.dot(X_train.T, X_train)
    prior_precision = lambda_input * np.eye(n_features)

    # 0-based positions of the X_test rows that are still selectable. Kept
    # separate from the feature matrix so IDs stay integers and X_test itself
    # is never modified.
    remaining = np.arange(np.shape(X_test)[0])

    active = []

    # Never ask for more rows than the pool holds; an empty pool would make
    # np.argmax raise.
    for _ in range(min(n_select, remaining.size)):
        posterior_cov = np.linalg.inv(prior_precision + (1 / sigma2_input) * gram)

        candidates = X_test[remaining]
        # Row-wise quadratic form x^T Sigma x for all candidates at once.
        predictive_var = sigma2_input + np.sum(
            np.dot(candidates, posterior_cov) * candidates, axis=1
        )

        pick = int(np.argmax(predictive_var))
        chosen_row = candidates[pick]

        active.append(int(remaining[pick]) + 1)  # report 1-based row numbers
        gram += np.outer(chosen_row, chosen_row)
        remaining = np.delete(remaining, pick)
    return active

# Run the active-learning selection on the loaded train / test covariates.
active = part2(X_train, X_test, lambda_input, sigma2_input)

# Persist the selected row IDs; the file name carries lambda and the integer
# part of sigma2.
np.savetxt(
    "active_" + str(lambda_input) + "_" + str(int(sigma2_input)) + ".csv",
    active,
    delimiter=",",
)

In [0]:
# Ridge coefficients, headed by the name of the CSV they were written under.
print(
    'wRR_' + str(lambda_input) + ' =\n',
    wRR,
)

# Selected X_test row IDs, headed the same way.
print(
    "active_" + str(lambda_input) + "_" + str(int(sigma2_input)) + ' =\n',
    active,
)