In [1]:
import numpy as np
import pandas as pd 
import sklearn as sk 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.kernel_ridge import KernelRidge 
from sklearn.linear_model import Lasso

import seaborn as sns
# Default seaborn palette; `color` is not referenced by any cell visible in this notebook.
color = sns.color_palette()
sns.set_style('darkgrid')

import warnings
# NOTE(review): this silences every warning category for the whole session,
# including pandas copy warnings and estimator convergence warnings raised by
# the cells below. Consider filtering only the specific categories that are
# known to be noise.
warnings.filterwarnings("ignore")

In [2]:
# Load the training data. Columns 1-10 are the predictors (nine numeric
# terrain features followed by Soil_Type) and column 11 is the target.
# Column 0 is skipped -- presumably a row id; confirm against the CSV.
dataset_train = pd.read_csv('train.csv')
# .copy() gives X its own data, so the column assignments below modify a real
# frame instead of a slice of dataset_train (chained assignment /
# SettingWithCopyWarning).
X = dataset_train.iloc[:, 1:11].copy()
Y = dataset_train.iloc[:, 11]

# Soil_Type is a composite numeric code: the first digit is read as the climate
# zone, the second digit as the geologic zone, and the last two digits
# (code % 100) as the soil id. All three are treated as categorical features.
X['climate_zone'] = X['Soil_Type'].astype(str).str[0]
X['geologic'] = X['Soil_Type'].astype(str).str[1]
X['soil'] = X['Soil_Type'] % 100
X['soil'] = X['soil'].astype('category')
X['climate_zone'] = X['climate_zone'].astype('category')
X['geologic'] = X['geologic'].astype('category')
# The composite code is redundant once it has been split into its parts.
X = X.drop(columns=['Soil_Type'])

X.head(3)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,climate_zone,geologic,soil
0,3106,347,5,42,1,1400,210,231,160,7,2,2
1,3132,121,1,0,0,466,221,237,152,7,1,1
2,3255,69,13,162,16,870,233,214,110,7,7,56


In [3]:
# Standardise the nine numeric predictors (the first nine columns of X) to zero
# mean and unit variance; the three categorical columns are left untouched.
# NOTE(review): the scaler is fit on the full training set before the
# cross-validation further down, so held-out folds influence the scaling
# statistics. Fit it inside each fold if an unbiased CV estimate is needed.
scaler = StandardScaler().fit(np.array(X.iloc[:, :9]))

X_std = scaler.transform(np.array(X.iloc[:, :9]))
# Keep the original feature names and row index. Integer column labels next to
# the string-named dummy columns produce mixed-type column names, which recent
# scikit-learn versions reject, and a fresh RangeIndex would make the concat
# below misalign rows whenever X does not carry a default index.
X_std = pd.DataFrame(X_std, columns=X.columns[:9], index=X.index)
X_std_full = pd.concat([X_std, X.iloc[:, 9:]], axis=1)
# One-hot encode the categorical columns (climate_zone, geologic, soil).
X_std_full = pd.get_dummies(X_std_full)

X_std_full.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,climate_zone_4,...,soil_3,soil_31,soil_45,soil_55,soil_56,soil_57,soil_58,soil_71,soil_72,soil_76
0,-1.313926,1.322331,-1.264149,-1.238966,-0.798534,0.664024,0.066913,0.228827,0.210945,0,...,0,0,0,0,0,0,0,0,0,0
1,-1.053345,-0.639295,-1.884905,-1.434628,-0.815591,-0.951719,0.480502,0.612582,-0.014124,0,...,0,0,0,0,0,0,0,0,0,0
2,0.179402,-1.090643,-0.022638,-0.679932,-0.542678,-0.252832,0.93169,-0.858478,-1.195732,0,...,0,0,0,0,1,0,0,0,0,0


In [4]:
# Expand the design matrix with interaction terms only (products of distinct
# features, no pure powers) and without a constant bias column.
poly = PolynomialFeatures(include_bias=False, interaction_only=True)
# Wrap the transformed array in a DataFrame so the CV helper can use .iloc.
X_std_full_poly = pd.DataFrame(poly.fit_transform(X_std_full))

### Models 


In [5]:
def ridgeReg(x, y, x1, alpha=0.4):
    """Fit a kernel ridge regressor on (x, y) and predict the targets for x1.

    Parameters
    ----------
    x : array-like
        Training features.
    y : array-like
        Training targets.
    x1 : array-like
        Features to predict for.
    alpha : float, default 0.4
        Regularisation strength passed to ``KernelRidge``. The default matches
        the value that was previously hard-coded.

    Returns
    -------
    Predictions for ``x1`` as returned by ``KernelRidge.predict``.

    NOTE(review): ``KernelRidge`` is used with its default kernel and, per the
    scikit-learn documentation, does not fit an intercept, whereas ``Lasso``
    does by default -- confirm this asymmetry is intended when the two models
    are compared.
    """
    clf = KernelRidge(alpha=alpha)
    clf.fit(x, y)
    return clf.predict(x1)

In [12]:
def lassoReg(x, y, x1, alpha=0.1, max_iter=10000):
    """Fit a Lasso regressor on (x, y) and predict the targets for x1.

    Parameters
    ----------
    x : array-like
        Training features.
    y : array-like
        Training targets.
    x1 : array-like
        Features to predict for.
    alpha : float, default 0.1
        L1 regularisation strength passed to ``Lasso``.
    max_iter : int, default 10000
        Iteration cap passed to ``Lasso``.

    Both defaults match the values that were previously hard-coded.

    Returns
    -------
    Predictions for ``x1`` as returned by ``Lasso.predict``.
    """
    clf = Lasso(alpha=alpha, max_iter=max_iter)
    clf.fit(x, y)
    return clf.predict(x1)

### Score 


In [7]:
def RMse(y, y1):
    """Root-mean-squared error between two equally long sequences of values.

    Both inputs are converted to flat float arrays before they are compared, so
    the elements are always paired by position. This matters in three cases:

    * two pandas Series with different indexes would otherwise be aligned by
      label, turning every difference into NaN and silently yielding 0.0;
    * an (n, 1) column vector and an (n,) vector would otherwise broadcast to
      an n x n matrix of differences;
    * plain Python lists have no ``.shape`` and would otherwise fail.

    Parameters
    ----------
    y, y1 : array-like
        The two value sequences (e.g. predictions and observed targets). The
        measure is symmetric, so the order does not matter.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((y - y1) ** 2) / n)`` where ``n`` is the number of elements
        in ``y``.
    """
    first = np.asarray(y, dtype=float).ravel()
    second = np.asarray(y1, dtype=float).ravel()
    n = first.shape[0]
    mse = np.square(first - second).sum() / n

    return np.sqrt(mse)

### Cross validation


In [8]:
def ten_rerun(model, xtr, ytr, n_repeats=10, n_splits=10):
    """Repeated shuffled K-fold cross-validation of ``model``, scored by RMSE.

    For each repetition ``i`` in ``range(n_repeats)`` the data are split with
    ``KFold(n_splits, shuffle=True, random_state=i)``, so the partitions are
    reproducible and identical for every model passed in, which gives paired
    scores across models. Within a repetition the per-fold RMSE values are
    averaged into a single score.

    Parameters
    ----------
    model : callable
        ``model(x_train, y_train, x_test) -> predictions``.
    xtr : pandas.DataFrame
        Feature matrix; rows are selected with ``.iloc``.
    ytr : pandas.Series
        Targets; rows are selected with ``.iloc``.
    n_repeats : int, default 10
        Number of repetitions (also the seeds ``0 .. n_repeats - 1``).
    n_splits : int, default 10
        Number of folds per repetition.

    Returns
    -------
    numpy.ndarray
        One mean RMSE per repetition, shape ``(n_repeats,)``.
    """
    rmse = []
    for seed in range(n_repeats):
        kf = KFold(n_splits=n_splits, random_state=seed, shuffle=True)
        fold_rmse = []
        for train_index, test_index in kf.split(xtr):
            pred = model(xtr.iloc[train_index], ytr.iloc[train_index], xtr.iloc[test_index])
            fold_rmse.append(RMse(pred, ytr.iloc[test_index]))
        rmse.append(np.mean(fold_rmse))

    return np.array(rmse)
        


In [9]:
# Repeated cross-validation of the ridge model: one mean RMSE per repetition.
ridge_rmse = ten_rerun(model=ridgeReg, xtr=X_std_full_poly, ytr=Y)
print(ridge_rmse)

[744.54518893 743.94496979 743.20235237 744.68447601 743.40931103
 744.91749951 744.36844148 745.35759514 744.56043964 743.8391769 ]


In [13]:
# Repeated cross-validation of the lasso model: one mean RMSE per repetition.
lasso_rmse = ten_rerun(model=lassoReg, xtr=X_std_full_poly, ytr=Y)
print(lasso_rmse)

[745.61784133 744.97159653 744.20888253 745.79008959 744.44027315
 746.11235198 745.36225059 746.12352831 745.52459137 744.68873536]


We choose the Wilcoxon signed-rank test, which does not make strong assumptions about the distribution of the data and is more powerful than the sign test. <br>

$H_0: \text{both models have the same performance}$ vs. $H_a: \lnot H_0$

In [14]:
# Paired per-repetition differences (lasso minus ridge); a positive value means
# the lasso RMSE was higher in that repetition.
print(np.subtract(lasso_rmse, ridge_rmse))

[1.07265239 1.02662674 1.00653017 1.10561358 1.03096213 1.19485247
 0.99380911 0.76593317 0.96415174 0.84955845]


| Ridge        | Lasso        | Difference | sign | Rank |
|--------------|--------------|------------|------|------|
| 744.54518893 | 745.61784133 | 1.07265239 | +    | 8    |
| 743.94496979 | 744.97159653 | 1.02662674 | +    | 6    |
| 743.20235237 | 744.20888253 | 1.00653017 | +    | 5    |
| 744.68447601 | 745.79008959 | 1.10561358 | +    | 9    |
| 743.40931103 | 744.44027315 | 1.03096213 | +    | 7    |
| 744.91749951 | 746.11235198 | 1.19485247 | +    | 10   |
| 744.36844148 | 745.36225059 | 0.99380911 | +    | 4    |
| 745.35759514 | 746.12352831 | 0.76593317 | +    | 1    |
| 744.56043964 | 745.52459137 | 0.96415174 | +    | 3    |
| 743.8391769  | 744.68873536 | 0.84955845 | +    | 2    |

$W_{ridge}=55$ and $W_{Lasso} =0$ <br>
$T_{wilcox} =min(W_{ridge},W_{Lasso})$ <br>
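For $n = 10$ non-zero paired differences (the signed-rank test is indexed by the sample size $n$, not by $n-1$ degrees of freedom) and $\alpha = 0.05$, the critical value is $V = 8$ for the 2-sided test. $V$ must be at least as large as
$T_{wilcox}$ in order to reject the hypothesis. Since $T_{wilcox} = 0 \le 8$, and the exact 2-sided p-value is $2/2^{10} \approx 0.002 < 0.005$, we have sufficient evidence to reject the hypothesis that Lasso's performance is equal to that of ridge at the 0.005 level.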