In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import statsmodels.api as st
import warnings
warnings.simplefilter("ignore")

In [2]:
df = pd.read_csv('wide.csv')
df = df.loc[df['max_bid'] >= 0]
df

Unnamed: 0,session,lot_order,lot_order_norm,lot_num,lot_id,estim,res_i,result,win_bid,win_firm,...,ln_dres_mean_bid,ln_pres_min_bid,ln_pres_max_bid,ln_pres_mean_bid,ln_random_bid_1,ln_random_bid_2,ln_random_bid_3,ln_random_bid_4,ln_random_bid_5,ln_random_bid_6
0,1,1,0.008197,35069,03200400A7,25323,24000,O,23069.0,11.0,...,8.312544,0.309199,0.038058,0.156778,10.007893,10.046288,9.943093,9.779454,9.635935,9.928814
1,1,2,0.016393,35070,03200200A7,9203,9000,O,8400.0,131.0,...,7.392032,0.261338,0.064539,0.165703,9.036106,8.977273,8.755738,8.750366,,
2,1,3,0.024590,35071,03100200A7,33921,28000,O,37800.0,75.0,...,,0.289466,-0.430783,-0.051381,10.367473,9.830971,,,,
5,1,6,0.049180,35074,03100100A7,12710,4500,O,15706.0,106.0,...,,,,,9.457825,9.236398,9.424887,9.479069,9.661862,9.629116
6,1,7,0.057377,35075,03200600A7,24790,24807,O,24807.0,5.0,...,8.454608,0.388075,0.000000,0.173355,10.045464,9.904537,9.605957,9.476237,10.064798,9.885884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2251,10,220,0.648968,30682,03016300A3,20004,19000,O,20200.0,26.0,...,6.484126,0.092915,-0.065241,0.033825,9.776222,9.749812,9.852194,9.913487,9.791774,9.811153
2254,10,223,0.657817,30685,02430136A3,4649,3500,N,,,...,6.543912,0.228842,0.130527,0.180892,7.863651,8.010692,,,,
2255,10,224,0.660767,30686,02029400A3,7653,8855,N,,,...,7.444638,0.425035,0.050858,0.176533,8.909370,8.958154,8.334712,8.955061,9.035272,8.909100
2256,10,225,0.663717,30687,02029500A3,9536,8695,O,10050.0,100.0,...,,0.062632,-0.169409,-0.062442,9.126306,9.171392,9.130972,9.116030,9.003808,9.215427


In [3]:
y = df.max_bid
df_1 = df[['surf', 'ln_n_trees', 'v_crown', 'v_stem', 'v_stump',
         'v_stem_other_foliar', 'v_stem_other_conifer', 'v_stem_oak',
         'v_stem_spruce', 'v_stem_beech', 'v_stem_pine', 'v_stem_fir',
         'v_crown_foliar', 'v_crown_conifer', 'lot_order', 'herf1']]
cat_cols = ['stand', 'cut', 'grape', 'land_area', 'quality', 'conditions']
X = df_1.join(pd.get_dummies(df[cat_cols], drop_first=True))
X['const'] = 1
X.shape

(1768, 37)

In [4]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X, y)
X_res = pd.DataFrame(data={'feature': X.columns, 'Coeff-s': lin_reg.coef_.round(2)})

In [5]:
from sklearn.linear_model import Lasso

las = Lasso()
las.fit(X, y)
X_res['Coeff-s Lasso'] = pd.DataFrame(las.coef_.round(2))

In [6]:
from sklearn.linear_model import ElasticNet

eNet = ElasticNet()
eNet.fit(X, y)
X_res['Coeff-s eNet'] = pd.DataFrame(eNet.coef_.round(2))

In [7]:
X_res

Unnamed: 0,feature,Coeff-s,Coeff-s Lasso,Coeff-s eNet
0,surf,-124.45,-124.07,-177.79
1,ln_n_trees,-2440.68,-2441.47,-966.88
2,v_crown,6.62,53.97,50.34
3,v_stem,-51.0,35.08,30.51
4,v_stump,-85.98,-86.34,-95.32
5,v_stem_other_foliar,70.97,-15.11,-16.71
6,v_stem_other_conifer,74.89,-11.22,-12.27
7,v_stem_oak,113.59,27.53,29.95
8,v_stem_spruce,86.18,0.07,1.42
9,v_stem_beech,87.29,1.21,3.07


### In-sample check

In [8]:
from sklearn.metrics import r2_score

LR = lin_reg.predict(X)
LS = las.predict(X)
EN = eNet.predict(X)

print("Linear Regression r^2 on train data : %f" % r2_score(y, LR))
print("Lasso Regression r^2 on train data : %f" % r2_score(y, LS))
print("Elasic Net Regression r^2 on train data : %f" % r2_score(y, EN))

Linear Regression r^2 on train data : 0.759172
Lasso Regression r^2 on train data : 0.759165
Elasic Net Regression r^2 on train data : 0.721414


### Out-of-sample check

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

lin_reg.fit(X_train, y_train)
las.fit(X_train, y_train)
eNet.fit(X_train, y_train)

LR = lin_reg.predict(X_test)
LS = las.predict(X_test)
EN = eNet.predict(X_test)

print("Linear Regression r^2 on test data : %f" % r2_score(y_test, LR))
print("Lasso Regression r^2 on test data : %f" % r2_score(y_test, LS))
print("Elasic Net Regression r^2 on test data : %f" % r2_score(y_test, EN))

Linear Regression r^2 on test data : 0.690100
Lasso Regression r^2 on test data : 0.690440
Elasic Net Regression r^2 on test data : 0.660042


### Elasic Net hyperparametrs tuning

In [10]:
#in-sample hyperparametrs
from sklearn.model_selection import GridSearchCV


params = {'alpha': np.linspace(0, 2, 15),
          'l1_ratio': np.linspace (0, 1, 10)}
cv1 = GridSearchCV(eNet,
                  params,
                  scoring='r2',
                  cv=5
                 )
cv1.fit(X, y)

print(cv1.best_params_)

{'alpha': 0.2857142857142857, 'l1_ratio': 0.8888888888888888}


Here different launchings showed different best params, but in nearly half of tries gridsearch selected alpha=2 and l1_ratio=1

In [16]:
eNet = ElasticNet(alpha=2, l1_ratio=1)
eNet.fit(X, y)
EN1 = eNet.predict(X)
print("Elasic Net Regression r^2 on train data : %f" % r2_score(y, EN1))

Elasic Net Regression r^2 on train data : 0.759146


In [12]:
#out-of-sample hyperparametrs
from sklearn.model_selection import GridSearchCV


cv2 = GridSearchCV(eNet,
                  params,
                  scoring='r2',
                  cv=5
                 )
cv2.fit(X_train, y_train)

print(cv2.best_params_)

{'alpha': 2.0, 'l1_ratio': 1.0}


In [14]:
eNet = ElasticNet(alpha=2, l1_ratio=1)
eNet.fit(X_train, y_train)
EN2 = eNet.predict(X_test)
print("Elasic Net Regression r^2 on test data : %f" % r2_score(y_test, EN2))

Elasic Net Regression r^2 on test data : 0.690748


### Interpretation of the results

As we can see in the coefficient table, results are logical: Lasso has smaller cofficients than linear regression without regularization and Elasic net has smaller coefficients than even Lasso, as it penalizes the model for large weights much stronger. What's more, we can obsevre that Lasso indeed zeroes some coefficients. <br /> 
<br />
The usual linear regression shows the best in-sample results, which is logical, because nothing prevents the coefficients from overfitting, adjusting to the data in any way and getting best estimates. The lasso shows a slightly worse result, because now the model is penalized for large coefficients. The elasic net showed the worst results within the sample, because there were even more restrictions. <br />
<br />
When checking models on data not from the sample, the lasso becomes the best model, and the elastic network still shows not the best result, which is probably due to using a model out of the box with default hyperparameters. Or may be there is not enough data for this model to fit in a correct way. <br /> 
<br />
With the help of tuning hyperparameters, model predictions improve both in-sample and out-of-sample results, which is logical, because we compare different models from this family and pick the best one. Moreover, Elasic Net becomes the best model to make predictions. What's interesting, it falls to one edge: in fact, we use Lasso, as optimal l1_ratio is equal to 1. The only difference from Lasso is that regularization coefficient (alpha=2) is larger than in default Lasso from the box (alpha=1).

In [21]:
X_res['Coeff-s eNet New'] = pd.DataFrame(eNet.coef_.round(2))
X_res.loc[abs(X_res['Coeff-s']) > 2000, ['feature']]

Unnamed: 0,feature
1,ln_n_trees
22,cut_R
24,grape_M2
25,grape_M3
26,land_area_N
29,quality_MA
31,quality_TB
34,conditions_ADTD


Here are the list of the most important feature, that influence max_bid