Loading basic libraries

In [None]:
import pandas as pd
import numpy as np


Importing data set

In [None]:
# Read the train and test splits; both CSV files are expected in the working directory.
X_train = pd.read_csv('data_train.csv')
X_test = pd.read_csv('data_test.csv')


Basic data review

In [None]:
# Preview the first five rows of the training frame.
X_train.head(n=5)

In [None]:
# DataFrame.shape is (rows, columns): rows are observations, columns are variables.
print("Number of variables: ", X_train.shape[1], "\nNumber of observations: ", X_train.shape[0])
# Column dtypes and non-null counts.
X_train.info()


We can see that all data is numerical.

Are there any missing data?

In [None]:
# Count missing values per column, smallest counts first.
X_train.isna().sum().sort_values()

No data are missing. Now we separate the target column `Y` from the features.

In [None]:
# Pull the target column out of the feature frame; pop() removes 'Y' from X_train in place
# and returns it as a Series.
y = X_train.pop('Y')

From the previous cells we can see that the values are not standardized.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Keep the fitted scaler in a variable so the training mean/std can later be applied
# to other data (e.g. X_test) with scaler.transform(...), instead of being discarded.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
# fit_transform returns a plain array; rebuild a DataFrame with the original index and column names.
X_train = pd.DataFrame(X_train_std, index=X_train.index, columns=X_train.columns)
X_train.head()


We are ready to see how the ElasticNet model performs on our data set.

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from heapq import nsmallest
import matplotlib.pylab as plt
import seaborn as sns

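NOTE: from previous work on this project I noticed that the ElasticNet model does not fit well at the pure Ridge (`l1_ratio = 0`) and pure Lasso (`l1_ratio = 1`) extremes. I will therefore use the dedicated Ridge and Lasso models for those two cases and ElasticNet only for the "middle" values of `l1_ratio`.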

In [None]:
#ElasticNet

def get_score_preliminary_en(alpha, l1_ratio):
    """Return the mean 5-fold cross-validated MSE of an ElasticNet model.

    The model is evaluated on the notebook-level ``X_train`` / ``y``.

    Parameters
    ----------
    alpha : penalty strength passed to ``ElasticNet``.
    l1_ratio : L1/L2 mixing parameter passed to ``ElasticNet``.

    Returns
    -------
    Mean over the folds of the sign-flipped ``neg_mean_squared_error`` scores.
    """
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=1)
    neg_mse_per_fold = cross_val_score(model, X_train, y, cv=5, scoring='neg_mean_squared_error')
    # The scorer reports negated MSE, so flip the sign before averaging.
    return (-neg_mse_per_fold).mean()


In [None]:
#Ridge

def get_score_preliminary_r(alpha):
    """Return the mean 5-fold cross-validated MSE of a Ridge model.

    The model is evaluated on the notebook-level ``X_train`` / ``y``.

    Parameters
    ----------
    alpha : penalty strength passed to ``Ridge``.

    Returns
    -------
    Mean over the folds of the sign-flipped ``neg_mean_squared_error`` scores.
    """
    model = Ridge(alpha=alpha, random_state=1)
    neg_mse_per_fold = cross_val_score(model, X_train, y, cv=5, scoring='neg_mean_squared_error')
    # The scorer reports negated MSE, so flip the sign before averaging.
    return (-neg_mse_per_fold).mean()

In [None]:
#Lasso

def get_score_preliminary_l(alpha):
    """Return the mean 5-fold cross-validated MSE of a Lasso model.

    The model is evaluated on the notebook-level ``X_train`` / ``y``.

    Parameters
    ----------
    alpha : penalty strength passed to ``Lasso``.

    Returns
    -------
    Mean over the folds of the sign-flipped ``neg_mean_squared_error`` scores.
    """
    model = Lasso(alpha=alpha, random_state=1)
    neg_mse_per_fold = cross_val_score(model, X_train, y, cv=5, scoring='neg_mean_squared_error')
    # The scorer reports negated MSE, so flip the sign before averaging.
    return (-neg_mse_per_fold).mean()

In [None]:
# displaying results in intuitive way

def display_hm(alphas, l1_ratio, results):
    """Show a grid of scores as a labelled heatmap.

    Parameters
    ----------
    alphas : tick labels for the columns of ``results`` (x axis).
    l1_ratio : tick labels for the rows of ``results`` (y axis).
    results : 2-D grid of scores; in this notebook it holds the mean
        cross-validated MSE for every (l1_ratio, alpha) pair.
    """
    ax = sns.heatmap(results, linewidth=0.5, xticklabels=alphas, yticklabels=l1_ratio)
    # Label the axes so the figure can be read without looking at the calling code.
    ax.set_xlabel("alpha")
    ax.set_ylabel("l1_ratio")
    ax.set_title("Mean cross-validated MSE")
    plt.show()

In [None]:
def train_save_results_arr(a):
    """Grid-search alpha x l1_ratio and collect the mean cross-validated MSE.

    Row 0 (l1_ratio = 0) is scored with Ridge, the last row (l1_ratio = 1)
    with Lasso, and only the rows in between with ElasticNet. The finished
    grid is displayed as a heatmap.

    Parameters
    ----------
    a : number of alpha values to try; the values are 10**(i-2) for
        i = 0 .. a-1 (0.01, 0.1, 1, 10, ...).

    Returns
    -------
    results_preliminary : array of shape (11, a); rows follow l1_ratio
        0.0, 0.1, ..., 1.0 and columns follow the alpha values.
    results_preliminary_dict : the same scores keyed by (row, column).
    """
    # "a" parameter is for choosing how many "alphas" we want
    alphas_preliminary = [pow(10, i-2) for i in range(0, a)]
    l1_ratio_preliminary = [i/10 for i in range(0, 11)]
    last_row = len(l1_ratio_preliminary) - 1  # row holding l1_ratio = 1 (Lasso)

    # setting array and dictionary to store our results
    results_preliminary = np.zeros((len(l1_ratio_preliminary), len(alphas_preliminary)))
    results_preliminary_dict = {}

    # training and saving results
    for i, alpha in enumerate(alphas_preliminary):
        results_preliminary[0, i] = get_score_preliminary_r(alpha)  # Ridge for l1_ratio = 0
        results_preliminary_dict[0, i] = results_preliminary[0, i]

    for i, alpha in enumerate(alphas_preliminary):
        results_preliminary[last_row, i] = get_score_preliminary_l(alpha)  # Lasso for l1_ratio = 1
        results_preliminary_dict[last_row, i] = results_preliminary[last_row, i]

    # ElasticNet only for the interior rows. Stopping before last_row keeps the
    # Lasso scores from being overwritten by ElasticNet(l1_ratio=1.0).
    for j in range(1, last_row):
        for i, alpha in enumerate(alphas_preliminary):
            results_preliminary[j, i] = get_score_preliminary_en(alpha, l1_ratio_preliminary[j])
            results_preliminary_dict[j, i] = results_preliminary[j, i]

    # displaying results
    display_hm(alphas_preliminary, l1_ratio_preliminary, results_preliminary)

    return results_preliminary, results_preliminary_dict


In [None]:
# Run the preliminary grid search with 8 alpha values (10**-2 ... 10**5).
results_preliminary, results_preliminary_dict = train_save_results_arr(a=8)

In [None]:
from heapq import nsmallest

In [None]:
# Keys of the ten grid cells with the lowest score, best first.
smallets_10 = nsmallest(10, results_preliminary_dict, key=results_preliminary_dict.get)
for grid_idx in smallets_10:
    print(grid_idx, results_preliminary_dict[grid_idx])

Saving best parameters

In [None]:
# Grid position (l1_ratio row, alpha column) with the lowest score.
# Named best_idx rather than `id` so the Python builtin id() is not shadowed.
best_idx = min(results_preliminary_dict, key=results_preliminary_dict.get)
mse_preliminary = results_preliminary_dict[best_idx]
# Column i of the grid was scored with alpha = 10**(i-2); this formula must stay
# in sync with the alpha grid built in train_save_results_arr.
punishment_preliminary = pow(10, best_idx[1]-2)

We can see that a good candidate for the best model is Ridge Regression. My idea is to find the best "punishment" (alpha) value. I am doing it by 
1. taking the best punishment from the preliminary training
2. evaluating a smaller and a bigger "logarithmic" neighbour, I mean:
    1. we know that in the preliminary training the logarithmic (base 10) interval between alphas was "1".
    2. to find a better punishment value we create new alpha values in the middle (logarithmically) of the intervals to the left and to the right of the current punishment value, i.e. the interval is halved at every step
3. looping this process until the MSE improvement is smaller than 0.5% (`lvl = 0.005` in the code below)

In [None]:
import math

In [None]:
def determine_punishment_value(p_min, mse_min, log_interval, lvl=0.005, min_log_interval=1e-6):
    """Refine the Ridge penalty by repeatedly halving a log10 search interval.

    Each round halves ``log_interval`` and scores the two alphas lying that
    far (in log10 units) below and above the current best ``p_min`` with
    ``get_score_preliminary_r``. If neither beats ``p_min`` the interval is
    halved again. If one does, it becomes the new best; the search stops as
    soon as the relative MSE improvement of such a step is below ``lvl``.

    Parameters
    ----------
    p_min : current best alpha; must be > 0 (its log10 is taken).
    mse_min : score already obtained for ``p_min``.
    log_interval : log10 distance between the grid points that produced
        ``p_min``; it is halved before the first evaluation.
    lvl : relative improvement below which the search stops (0.005 = 0.5%).
    min_log_interval : smallest interval worth evaluating. Without this
        bound the search never ends when ``p_min`` already is the optimum
        (the previous recursive version then hit RecursionError).

    Returns
    -------
    (best_alpha, best_mse)

    Raises
    ------
    ValueError : from ``math.log10`` if ``p_min`` is not positive.
    """
    while True:
        log_interval = log_interval * 0.5
        if log_interval < min_log_interval:
            # Neighbours are now indistinguishable from p_min for practical purposes.
            return p_min, mse_min

        log_val = math.log10(p_min)
        candidates = [pow(10, log_val - log_interval), pow(10, log_val + log_interval)]

        # p_min is inserted first, so it wins ties against its neighbours.
        results = {p_min: mse_min}
        for alpha in candidates:
            results[alpha] = get_score_preliminary_r(alpha)

        new_p_min = min(results, key=results.get)
        if new_p_min == p_min:
            # No neighbour is better: do not declare convergence on a zero
            # improvement, look closer around p_min instead.
            continue

        new_mse_min = results[new_p_min]
        improvement = (mse_min - new_mse_min) / mse_min
        p_min, mse_min = new_p_min, new_mse_min
        if improvement < lvl:
            return p_min, mse_min

In [None]:
# Refine the preliminary alpha; the preliminary grid was spaced one log10 unit apart.
best_punishment, best_mse = determine_punishment_value(
    p_min=punishment_preliminary,
    mse_min=mse_preliminary,
    log_interval=1,
)
print("Best punishment: ", best_punishment, "\n Best mse: ", best_mse)