## Data
There are three main datasets:

* Train.csv is the training set, which contains data through the end of 2011.
* Valid.csv is the validation set, which contains data from January 1, 2012 - April 30, 2012. You make predictions on this set throughout the majority of the competition. Your score on this set is used to create the public leaderboard.
* Test.csv is the test set, which won't be released until the last week of the competition. It contains data from May 1, 2012 - November 2012. Your score on the test set determines your final rank for the competition.

The key fields in train.csv are:

* SalesID: the unique identifier of the sale
* MachineID: the unique identifier of a machine.  A machine can be sold multiple times
* SalePrice: what the machine sold for at auction (only provided in train.csv)
* saledate: the date of the sale

## Evaluation
The evaluation metric is the RMSLE (root mean squared log error) between the actual and predicted auction prices.

In [None]:
# Importing libraries for EDA
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import FuncFormatter 
from matplotlib.ticker import StrMethodFormatter 
import seaborn as sns

In [None]:
# Load the combined train/validation CSV, parsing `saledate` as a datetime column.
df_raw = pd.read_csv(
    '/kaggle/input/bluebook-for-bulldozers/TrainAndValid.csv',
    parse_dates=['saledate'],
    low_memory=False,
)
df_raw.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) for the raw data.
df_raw.info()

In [None]:
# Number of missing values in each column of the raw data.
df_raw.isnull().sum()

In [None]:
# Order the rows chronologically by sale date (df_raw is reordered in place),
# then preview the ten earliest sale dates.
df_raw.sort_values('saledate', ascending=True, inplace=True)
df_raw.saledate.head(10)

In [None]:
# Work on an independent copy so the cleaning steps below leave df_raw untouched.
data = df_raw.copy(deep=True)

### Data Cleaning and Feature Engineering

In [None]:
# Expand `saledate` into numeric calendar features: new column name -> `.dt` attribute.
date_parts = {
    'saleYear': 'year',
    'saleMonth': 'month',
    'saleDay': 'day',
    'saleDayofWeek': 'dayofweek',
    'saleDayofYear': 'dayofyear',
}
for new_column, dt_attribute in date_parts.items():
    data[new_column] = getattr(data.saledate.dt, dt_attribute)

# The raw datetime column is no longer needed once its parts are extracted.
data.drop(columns='saledate', inplace=True)

In [None]:
# Columns that contain strings.
# `is_string_dtype` alone is not sufficient: on pandas >= 2.0 it inspects the
# values of object columns and returns False when strings are mixed with NaN,
# so object dtype is checked explicitly as well.
for label, content in data.items():
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        print(label)

In [None]:
# Convert string columns to ordered categoricals.
# Object dtype is checked explicitly because, on pandas >= 2.0,
# `is_string_dtype` returns False for object columns that mix strings with NaN,
# which would silently skip those columns.
for label, content in data.items():
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        data[label] = content.astype('category').cat.as_ordered()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) for the working copy.
data.info()

In [None]:
# Fraction of missing values per column (mean of the boolean null mask).
data.isna().mean()

In [None]:
# Print the name of every numeric column that has at least one null value.
for name, column in data.items():
    if not pd.api.types.is_numeric_dtype(column):
        continue
    if column.isnull().any():
        print(name)

In [None]:
# Impute numeric columns with their median, keeping a flag of what was missing.
for name, column in data.items():
    if not pd.api.types.is_numeric_dtype(column):
        continue

    missing_mask = pd.isnull(column)
    if not missing_mask.any():
        continue

    # Boolean indicator column recording which rows were originally null.
    data[name + '_is_missing'] = missing_mask

    # Replace the nulls with the column median.
    data[name] = column.fillna(column.median())

In [None]:
# Print the name of every column that is not numeric.
for name, column in data.items():
    if pd.api.types.is_numeric_dtype(column):
        continue
    print(name)

In [None]:
# Replace every non-numeric column with integer category codes.
for name, column in data.items():
    if pd.api.types.is_numeric_dtype(column):
        continue

    # Boolean indicator column recording which rows were null.
    data[name + '_is_missing'] = pd.isnull(column)

    # pandas encodes a missing category as -1; shift by one so missing becomes 0.
    category_codes = pd.Categorical(column).codes
    data[name] = category_codes + 1

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) for the working copy.
data.info()

In [None]:
# Number of missing values remaining in each column.
data.isnull().sum()

### Splitting Data into Training and Validation sets

In [None]:
# Time-based split of the combined file:
#   Training   = all samples up until the end of 2011
#   Validation = all samples from January 1, 2012 - April 30, 2012
#   (Test      = all samples from May 1, 2012 - November 2012)
sold_in_2012 = data['saleYear'] == 2012
data_train = data[~sold_in_2012]
data_valid = data[sold_in_2012]

In [None]:
# Separate the features (X) from the target column (y) for both subsets.
X_train = data_train.drop(columns='SalePrice')
y_train = data_train['SalePrice']
X_valid = data_valid.drop(columns='SalePrice')
y_valid = data_valid['SalePrice']

X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

## Modelling

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as MSErr
from sklearn.metrics import mean_absolute_error as MAErr
from sklearn.metrics import r2_score

Linear Regression

In [None]:
# Ordinary least squares on the unscaled training features.
lr = LinearRegression().fit(X_train, y_train)
print(lr.coef_)

In [None]:
# Same linear model, fitted on standardised training features.
s = StandardScaler()
X_train_ss = s.fit_transform(X_train)
lr2 = LinearRegression().fit(X_train_ss, y_train)
print(lr2.coef_)

In [None]:
# Pair each feature name with its standardised coefficient, sorted by coefficient.
coefficient_pairs = zip(X_train.columns, lr2.coef_)
pd.DataFrame(coefficient_pairs).sort_values(by=1)

In [None]:
# Three shuffled folds with a fixed seed, so the alpha searches are repeatable.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

Optimization Function

In [None]:
#Retrieve R2 scores for different alpha for LASSO or Ridge.
def optimize_alpha(alphas, x, y, model, kf):
    """Return the cross-validated R2 score obtained for each candidate alpha.

    Parameters
    ----------
    alphas : iterable of float
        Regularisation strengths to evaluate, in order.
    x : array-like
        Feature matrix; it is standardised here before cross-validation.
    y : array-like
        Target values.
    model : callable
        Estimator class that accepts ``alpha`` and ``max_iter`` keyword
        arguments (used in this notebook with ``Lasso`` and ``Ridge``).
    kf : cross-validation splitter
        Passed as ``cv`` to ``cross_val_predict``.

    Returns
    -------
    list of float
        One R2 score per entry of ``alphas``, in the same order.
    """
    # Scale and transform x.
    # NOTE(review): the scaler is fitted on all of ``x`` before the folds are
    # formed, so held-out rows contribute to the scaling statistics.
    x_scaled = StandardScaler().fit_transform(x)

    r2_scores = []
    for alpha in alphas:
        # max_iter must be an integer; scikit-learn's parameter validation
        # rejects the float 5e4.
        reg = model(alpha=alpha, max_iter=50000)
        y_pred = cross_val_predict(reg, x_scaled, y, cv=kf)
        r2_scores.append(r2_score(y, y_pred))

    return r2_scores

In [None]:
def alpha_r2_graph(alphas, R2s, xlabels, model):
    """Plot R2 against alpha on a log-scaled x-axis.

    Parameters
    ----------
    alphas : sequence of float
        Alpha values, plotted on the x-axis.
    R2s : sequence of float
        R2 score for each alpha, plotted on the y-axis.
    xlabels : sequence of float
        Positions (and labels) of the x-axis ticks.
    model : str
        Model name shown in the title; ``'Ridge'`` additionally switches the
        x tick labels to a thousands-separated number format.
    """
    df = pd.DataFrame(data={'alpha': alphas, 'R2': R2s})
    sns.set()

    # Line plot with a marker at every evaluated alpha.
    ax = sns.lineplot(data=df, x='alpha', y='R2', marker='o')

    # Size.
    ax.figure.set_size_inches(15, 6.92)

    # Title setup.
    ax.set_title("Optimizing Hyperparameter for {} Regression".format(model), fontsize=24)

    # X-axis setup.
    ax.set_xlabel("α", fontsize=22)
    ax.set_xscale('log')
    ax.set_xticks(xlabels)
    ax.set_xticklabels(xlabels, rotation=45, ha='right')
    if model == 'Ridge':
        # The 'g' format keeps fractional alphas readable (0.005 -> '0.005');
        # truncating with int() would label every alpha below 1 as '0'.
        ax.get_xaxis().set_major_formatter(FuncFormatter(lambda x, p: format(x, ',g')))

    # Y-axis setup: fixed ticks across the 0-1 range.
    ax.set_ylabel("R2", fontsize=22)
    ylabels = [0, 0.2, 0.4, 0.6, 0.8, 1]
    ax.set_yticks(ylabels)

    ax.tick_params(axis='both', which='major', labelsize=14)

In [None]:
# Lasso: candidate alphas, which double as the x-axis tick positions.
alphas = np.array([1e-5, 5e-5, 0.0001, 0.0005])
xlabels = list(alphas)

# Standardise the training features; `s` keeps the fitted scaler for later reuse.
s = StandardScaler()
X_train_lasso = s.fit_transform(X_train)

# Cross-validated R2 per alpha, then plot the curve.
r2s_l = optimize_alpha(alphas, X_train_lasso, y_train, Lasso, kf)
alpha_r2_graph(alphas, r2s_l, xlabels, 'LASSO')

Optimal Alpha is: 

In [None]:
# Scale the validation features with the scaler fitted on the training set.
X_test_opt_alpha = s.transform(X_valid)

# Fit on X_train_lasso, the standardised training matrix produced with `s`
# (the name `X_train_3` is not defined anywhere in this notebook).
# NOTE(review): alpha=0.005 lies outside the grid searched above
# (1e-5 .. 5e-4) — confirm this is the intended value.
lr3 = Lasso(alpha = 0.005).fit(X_train_lasso, y_train)

In [None]:
# Ridge
alphas_r = [0.005, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 80]
# Tick positions must follow the Ridge grid, not the Lasso `alphas`.
xlabels_r = list(alphas_r)

# Standardise the training features; `s` is rebound to this newly fitted scaler.
s = StandardScaler()
X_train_ridge = s.fit_transform(X_train)

#Determine R2s and graph.
r2s_r = optimize_alpha(alphas_r, X_train_ridge, y_train, Ridge, kf)
alpha_r2_graph(alphas_r, r2s_r, xlabels_r, 'Ridge')

Optimal alpha is: 

In [None]:
# Scale the validation features with the scaler fitted on the training set.
X_test_4 = s.transform(X_valid)

# Fit on X_train_ridge, the standardised training matrix produced with `s`
# (the name `X_train_4` is not defined anywhere in this notebook).
# NOTE(review): alpha=200 lies outside the grid searched above
# (0.005 .. 80) — confirm this is the intended value.
lr4 = Ridge(alpha = 200).fit(X_train_ridge, y_train)

Evaluation

In [None]:
#Inputs regression models, predictors, and y-values, outputs DataFrame of R2, Adjusted R2, RMSE, and MAE.
def summary_df(models, Xs, Y, index=None):
    """Build a table of fit metrics, one row per (model, feature matrix) pair.

    Parameters
    ----------
    models : sequence
        Fitted estimators exposing ``predict``.
    Xs : sequence
        Feature matrices, aligned with ``models``; each must have ``.shape``.
    Y : array-like
        True target values shared by every model.
    index : sequence of str, optional
        Row labels. Defaults to
        ``['All Variables', 'My Variables', 'LASSO', 'Ridge']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``R2``, ``Adjusted R2``, ``RMSE`` and ``MAE``.

    Raises
    ------
    ValueError
        If ``models``, ``Xs`` and the row labels differ in length.
    """
    if index is None:
        index = ['All Variables', 'My Variables', 'LASSO', 'Ridge']
    if not (len(models) == len(Xs) == len(index)):
        raise ValueError(
            "models, Xs and index must have the same length "
            "(got {}, {} and {})".format(len(models), len(Xs), len(index))
        )

    n_samples = len(Y)
    R2 = []
    ADJ_R2 = []
    RMSE = []
    MAE = []

    for fitted_model, X in zip(models, Xs):
        y_pred = fitted_model.predict(X)

        #R2.
        r2 = r2_score(Y, y_pred)
        R2.append(r2)

        #Adj R2: penalise R2 by the number of predictors in this matrix.
        n_features = X.shape[1]
        adj_r2 = 1.0 - (1.0 - r2) * (n_samples - 1.0) / (n_samples - n_features - 1.0)
        ADJ_R2.append(adj_r2)

        #RMSE: square root via ** 0.5, so no extra import is required.
        RMSE.append(MSErr(Y, y_pred) ** 0.5)

        #MAE.
        MAE.append(MAErr(Y, y_pred))

    return pd.DataFrame(
        data={'R2': R2, 'Adjusted R2': ADJ_R2, 'RMSE': RMSE, 'MAE': MAE},
        index=index,
    )

In [None]:
# Line up each model with its training matrix, in the row order summary_df labels them.
models, X_trains = map(list, zip(
    (lr, X_train),
    (lr2, X_train_ss),
    (lr3, X_train_lasso),
    (lr4, X_train_ridge),
))

summary_df(models, X_trains, y_train)