In [None]:
# !kaggle competitions download -c bluebook-for-bulldozers
# !unzip bluebook-for-bulldozers.zip
# !rm *.zip *.7z

In [None]:
# Data manipulations libraries
import pandas as pd
import numpy as np
 
# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
 
# Machine learning imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_log_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
 
# Misc
import joblib

In [None]:
# Kaggle locations. Both paths must end with "/" because file names are
# appended to them with plain string concatenation throughout the notebook.
INPUT_PATH = "../input/bluebook-for-bulldozers/"
OUTPUT_PATH = "/kaggle/working/"
# For a local run, point both at the current directory instead:
# INPUT_PATH = "./"
# OUTPUT_PATH = "./"

In [None]:
def display_all(df):
    """Render `df` with pandas' row and column display limits lifted."""
    unlimited = ("display.max_rows", None, "display.max_columns", None)
    # The options are only changed for the duration of the `with` block.
    with pd.option_context(*unlimited):
        display(df)

# Problem Definition

## 🚜 Predicting the Sale Price of Bulldozers using Machine Learning 🚜

![Competition image](https://storage.googleapis.com/kaggle-competitions/kaggle/3316/media/bulldozer.jpg)

The goal of the contest is to predict the sale price of a particular piece of heavy equipment at auction based on its usage, equipment type, and configuration.  The data is sourced from auction result postings and includes information on usage and equipment configurations.

Fast Iron is creating a "blue book for bull dozers," for customers to value what their heavy equipment fleet is worth at auction.

## About Fast Iron
This competition was launched under the [Kaggle Startup Program](https://medium.com/kaggle-blog). If you're a startup with a predictive modelling challenge, please apply!

# What data do we have?

For this competition, you are predicting the sale price of bulldozers sold at auctions.

The data for this competition is split into three parts:

* **Train.csv** is the training set, which contains data through the end of 2011.
* **Valid.csv** is the validation set, which contains data from January 1, 2012 - April 30, 2012. You make predictions on this set throughout the majority of the competition. Your score on this set is used to create the public leaderboard.
* **Test.csv** is the test set, which won't be released until the last week of the competition. It contains data from May 1, 2012 - November 2012. Your score on the test set determines your final rank for the competition.

The key fields in train.csv are:

* `SalesID`: the unique identifier of the sale
* `MachineID`: the unique identifier of a machine.  A machine can be sold multiple times
* `saleprice`: what the machine sold for at auction (only provided in train.csv)
* `saledate`: the date of the sale

There are several fields towards the end of the file on the different options a machine can have.  The descriptions all start with `machine configuration` in the data dictionary.  Some product types do not have a particular option, so all the records for that option variable will be null for that product type.  Also, some sources do not provide good option and/or hours data.

The **machine_appendix.csv** file contains the correct year manufactured for a given machine along with the make, model, and product class details. There is one machine id for every machine in all the competition datasets (training, evaluation, etc.).

In [None]:
# Read the combined train + validation file; `saledate` is parsed into a
# datetime column so that its date parts can be extracted later.
csv_path = INPUT_PATH + "TrainAndValid.csv"
train_df = pd.read_csv(csv_path, parse_dates=["saledate"], low_memory=False)

# Transposed preview: one line per column, easier to read for a wide frame.
train_df.head().T

Note that `SalePrice` is the variable that we want to predict (a.k.a. the *dependent variable*).

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
train_df.info()

We have a mix of **numerical** and **string** features. Also, a lot of the features (mainly the machine configuration ones) have missing values.

## Target variable analysis
Let's check the `SalePrice` distribution.

In [None]:
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Left: histogram + KDE of the target. `histplot` replaces the deprecated
# `distplot`; stat="density" keeps the normalised y-axis distplot used.
sns.histplot(train_df.SalePrice, label="SalePrice", bins=50, kde=True,
             stat="density", ax=ax0)
ax0.legend()

# Right: box plot of the same values, to make the skew and outliers visible.
sns.boxplot(x=train_df.SalePrice, ax=ax1)
plt.show()

The high positive skewness presented by the `SalePrice` distribution indicates that the majority of the sales were made at prices lower than the mean sale price. This is also evident in the box plot of the distribution.

## Feature correlation analysis

We'll look into the correlation between features and try to identify which ones have a high influence on the target variable.

### Numerical features

In [None]:
# Summary statistics of the numerical columns, one row per column.
desc_df = train_df.describe().T

# Share of rows that hold a value, expressed as a percentage (0-100) so the
# numbers match the "% non-null" column label.
desc_df["% non-null"] = desc_df["count"] / len(train_df) * 100

desc_df

Most of the numerical features are present in all entries, the exceptions being `auctioneerID` and `MachineHoursCurrentMeter` (which is absent in 50% of the dataset).

Let's check the feature correlations.

In [None]:
# Plotting the feature correlation matrix (only numerical features).
# The numeric columns are selected explicitly: recent pandas versions raise
# instead of silently skipping string columns in DataFrame.corr().
numeric_corr = train_df.select_dtypes(include="number").corr()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(numeric_corr, annot=True, cmap="YlGnBu", cbar=False, ax=ax)
plt.show()

The only features that have significant correlation with `SalePrice` are `YearMade` and `MachineID`, so we'll investigate them more deeply.

#### YearMade

`YearMade` indicates the registered `year of manufacturing` of the auctioned machine. Let's check the distribution of the sales and mean prices of each year.

In [None]:
# One palette shared by both panels
year_palette = sns.color_palette("Blues_d")

fig, (ax0, ax1) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(15, 10))

# Top panel: number of sales per manufacturing year
sns.countplot(data=train_df, x="YearMade", palette=year_palette, ax=ax0)
ax0.set_ylabel("Number of Sales")

# Bottom panel: mean sale price per manufacturing year
sns.barplot(data=train_df, x="YearMade", y="SalePrice", palette=year_palette, ax=ax1)
ax1.set_ylabel("Sale Prices Mean")

# The x axis is shared, so rotating the labels of the bottom panel is enough.
plt.xticks(rotation=90)
plt.show()

Year 1000 concentrates a high amount of manufactured machines. It's not clear if it's data collection error or just a placeholder for an unregistered year, so we'll just leave it as it is.

Aside from that, from the first plot we can see most of the auctioned machines were made around years 1998 and 2005, with few machines made in years after 2008.

The distribution of prices per year of manufacturing is as expected. Newer machines generally cost more than older ones, with few exceptions.

#### MachineID

The `MachineID` feature denotes an identifier for a particular machine. Machines may have multiple sales, so we can expect the same values in some entries. Let's check for unique values.

In [None]:
# How many distinct machines there are, and what share of the rows that is
machine_ids = train_df.MachineID
n_uniques = machine_ids.unique().size
uniques_rate = n_uniques / machine_ids.size
print(f"Number of unique MachineIDs: {n_uniques} -- Rate of uniques: {uniques_rate}")

OK! `MachineID` has *high cardinality*, so it's unlikely that its distribution will make some sense.

Let's plot a scatter plot of the `SalePrice` by `MachineID`. To support our understanding, we'll use the information from `datasource`, a feature that has a significant correlation with `MachineID`, for the plotting.

In [None]:
# SalePrice against MachineID, with each point coloured by its `datasource` value
train_df.plot(kind="scatter", x="MachineID", y="SalePrice", c="datasource", figsize=(15, 10));

At first, we can't draw any conclusion from this distribution other than that the majority of selling prices are located in the lower part of the plot, indicating that most of the prices are below the mean price, as we have seen before. Also, it seems that some data sources specialize in reporting specific machines.

### Non-numerical features

In [None]:
# Summary of the non-numerical (object) columns, one row per column.
desc_df = train_df.describe(include="O").T

# Share of rows that hold a value, expressed as a percentage (0-100) so the
# numbers match the "% non-null" column label.
desc_df["% non-null"] = desc_df["count"] / len(train_df) * 100

desc_df

Of all non-numerical features, only 6 of them are complete. Most of them are not even present in 50% of the data, so it may be difficult to find a correlation between any of them and `SalePrice`. But that doesn't mean that we'll just drop them.

For example, lets take a look at `Blade_Extension` feature.

In [None]:
# Missing values are turned into a separate "missing" class, purely for
# illustration purposes.
plt_data = train_df.fillna({"Blade_Extension": "missing"})

fig, (ax0, ax1) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(15, 10))

# Top panel: SalePrice distribution per class (x label left to the bottom panel)
sns.boxplot(data=plt_data, x="Blade_Extension", y="SalePrice", ax=ax0)
ax0.set(xlabel="")

# Bottom panel: number of rows per class
sns.countplot(data=plt_data, x="Blade_Extension", ax=ax1)

plt.show()

As we can see, the class information doesn't give us much valuable information, as the class distributions don't differ from each other significantly. But the lack of a class specification, by itself, seems to carry some predictive information. Our feature engineering will have to consider this kind of information.

# What defines success?

The evaluation metric for this competition is the **RMSLE** (*root mean squared log error*) between the actual and predicted auction prices.

$$
RMSLE = \sqrt{\frac{1}{n} \sum_{i = 1}^{n} \left(\log(predicted_i + 1) - \log(actual_i + 1)\right)^2}
$$

Although Scikit-Learn doesn't have a default implementation of this metric, the implementation is straight forward:

In [None]:
def root_mean_squared_log_error(y_true, y_pred):
    """Return the RMSLE: the square root of scikit-learn's mean squared log error."""
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

Sample submission files can be downloaded from the data page. Submission files should be formatted as follows:

* Have a header: "`SalesID`,`SalePrice`"
* Contain two columns
    * `SalesID`: SalesID for the validation set in sorted order
    * `SalePrice`: Your predicted price of the sale

# Feature Engineering?

## Extracting information from "saledate"

The `saledate` feature holds the **datetime** of the sale for each entry of the dataset. This kind of feature makes the data a *time series*, and most data scientists will agree that it is one of the most important data types in the development of machine learning models.

We'll extract as much information as possible from this feature.

In [None]:
def add_date_parts(df):
    """
    Expand the `saledate` column of `df` into calendar features.

    The frame is modified in place: thirteen `sale_*` columns are appended
    (in the order below) and `saledate` itself is dropped. The same frame is
    returned for convenience.
    """
    saledate = df.saledate

    df["sale_day"] = saledate.dt.day
    df["sale_month"] = saledate.dt.month
    df["sale_quarter"] = saledate.dt.quarter
    df["sale_year"] = saledate.dt.year
    df["sale_dayofweek"] = saledate.dt.dayofweek
    df["sale_dayofyear"] = saledate.dt.dayofyear

    # `Series.dt.weekofyear` was removed in pandas 2.0; the ISO calendar week
    # is its replacement. isocalendar() yields a nullable UInt32 column, so it
    # is converted back to a plain dtype like the other date parts
    # (float with NaN when some dates are missing, int otherwise).
    iso_week = saledate.dt.isocalendar().week
    df["sale_weekofyear"] = iso_week.astype(
        "float64" if iso_week.isna().any() else "int64")

    df["sale_is_month_start"] = saledate.dt.is_month_start
    df["sale_is_month_end"] = saledate.dt.is_month_end
    df["sale_is_quarter_start"] = saledate.dt.is_quarter_start
    df["sale_is_quarter_end"] = saledate.dt.is_quarter_end
    df["sale_is_year_start"] = saledate.dt.is_year_start
    df["sale_is_year_end"] = saledate.dt.is_year_end

    # Get rid of "saledate" column
    df.drop("saledate", axis=1, inplace=True)

    return df

In [None]:
# Replace `saledate` with its date parts. add_date_parts() drops the column,
# so this cell cannot be run twice on the same frame.
train_df = add_date_parts(train_df)
display_all(train_df.head().T)

## Downcast

We'll downcast the Dataframe to reduce the amount of memory used and speed up the operations that we'll perform later.

* **Numerical Columns:** Depending on your environment, pandas automatically creates int32, int64, float32 or float64 columns for numeric ones. If you know the min or max value of a column, you can use a subtype which is less memory consuming. You can also use an unsigned subtype if there is no negative value.
Here are the different subtypes you can use:  
`int8` / `uint8` : consumes 1 byte of memory, range between -128/127 or 0/255  
`bool` : consumes 1 byte, true or false  
`float16` / `int16` / `uint16` : consumes 2 bytes of memory, range between -32768 and 32767 or 0/65535  
`float32` / `int32` / `uint32` : consumes 4 bytes of memory, range between -2147483648 and 2147483647  
`float64` / `int64` / `uint64`: consumes 8 bytes of memory  
If one of your column has values between 1 and 10 for example, you will reduce the size of that column from 8 bytes per row to 1 byte, which is more than 85% memory saving on that column!


* **Categorical Columns:** Pandas stores categorical columns as objects. One of the reasons this storage is not optimal is that it creates a list of pointers to the memory address of each value of your column. For columns with low cardinality (the number of unique values is lower than 50% of the count of these values), this can be optimized by forcing pandas to use a virtual mapping table where all unique values are mapped via an integer instead of a pointer. This is done using the category datatype.

In [None]:
def downcast(df):
    """
    Return a copy of `df` whose columns use smaller dtypes where possible.

    * integer columns -> the narrowest of int8 / int16 / int32 that holds
      both the column minimum and maximum (left unchanged if none does);
    * float columns -> float32 when the column range fits (left unchanged
      otherwise, e.g. for very large values or an all-NaN column);
    * object columns -> ``category``.

    Every other column (booleans, datetimes, ...) is left untouched and the
    input frame itself is never modified.
    """
    df_copy = df.copy()

    for nm, col in df_copy.items():
        if pd.api.types.is_integer_dtype(col):
            col_min, col_max = col.min(), col.max()
            # Try the candidate widths from narrowest to widest. The bounds
            # are inclusive: a column reaching exactly 127 still fits int8.
            for int_type in (np.int8, np.int16, np.int32):
                limits = np.iinfo(int_type)
                if limits.min <= col_min and col_max <= limits.max:
                    df_copy[nm] = col.astype(int_type)
                    break
        elif pd.api.types.is_float_dtype(col):
            col_min, col_max = col.min(), col.max()
            # float16 is deliberately skipped (see the commented-out code in
            # the original version: half floats are not supported by pandas).
            limits = np.finfo(np.float32)
            if limits.min <= col_min and col_max <= limits.max:
                df_copy[nm] = col.astype(np.float32)
        elif pd.api.types.is_object_dtype(col):
            df_copy[nm] = col.astype("category")

    return df_copy

In [None]:
# Total memory footprint (in bytes, index included) before downcasting.
# deep=True makes pandas inspect object columns instead of only counting pointers.
old_memory_usage = train_df.memory_usage(index=True, deep=True).sum()

In [None]:
# Swap the frame for its downcast copy and show the resulting dtypes.
train_df = downcast(train_df)
train_df.info()

In [None]:
# Same measurement as before the downcast, for comparison.
new_memory_usage = train_df.memory_usage(index=True, deep=True).sum()

# Size after / size before. NOTE(review): this value is not used by the print
# below, which recomputes the relative saving on its own.
memory_gain_ration = new_memory_usage / old_memory_usage
print(f"Memory usage before/after downcasting: {old_memory_usage} / {new_memory_usage} -- Memory gain: {((old_memory_usage - new_memory_usage) / old_memory_usage * 100):.2f}%")

The processed dataset now has less than 5% of the original size in memory. A thanks from our computers.

Also, all object-type features were transformed into categories.

In [None]:
# The distinct labels of the `state` categorical column
train_df.state.cat.categories

In [None]:
# The integer code stored for each row (its position in `categories`)
train_df.state.cat.codes

# Preprocessing

Before we proceed, we would like to save the current state of the data, for future reference.

In [None]:
# Snapshot of the frame with date parts added and dtypes downcast, taken
# before categories are encoded and missing values are filled.
train_df.to_feather(OUTPUT_PATH + "TrainAndValid_raw.feather")

In [None]:
# Reload the snapshot written by the previous cell.
train_df = pd.read_feather(OUTPUT_PATH + "TrainAndValid_raw.feather")

## Converting categories to numbers

In [None]:
# Column names are snapshotted first because the loop overwrites columns of
# the frame it walks over.
for nm in list(train_df.columns):
    col = train_df[nm]
    # isinstance check instead of the deprecated pd.api.types.is_categorical_dtype
    if isinstance(col.dtype, pd.CategoricalDtype):
        # Replace the categorical values with their codes.
        # As the missing values are represented with category code "-1",
        # we add 1 to the codes, so missing becomes 0 and no code is negative.
        train_df[nm] = col.cat.codes + 1

## Filling missing numerical values with the median values of each column

In [None]:
# Walk over a snapshot of the column names: indicator columns are appended
# to the frame inside the loop.
for nm in list(train_df.columns):
    col = train_df[nm]
    # Only numerical columns are handled here
    if not pd.api.types.is_numeric_dtype(col):
        continue
    is_missing = col.isnull()
    if not is_missing.any():
        continue
    # Remember which rows were missing, then fill them with the column median
    train_df[nm + "_is_missing"] = is_missing
    train_df[nm] = col.fillna(col.median())

In [None]:
# Check the dtypes and non-null counts after encoding and filling.
train_df.info()

In [None]:
# Transposed preview of the first rows, with every column shown.
display_all(train_df.head().T)

In [None]:
# Fraction of missing values left in each column.
display_all(train_df.isna().sum() / len(train_df))

## Saving processed dataset

In [None]:
# Persist the encoded and filled frame so modelling can restart from here.
train_df.to_feather(OUTPUT_PATH + "TrainAndValid_preprocessed.feather")

In [None]:
# Reload the preprocessed frame written by the previous cell.
train_df = pd.read_feather(OUTPUT_PATH + "TrainAndValid_preprocessed.feather")

# Modelling

## Train and Validation sets
We'll split the DataFrame in train and validation sets. Quoting the Kaggle's description of the dataset.

> * **Train.csv** is the training set, which contains data through the end of 2011.
> * **Valid.csv** is the validation set, which contains data from January 1, 2012 - April 30, 2012. You make predictions on this set throughout the majority of the competition. Your score on this set is used to create the public leaderboard.

In [None]:
# Sales made in 2012 form the validation set; everything else is training data.
sold_in_2012 = train_df.sale_year == 2012
valid_df = train_df[sold_in_2012]
train_df = train_df[~sold_in_2012]

# Separate the independent variables (X) from the dependent variable (y)
y_train = train_df.SalePrice
X_train = train_df.drop("SalePrice", axis=1)
y_valid = valid_df.SalePrice
X_valid = valid_df.drop("SalePrice", axis=1)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

## Evaluation function

Now, we'll build our first model. But first, let's implement an evaluation function that will compute three performance metrics:

* *Mean Absolute error* (**MAE**)
* *R<sup>2</sup> regression score*
* *Root Mean Squared Logarithmic Error* (**RMSLE**)

In [None]:
def score_model(model):
    """
    Score `model` on the notebook's training and validation sets.

    Returns a dict with the MAE, R2 and RMSLE of the model's predictions,
    each reported for the train split and for the valid split.
    """
    fitted = model.predict(X_train)
    held_out = model.predict(X_valid)

    # (train key, valid key, metric function), in the order they are reported
    metric_table = (
        ("Train MAE", "Valid MAE", mean_absolute_error),
        ("Train R2", "Valid R2", r2_score),
        ("Train RMSLE", "Valid RMSLE", root_mean_squared_log_error),
    )

    scores = {}
    for train_key, valid_key, metric in metric_table:
        scores[train_key] = metric(y_train, fitted)
        scores[valid_key] = metric(y_valid, held_out)
    return scores

## Fit a Random Forest Regressor

Now, we'll fit a Random Forest Regressor with default parameters.

In [None]:
# Baseline: default hyperparameters; n_jobs=-1 uses all cores and the fixed
# random_state makes the fit repeatable.
model = RandomForestRegressor(n_jobs=-1, random_state=42)
 
model.fit(X_train, y_train)
 
# For a regressor, score() returns R^2 -- here on the validation set.
model.score(X_valid, y_valid)

In [None]:
# Using our evaluation function
score = score_model(model)
score

Not bad for our first model, and we have not touched the hyperparameters.

But can we do better? Let's check out.

## RandomizedSearchCV

In [None]:
# Grid parameters.
rs_params = {
    "n_estimators": 2 ** np.arange(1, 7, 2) * 10,
    "max_features": [0.3, 0.5, "auto", "sqrt", "log2"],
    "max_depth": np.arange(5, 36, 10),
    "min_samples_leaf": np.arange(1, 7, 2),
    "min_samples_split": np.arange(10, 17, 2),
    "max_samples": [1000]
}
 
# Instantiate  the grid search class
rs_model = RandomizedSearchCV(RandomForestRegressor(),
                              param_distributions=rs_params,
                              n_jobs=-1,
                              n_iter=200,
                              verbose=True,
                              random_state=42)
 
rs_model.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best cross-validation score
rs_model.best_params_

In [None]:
# Persist the fitted search object so the search does not have to be re-run.
joblib.dump(rs_model, OUTPUT_PATH + "rs_model.bz2", compress=True)

In [None]:
# Reload the search object saved above. joblib files are pickle-based:
# only load files you wrote yourself.
rs_model = joblib.load(OUTPUT_PATH + "rs_model.bz2")

Now, let's check how well the model found by the search fits the data.

In [None]:
%%time
# The search object is scored directly; presumably its predict() delegates to
# the refitted best estimator (scikit-learn's default refit=True) -- verify.
score_model(rs_model)

## Fit a RandomForest with the best found hyperparameters.

In [None]:
%%time
# Rebuild a forest from the best hyperparameters found by the search.
# NOTE(review): "max_samples" from the search space is not passed on, so this
# forest is not limited to 1000 rows per tree as the search candidates were.
model = RandomForestRegressor(
    n_estimators=rs_model.best_params_["n_estimators"],
    max_depth=rs_model.best_params_["max_depth"],
    max_features=rs_model.best_params_["max_features"],
    min_samples_leaf=rs_model.best_params_["min_samples_leaf"],
    min_samples_split=rs_model.best_params_["min_samples_split"],
    n_jobs=-1,
    random_state=42
    )

model.fit(X_train, y_train)

In [None]:
%%time
# Train/validation metrics of the forest rebuilt from the best hyperparameters
score_model(model)

In [None]:
# Persist the fitted forest.
joblib.dump(model, OUTPUT_PATH + "model.bz2", compress=True)

In [None]:
# Reload the forest saved above (pickle-based: only load files you wrote yourself).
model = joblib.load(OUTPUT_PATH + "model.bz2")

##  Reducing overfitting

The obtained model seems to be overfitted to the training set. By adjusting some parameters we can enforce variation in the estimators (random trees) pertaining to the model, improving its generalization.

During the RandomizedSearchCV process, some of those parameters were tested, and some of their chosen values are already good:
* **n_estimators:** In general the more trees the less likely the algorithm is to overfit. So try increasing this. The lower this number, the closer the model is to a decision tree, with a restricted feature set.
* **max_features:** This determines how many features each tree is randomly assigned. The smaller, the less likely to overfit, but too small will start to introduce under fitting.
* **max_depth:** This will reduce the complexity of the learned models, lowering the risk of overfitting. Try starting small, say 5-10, and increasing until you get the best result.
* **min_samples_leaf:** This has a similar effect to the max_depth parameter, it means the branch will stop splitting once the leaves have that number of samples each.

The discussion this information comes from can be found in this Stack Overflow thread:
https://stackoverflow.com/questions/20463281/how-do-i-solve-overfitting-in-random-forest-of-python-sklearn

As said before, the `n_estimators`, `max_depth` and `min_samples_leaf` parameter values seem good, but the `max_features` value seems strange. According to the *Scikit-Learn* documentation about the *RandomForestRegressor*:

> The number of features to consider when looking for the best split:
- If int, then consider `max_features` features at each split.
- If float, then `max_features` is a fraction and
  `int(max_features * n_features)` features are considered at each
  split.
- If "auto", then `max_features=n_features`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.

So, with our value of `auto`, the regressor will consider all features at each split. Let's change these values a bit to see if the predictions can be improved.

In [None]:
# We'll train a regressor that considers only half of the features at each
# split (max_features=0.5); the other values are set by hand.
model = RandomForestRegressor(n_estimators=320,
                              max_features=0.5,
                              max_depth=25,
                              min_samples_leaf=3,
                              min_samples_split=10,
                              n_jobs=-1,
                              random_state=42)

In [None]:
%%time
# Fit the max_features=0.5 forest on the training split.
model.fit(X_train, y_train)

In [None]:
%%time
# Train/validation metrics of the max_features=0.5 forest
score_model(model)

In [None]:
# Persist the max_features=0.5 forest.
joblib.dump(model, OUTPUT_PATH + "model_max_features.bz2", compress=True)

In [None]:
# Reload it (pickle-based: only load files you wrote yourself).
model = joblib.load(OUTPUT_PATH + "model_max_features.bz2")

# Preparing for submission

## Pre-processing function

In [None]:
# Reload the snapshot taken before encoding and filling: it still holds the
# category dtypes and the missing values, which is what a reference frame
# for preprocess() needs.
df_raw = pd.read_feather(OUTPUT_PATH + "TrainAndValid_raw.feather")

In [None]:
def adjust_types(df, ref_df):
    """
    Return a copy of `df` whose columns are cast to the dtypes of `ref_df`.

    Each column of `df` is cast to the dtype of the column with the same name
    in `ref_df`. For categorical reference columns this reuses the reference
    categories (and their ordered flag), so a given label always maps to the
    same code; labels unknown to the reference become missing values.

    Raises KeyError if `df` has a column that `ref_df` lacks.
    """
    df_copy = df.copy()

    # Iterate over a snapshot of the names: columns are overwritten in the loop.
    for nm in list(df_copy.columns):
        # The reference dtype decides the target type, whatever dtype the
        # incoming column currently has (e.g. plain strings read from a CSV).
        df_copy[nm] = df_copy[nm].astype(ref_df[nm].dtype)

    return df_copy


def preprocess(df, ref_df=None):
    """
    Apply the notebook's preprocessing steps to a copy of `df`.

    Steps: expand `saledate` into date parts, fix the dtypes, encode the
    categorical columns as integer codes (0 = missing) and fill missing
    numerical values with a median, adding "<column>_is_missing" indicators.

    Parameters
    ----------
    df : DataFrame with a datetime `saledate` column. It is not modified.
    ref_df : optional reference DataFrame (date parts added and downcast,
        but not yet encoded or filled). When given, dtypes, categories,
        medians and the set of indicator columns are all taken from it, so
        that the result has the same features as the reference data.
        When omitted, everything is derived from `df` itself.
    """
    df_copy = add_date_parts(df.copy())

    if ref_df is None:
        df_copy = downcast(df_copy)
    else:
        df_copy = adjust_types(df_copy, ref_df)

    # Iterate over a snapshot of the names: indicator columns are appended
    # to the frame inside the loop.
    for nm in list(df_copy.columns):
        col = df_copy[nm]

        # isinstance check instead of the deprecated is_categorical_dtype
        if isinstance(col.dtype, pd.CategoricalDtype):
            # Missing values have code -1, so after the shift they become 0.
            df_copy[nm] = col.cat.codes + 1
            continue
        if not pd.api.types.is_numeric_dtype(col):
            continue

        is_missing = col.isnull()
        # The reference (or the frame itself) decides where the median comes
        # from and whether an indicator column exists for this feature.
        median_source = col if ref_df is None else ref_df[nm]
        if median_source.isnull().any():
            df_copy[nm + "_is_missing"] = is_missing
        if is_missing.any():
            # Also covers values that are missing here but never in the
            # reference: they are filled without adding a new feature.
            df_copy[nm] = col.fillna(median_source.median())

    return df_copy

## Loading the Testing set

In [None]:
# Load the test set the same way as the training data (saledate parsed as datetime).
test_df = pd.read_csv(INPUT_PATH + "Test.csv", low_memory=False, parse_dates=["saledate"])
test_df.head().T

In [None]:
# Column names, non-null counts and dtypes of the raw test set.
test_df.info()

## Pre-process the Testing set

In [None]:
# Preprocess the test set against the raw training snapshot, so that dtypes,
# category codes and medians come from the training data.
# Overwrites test_df: re-running this cell requires reloading Test.csv first.
test_df = preprocess(test_df, df_raw)
display_all(test_df.head().T)

In [None]:
# Check the dtypes and non-null counts of the preprocessed test set.
test_df.info()

## Make predictions on the Testing set.

In [None]:
# Make the predictions
test_preds = model.predict(test_df)

# Prepare the submission DataFrame. The column names must match the required
# header exactly: "SalesID,SalePrice".
submission_df = pd.DataFrame({
    "SalesID": test_df["SalesID"],
    "SalePrice": test_preds,
})

# Preview only the first rows; the full frame has one row per test sale.
display_all(submission_df.head())

In [None]:
# Save the predictions file. index=False keeps the DataFrame index out of
# the file, so it contains exactly the two required columns.
submission_df.to_csv("test_predictions.csv", index=False)