In [1]:
%autosave 0

Autosave disabled


# Rossmann: Random forest (Stage3 - Evaluate)

## Intro

Kaggle kernel: https://www.kaggle.com/paso84/xgboost-in-python-with-rmspe

## Usage

**Input parameters**
1. PROCESSED_TRAIN_CSV: The name of the file used to store the processed train data
1. MODEL_PKL: The file containing the serialized model to evaluate
1. METRICS_OUT: The output file used to save the calculated metrics

**Output**
1. A file that contains the calculated metrics


## Setup env

### Set global variables

In [2]:
!pwd

/opt/shared/notebooks


In [3]:
# Base directories used to build the file paths below.
# They are relative paths, so they resolve against the kernel's working directory.
DATASETS_DIR = '../data'
MODELS_DIR = '../models'
METRICS_DIR = '../metrics'

In [4]:
# this cell is tagged `parameters`
# NOTE(review): presumably the tag lets a notebook runner (e.g. papermill) override
# these defaults at execution time — confirm against the pipeline that runs this notebook.
# PROCESSED_TRAIN_CSV: processed training data read by this notebook.
# MODEL_PKL: pickled model to evaluate.
# METRICS_OUT: text file the computed metrics are written to.
PROCESSED_TRAIN_CSV = DATASETS_DIR + '/processed/tst-train.csv'
MODEL_PKL = MODELS_DIR + '/tst-model.pkl'
METRICS_OUT = METRICS_DIR + '/tst.metrics'

### Install required packages

Se il notebook è eseguito su una macchina pulita installare i pacchetti necessari con i seguenti comandi ...

In [5]:
#!curl https://raw.githubusercontent.com/andrea-gioia/boostrap.ai/master/???	 | bash

In [6]:
#!pip list

Se il notebook è eseguito all'interno di un ambiente virtuale conda con tutti i pacchetti specificati nel file di requirements già installati fare solo un check eseguendo i seguenti comandi ...

###  Dump environment

In [7]:
!python -V

Python 3.7.4


In [8]:
!conda env list

# conda environments:
#
base                     /opt/conda
custom                *  /opt/conda/envs/custom



In [9]:
#!conda list

In [10]:
#!pip list

### Import packages

In [11]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [14]:
from fastai.imports import *
import sys
import pandas as pd
from sklearn import model_selection
import xgboost as xgb
import pickle
import datetime
import numpy as np

### Set random seed

In [15]:
# Seed value applied to each pseudo-random generator seeded below.
seed_value = 42

# Seed, in this order: Python's built-in `random`, NumPy's global generator, PyTorch.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed_value)

# cuDNN backend flags: deterministic mode on, benchmark mode off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# When CUDA is available, also seed the current GPU and then every GPU.
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

### Define shared functions

In [16]:
# nope

## Stage 3: evaluate

### Define loss functions

In [17]:
def rmse(p, a):
    """Root mean squared error between predictions ``p`` and actual values ``a``.

    Both arguments must support element-wise subtraction, and the squared
    difference must expose a ``.mean()`` method (e.g. NumPy arrays or
    pandas Series).
    """
    squared_error = (a - p) ** 2
    return np.sqrt(squared_error.mean())

def toWeight(y):
    """Return per-observation weights ``w_i = 1 / y_i**2``, with ``w_i = 0`` where ``y_i == 0``.

    The zero weight removes the contribution of observations whose target
    value is zero from a weighted error sum.

    Parameters
    ----------
    y : array-like of numbers
        Target values (NumPy array, pandas Series, list, ...).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``y``.
    """
    # Work on a float array: squaring in a narrow integer dtype (e.g. int32)
    # silently overflows for large targets, and plain sequences have no `.shape`.
    y = np.asarray(y, dtype=float)
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1. / (y[ind] ** 2)
    return w


def rmspe(p, a):
    """Root mean squared percentage error of predictions ``p`` against actuals ``a``.

    Each squared error is scaled by the weight ``toWeight(a)`` assigns to it,
    so observations with ``a == 0`` add nothing to the sum. They are still
    counted when the mean is taken.
    """
    weights = toWeight(a)
    squared_error = (a - p) ** 2
    return np.sqrt(np.mean(weights * squared_error))

### Fill training set and validation set

In [18]:
# Read the processed training data.
df_processed_train = pd.read_csv(PROCESSED_TRAIN_CSV)
print('The input data frame {} size is {}\n'.format(PROCESSED_TRAIN_CSV, df_processed_train.shape))

# Keep every column except 'Date'.
not_date = df_processed_train.columns != 'Date'
df_processed_train = df_processed_train.loc[:, not_date]

# Unshuffled split: the first 75% of the rows go to training, the remaining 25% to validation.
df_train, df_valid = model_selection.train_test_split(
    df_processed_train,
    test_size=.25,
    shuffle=False,
)
print('Train set size: {}; Validation set size: {}\n'.format(df_train.shape[0], df_valid.shape[0]))

# 'Sales' is the target; every other remaining column is a feature.
X_train, y_train = df_train.loc[:, df_train.columns != 'Sales'], df_train['Sales']
X_valid, y_valid = df_valid.loc[:, df_valid.columns != 'Sales'], df_valid['Sales']

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

The input data frame ../data/processed/tst-train.csv size is (1017209, 37)

Train set size: 762906; Validation set size: 254303



((762906, 35), (762906,), (254303, 35), (254303,))

### Load model

In [19]:
# NOTE: unpickling can execute arbitrary code — only load model files from a trusted source.
# The `with` block closes the file handle as soon as the model has been deserialized.
with open(MODEL_PKL, 'rb') as model_file:
    rfm = pickle.load(model_file)

### Calculate metrics

In [20]:
def calculate_metrics(m, lossfunct=rmse):
    """Evaluate a fitted model on the notebook-level train and validation sets.

    Reads the globals ``X_train``, ``y_train``, ``X_valid`` and ``y_valid``.

    Parameters
    ----------
    m : object
        Fitted model exposing ``predict(X)`` and ``score(X, y)``.
    lossfunct : callable
        Called as ``lossfunct(predictions, actuals)``; defaults to ``rmse``.

    Returns
    -------
    list
        ``[loss_train, loss_valid, score_train, score_valid]``, followed by
        ``m.oob_score_`` when the model has that attribute.
    """
    splits = ((X_train, y_train), (X_valid, y_valid))
    losses = [lossfunct(m.predict(features), target) for features, target in splits]
    scores = [m.score(features, target) for features, target in splits]
    res = losses + scores
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    return res

In [21]:
# Evaluate the loaded model using RMSPE as the loss function.
# The result list is ordered [loss_train, loss_valid, score_train, score_valid].
metrics = calculate_metrics(rfm, lossfunct=rmspe)
# Bare expression so the notebook displays the list.
metrics

[0.02639480588395337,
 0.06847645742979377,
 0.9974198433116129,
 0.9797561541455969]

### Save metrics

In [22]:
# Write the first two entries of `metrics` (train and validation loss), one per line.
with open(METRICS_OUT, 'w') as fd:
    fd.writelines([
        'rmspe(train): {}\n'.format(metrics[0]),
        'rmspe(valid): {}\n'.format(metrics[1]),
    ])

In [23]:
METRICS_OUT

'../metrics/tst.metrics'

In [24]:
!cat {METRICS_OUT}

rmspe(train): 0.02639480588395337
rmspe(valid): 0.06847645742979377


In [None]:
# FINE