In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    SCORERS
)
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    PolynomialFeatures,
    StandardScaler,
)
from scipy.stats import loguniform

In [2]:
with open("../../data/processed/training_data.pickle", "rb") as f:
    train_df = pickle.load(f)
    
with open("../../data/processed/test_data.pickle", "rb") as f:
    test_df = pickle.load(f)
    
train_df.head()

train_df = train_df.loc[train_df["area"] > 0, :]

In [3]:
test_df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,area_log,seasons
400,9,5,jun,wed,93.3,49.5,297.7,14.0,28.0,34,4.5,0.0,8.16,2.215,summer
379,4,5,jan,sun,18.7,1.1,171.4,0.0,5.2,100,0.9,0.0,0.0,0.0,winter
286,7,6,jul,wed,91.2,183.1,437.7,12.5,12.6,90,7.6,0.2,0.0,0.0,summer
125,3,4,oct,sun,92.6,46.5,691.8,8.8,20.6,24,5.4,0.0,0.0,0.0,fall
216,4,4,sep,sun,92.4,124.1,680.7,8.5,16.9,60,1.3,0.0,29.48,3.417,fall


In [4]:
train_df.describe(include="all")

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,area_log,seasons
count,215.0,215.0,215,215,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215.0,215
unique,,,9,7,,,,,,,,,,,4
top,,,aug,sat,,,,,,,,,,,summer
freq,,,79,35,,,,,,,,,,,98
mean,4.823256,4.283721,,,91.190233,114.143256,572.299535,9.306047,19.264186,43.651163,4.19907,0.036279,26.640093,2.142047,
std,2.339506,1.053905,,,3.378789,60.798861,233.226827,4.170828,6.092371,15.282211,1.959448,0.446361,95.128713,1.298188,
min,1.0,2.0,,,75.1,3.2,15.3,1.8,4.6,15.0,0.4,0.0,0.09,0.086,
25%,3.0,4.0,,,90.5,82.9,492.2,6.8,16.0,33.0,2.7,0.0,2.04,1.112,
50%,5.0,4.0,,,91.8,111.7,666.7,8.5,20.1,41.0,4.0,0.0,6.43,2.006,
75%,7.0,5.0,,,93.05,141.25,723.25,11.9,23.4,53.0,5.4,0.0,17.525,2.919,


In [5]:
test_df.describe(include="all")

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,area_log,seasons
count,104.0,104.0,104,104,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104
unique,,,11,7,,,,,,,,,,,4
top,,,aug,sun,,,,,,,,,,,summer
freq,,,35,22,,,,,,,,,,,49
mean,4.826923,4.548077,,,90.141346,114.915385,555.520192,9.220192,19.165385,44.028846,3.747115,0.001923,8.792596,1.094837,
std,2.4552,1.440603,,,8.101967,66.039101,233.504397,4.487834,5.89388,15.746448,1.682575,0.019612,27.692033,1.305209,
min,1.0,2.0,,,18.7,1.1,26.6,0.0,2.2,15.0,0.9,0.0,0.0,0.0,
25%,3.0,4.0,,,90.275,72.6,465.6,6.675,16.075,32.75,2.2,0.0,0.0,0.0,
50%,4.0,4.0,,,91.6,108.4,647.1,8.4,19.3,41.5,4.0,0.0,0.805,0.58,
75%,7.0,5.0,,,92.825,143.15,713.225,11.4,22.3,53.25,4.9,0.0,6.42,2.00425,


In [6]:
X_train, y_train = train_df.drop("area_log", axis=1), train_df["area_log"]
X_test, y_test = test_df.drop("area_log", axis=1), test_df["area_log"]

In [7]:
categorical = ["X", "Y", "seasons"]
numeric = ['FFMC','DMC', 'DC', 'ISI', 'temp', 'RH','wind']
drop = ["month", "day", "area", "seasons", "rain"]

In [8]:
def normalize(x):
    return (x - x.mean()) / x.std()

X_train.loc[:, numeric] = X_train[numeric].apply(np.log, axis=1).apply(normalize, axis=1)

In [9]:
column_transformer = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown="ignore"), categorical),
    ("drop", drop)
)

In [10]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [11]:
results = {}
scoring_metric = "r2"

dummy_pipe = make_pipeline(
    column_transformer,
    DummyRegressor()
)

results["DummyRegressor"] = mean_std_cross_val_scores(
    dummy_pipe,
    X_train,
    y_train,
    cv=10,
    scoring=scoring_metric,
    return_train_score=True
)

pd.DataFrame(results)

Unnamed: 0,DummyRegressor
fit_time,0.002 (+/- 0.001)
score_time,0.001 (+/- 0.000)
test_score,-0.152 (+/- 0.316)
train_score,0.000 (+/- 0.000)


In [12]:
svr_pipe = make_pipeline(
    column_transformer,
    SVR()
)

results["SVR"] = mean_std_cross_val_scores(
    svr_pipe,
    X_train,
    y_train,
    cv=5,
    scoring=scoring_metric,
    return_train_score=True
)

pd.DataFrame(results)

Unnamed: 0,DummyRegressor,SVR
fit_time,0.002 (+/- 0.001),0.005 (+/- 0.002)
score_time,0.001 (+/- 0.000),0.002 (+/- 0.000)
test_score,-0.152 (+/- 0.316),-0.122 (+/- 0.111)
train_score,0.000 (+/- 0.000),0.215 (+/- 0.012)


In [13]:
ridge_pipe = make_pipeline(
    column_transformer,
    Ridge()
)

results["Ridge"] = mean_std_cross_val_scores(
    ridge_pipe,
    X_train,
    y_train,
    cv=5,
    scoring=scoring_metric,
    return_train_score=True
)

pd.DataFrame(results)

Unnamed: 0,DummyRegressor,SVR,Ridge
fit_time,0.002 (+/- 0.001),0.005 (+/- 0.002),0.004 (+/- 0.004)
score_time,0.001 (+/- 0.000),0.002 (+/- 0.000),0.001 (+/- 0.000)
test_score,-0.152 (+/- 0.316),-0.122 (+/- 0.111),-0.126 (+/- 0.065)
train_score,0.000 (+/- 0.000),0.215 (+/- 0.012),0.166 (+/- 0.026)


In [14]:
rf_pipe = make_pipeline(
    column_transformer,
    RandomForestRegressor(n_estimators=50)
)

results["RandomForestRegressor"] = mean_std_cross_val_scores(
    rf_pipe,
    X_train,
    y_train,
    cv=10,
    scoring=scoring_metric,
    return_train_score=True
)

pd.DataFrame(results)

Unnamed: 0,DummyRegressor,SVR,Ridge,RandomForestRegressor
fit_time,0.002 (+/- 0.001),0.005 (+/- 0.002),0.004 (+/- 0.004),0.035 (+/- 0.002)
score_time,0.001 (+/- 0.000),0.002 (+/- 0.000),0.001 (+/- 0.000),0.003 (+/- 0.000)
test_score,-0.152 (+/- 0.316),-0.122 (+/- 0.111),-0.126 (+/- 0.065),-0.273 (+/- 0.222)
train_score,0.000 (+/- 0.000),0.215 (+/- 0.012),0.166 (+/- 0.026),0.334 (+/- 0.018)


In [15]:
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei