In [7]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Import data

In [8]:
# Set load_all to False to read only the first 1000 lines (faster iteration).
load_all = True
row_limit = None if load_all else 1000  # nrows=None makes read_csv load every row
df = pd.read_csv("all_episodes.csv", header=None, nrows=row_limit)
print(df.shape)

(59250, 147)


In [9]:
# Summary statistics of the columns at positions 143 and above.
# The slice end (148) is past the last column; iloc slicing clips it silently.
df.iloc[:, 143:148].describe()

Unnamed: 0,143,144,145,146
count,59250.0,59250.0,59250.0,59250.0
mean,732.323797,0.113553,0.0,0.007994
std,130.638931,0.52306,0.0,0.026082
min,581.0,0.0,0.0,0.0
25%,613.0,0.0,0.0,0.0
50%,710.0,0.0,0.0,0.0
75%,810.0,0.0,0.0,0.0
max,1485.0,9.0,0.0,0.366667


In [10]:
# Sizes of the three column groups; their sum is the expected number of columns per row.
features_nb, actions_nb, rewards_nb = 113, 30, 4
total_nb = sum((features_nb, actions_nb, rewards_nb))
print(features_nb, actions_nb, rewards_nb, '|', total_nb)

113 30 4 | 147


In [11]:
scale = False

# When StandardScaler is not used, a single column is divided by a fixed constant.
# NOTE(review): 143 equals features_nb + actions_nb, i.e. the first target column;
# confirm that 2000 is the intended normalisation constant for it.
unscaled_col = 143
unscaled_divisor = 2000

if scale:
    # Standardise every column of the frame.
    # NOTE(review): the scaler is fitted on the whole frame, which includes the target
    # columns and the rows that later end up in the test split; confirm this is intended.
    scaler = StandardScaler()
    scaler.fit(df)
    df_scaled = pd.DataFrame(scaler.transform(df))
else:
    # Work on a copy: `df_scaled = df` would only alias the raw frame, so the division
    # below would also modify `df` and would compound each time this cell is re-run.
    df_scaled = df.copy()
    df_scaled[unscaled_col] = df_scaled[unscaled_col] / unscaled_divisor

df_scaled.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,137,138,139,140,141,142,143,144,145,146
count,59250.0,59250.0,59250.0,59250.0,59250.0,59250.0,59250.0,59250.0,59250.0,59250.0,...,59250.0,59250.0,59250.0,59250.0,59250.0,59250.0,59250.0,59250.0,59250.0,59250.0
mean,0.013415,0.322093,0.13792,0.347885,0.271607,0.0,0.064846,0.0,0.089199,0.122171,...,0.0,0.005637,0.00238,0.00216,0.002042,0.000186,0.366162,0.113553,0.0,0.007994
std,0.030319,0.066764,0.158988,0.025922,0.345128,0.0,0.019556,0.0,0.025544,0.054507,...,0.0,0.07487,0.048725,0.04643,0.045145,0.013624,0.065319,0.52306,0.0,0.026082
min,0.0,0.0,0.003069,0.001001,0.005217,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.2905,0.0,0.0,0.0
25%,0.0,0.33,0.013299,0.338338,0.005217,0.0,0.054054,0.0,0.070175,0.075949,...,0.0,0.0,0.0,0.0,0.0,0.0,0.3065,0.0,0.0,0.0
50%,0.0,0.33,0.03376,0.338338,0.010783,0.0,0.054054,0.0,0.087719,0.075949,...,0.0,0.0,0.0,0.0,0.0,0.0,0.355,0.0,0.0,0.0
75%,0.0,0.34,0.340665,0.35035,0.712696,0.0,0.063063,0.0,0.105263,0.151899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.405,0.0,0.0,0.0
max,0.277778,0.47,0.761125,0.482482,0.847304,0.0,0.216216,0.0,0.263158,0.373418,...,0.0,1.0,1.0,1.0,1.0,1.0,0.7425,9.0,0.0,0.366667


In [12]:
# Split X, r: the leading feature + action columns are the inputs,
# the remaining columns up to total_nb are the targets.
split_at = features_nb + actions_nb
X = df_scaled.iloc[:, :split_at]
r = df_scaled.iloc[:, split_at:total_nb]

print(X.shape, r.shape)

# Offset by one row: afterwards position i of X holds original row i while
# position i of r holds original row i + 1. Index labels are NOT reset here,
# so the two frames only line up by position, not by label.
X = X.drop(index=X.index[-1])  # Drop the last one
r = r.drop(index=r.index[0])   # Drop the first one

print(X.shape, r.shape)

(59250, 143) (59250, 4)
(59249, 143) (59249, 4)


In [13]:
# Largest per-column mean among the columns of X.
max(X.describe().loc['mean'])

0.9995442961062634

## Sklearn

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor

In [15]:
# 80/20 train/test split with a fixed seed; rows are shuffled before splitting.
# NOTE(review): the rows presumably are consecutive steps of episodes (file name
# "all_episodes.csv"); a shuffled split places neighbouring steps on both sides of
# the split — confirm this is acceptable for the evaluation.
X_train, X_test, r_train, r_test = train_test_split(X, r, test_size=0.2, random_state=42, shuffle=True)

In [16]:
# Shapes of the four partitions produced by the split.
X_train.shape, X_test.shape, r_train.shape, r_test.shape

((47399, 143), (11850, 143), (47399, 4), (11850, 4))

In [17]:
from sklearn.metrics import mean_squared_error, explained_variance_score, max_error, mean_absolute_error, mean_squared_log_error, r2_score, median_absolute_error

def metrics(model, X_test, y_test, metrics_list=None):
    """Print a table of regression metrics for ``model`` on a held-out set.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Inputs handed to ``model.predict``.
    y_test : array-like
        Ground-truth targets the predictions are compared against.
    metrics_list : iterable of callables, optional
        Each callable is invoked as ``m(y_test, y_pred)`` and labelled with its
        ``__name__``. Defaults to the scikit-learn metrics imported in this cell.

    Returns
    -------
    None
        Results are printed, one metric per line. A metric that raises is
        reported as "Cannot be computed" together with the reason, and the
        remaining metrics are still evaluated.
    """
    y_pred = model.predict(X_test)
    print("\n-- METRICS --")

    if metrics_list is None:
        # Resolved lazily so callers supplying their own metrics do not need these names.
        metrics_list = [mean_squared_error, explained_variance_score,
                        mean_absolute_error, median_absolute_error,
                        mean_squared_log_error, r2_score]

    for m in metrics_list:
        try:
            line = '{:30}{:5.4f}'.format(m.__name__, m(y_test, y_pred))
        except Exception as exc:
            # Best effort: report the failure and continue. `Exception` (not a bare
            # except) lets KeyboardInterrupt through, and the handler no longer
            # touches a result variable that may never have been assigned.
            line = '{:30}Cannot be computed ({}: {})'.format(
                m.__name__, type(exc).__name__, exc)
        print(line)

    

### RandomForestRegressor

In [14]:
# Depth cap for the forest; the multi-output random forest cell further down reuses it.
max_depth = 30

# n_jobs=-1 fits (and predicts with) the trees on all CPU cores. With a fixed
# random_state the fitted forest is the same as with a single job; only the
# wall-clock time changes. The sequential fit was slow enough to be interrupted.
regr_rf = RandomForestRegressor(n_estimators=100, max_depth=max_depth,
                                random_state=2, n_jobs=-1)
regr_rf.fit(X_train, r_train)

metrics(regr_rf, X_test, r_test)

KeyboardInterrupt: 

### Multioutput linear regression

In [18]:
# Wrap LinearRegression in MultiOutputRegressor (one regressor per target column),
# fit it on the training split and report the metrics on the test split.
regr_multilin = MultiOutputRegressor(estimator=LinearRegression())
regr_multilin.fit(X=X_train, y=r_train)

metrics(model=regr_multilin, X_test=X_test, y_test=r_test)


-- METRICS --
mean_squared_error            0.0011
explained_variance_score      0.9869
mean_absolute_error           0.0048
median_absolute_error         0.0015
mean_squared_log_error        Cannot be computed
r2_score                      0.9869


### Multi-output random forest regressor (RFR)

In [21]:
# One random forest per target column. `max_depth` is the value assigned in the
# RandomForestRegressor cell above, so that cell must have been run first.
forest_template = RandomForestRegressor(
    n_estimators=100,
    max_depth=max_depth,
    random_state=0,
)
regr_multirfr = MultiOutputRegressor(forest_template)

regr_multirfr.fit(X_train, r_train)

metrics(regr_multirfr, X_test, r_test)


-- METRICS --
mean_squared_error            0.0080
explained_variance_score      0.9919
mean_absolute_error           0.0056
median_absolute_error         0.0001
mean_squared_log_error        Cannot be computed
r2_score                      0.9919


## Results analysis

In [19]:
# Re-index both test frames from 0 so they share the same labels.
x, y = (part.reset_index(drop=True) for part in (X_test, r_test))
print(x.shape, y.shape)

# Filter: keep only the rows whose column-144 target is strictly positive
f = y[144].gt(0)
x, y = x.loc[f], y.loc[f]
print(x.shape, y.shape)

(11850, 143) (11850, 4)
(779, 143) (779, 4)


In [20]:
# Summary statistics of the filtered targets (rows where column 144 is > 0).
y.describe()

Unnamed: 0,143,144,145,146
count,779.0,779.0,779.0,779.0
mean,0.489504,1.702182,0.0,0.035813
std,0.071539,1.240339,0.0,0.063673
min,0.3675,1.0,0.0,0.0
25%,0.4305,1.0,0.0,0.0
50%,0.4895,1.0,0.0,0.007778
75%,0.5265,2.0,0.0,0.037222
max,0.7415,9.0,0.0,0.366667


In [22]:
# Evaluate the multi-output linear model on the filtered test subset only.
metrics(regr_multilin, x, y)


-- METRICS --
mean_squared_error            0.0067
explained_variance_score      0.9931
mean_absolute_error           0.0204
median_absolute_error         0.0111
mean_squared_log_error        Cannot be computed
r2_score                      0.9923
