# Random Forest Regression for Private data

This notebook creates a random forest regression model for private data.
This model should serve as benchmark for for public RFR + kNN classificaition. Hyper-parameter optimization at the end of the notebook.

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import functions that read from  INTERIM format and normalize the data
sys.path.append(os.path.abspath('../../src/data'))
from extract_for_model import extract_time_series
from extract_for_model import scale_time_series_single

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.tree import export_graphviz  

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

In [2]:
# Import required paths to input files
# Change the file to import if needed
from data_links import soft_prv_IS as input_IS
from data_links import soft_prv_BS as input_BS

In [3]:
# *******************************************************************
# Load selected feautures from INCOME STATEMENTS and BALANCE SHEETS
# Combine the data from these two sourcesinto one dataframe
# *******************************************************************


# Read from INCOME STATMENTS
source = input_IS

feat_IS = ['IQ_EBIT','IQ_TOTAL_OTHER_OPER','IQ_EARNING_CO','IQ_TOTAL_REV','IQ_GP','IQ_OTHER_OPER','IQ_INC_TAX','IQ_NET_INTEREST_EXP']

IS_df = extract_time_series(input_path = source,\
                              features = feat_IS,
                                 steps = 1,\
                              year_min = 2012,\
                              no_shift = False,\
                               no_test = True,\
                             quarterly = True)

# Read from BALANCE SHEETS
feat_BS =  ['IQ_AR','IQ_RE','IQ_TOTAL_ASSETS','IQ_TOTAL_CL','IQ_TOTAL_EQUITY','IQ_TOTAL_LIAB_EQUITY']
source = input_BS
BS_df = extract_time_series(input_path = source,\
                              features = feat_BS,
                                 steps = 1,\
                              year_min = 2012,\
                              no_shift = False,\
                               no_test = True,\
                             quarterly = True)

# Merge dataframe
data_set = IS_df.set_index(['company','year','quarter']).join(BS_df.set_index(['company','year','quarter'])).dropna()

====  extract_time_series metric ====
Size of RAW data: (2801, 11)
Size of RESHAPED data without NA: (1439, 27)
No of companies in RESHAPED data: 199
No of companies with more than 1 datapoint 186
====  extract_time_series metric ====
Size of RAW data: (2802, 9)
Size of RESHAPED data without NA: (1191, 21)
No of companies in RESHAPED data: 149
No of companies with more than 1 datapoint 141


In [4]:
data_set

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,IQ_EBIT-1,IQ_TOTAL_OTHER_OPER-1,IQ_EARNING_CO-1,IQ_TOTAL_REV-1,IQ_GP-1,IQ_OTHER_OPER-1,IQ_INC_TAX-1,IQ_NET_INTEREST_EXP-1,IQ_EBIT+0,IQ_TOTAL_OTHER_OPER+0,...,IQ_TOTAL_ASSETS+0,IQ_TOTAL_CL+0,IQ_TOTAL_EQUITY+0,IQ_TOTAL_LIAB_EQUITY+0,IQ_AR+1,IQ_RE+1,IQ_TOTAL_ASSETS+1,IQ_TOTAL_CL+1,IQ_TOTAL_EQUITY+1,IQ_TOTAL_LIAB_EQUITY+1
company,year,quarter,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
IQ102267046,2014.0,Q4,0.088781,1.544846,0.116778,2.962316,1.633627,0.762080,-0.028547,-0.000549,0.161020,2.177063,...,4.908565,3.426283,1.195798,4.908565,5.159870,0.107997,6.973966,5.030611,1.167655,6.973966
IQ102267046,2015.0,Q4,0.161020,2.177063,0.206265,4.305274,2.338084,0.836939,-0.048345,-0.003100,0.018564,3.163827,...,6.973966,5.030611,1.167655,6.973966,9.834407,0.471308,13.360882,10.328728,1.701488,13.360882
IQ102267046,2016.0,Q4,0.018564,3.163827,0.063661,6.537584,3.182391,1.223837,-0.058917,-0.013820,0.331846,5.182979,...,13.360882,10.328728,1.701488,13.360882,11.985600,1.220469,19.541203,15.430752,2.653407,19.541203
IQ106623823,2012.0,Q4,1.926050,3.656914,2.103023,10.964488,5.582964,-0.026461,-0.179953,-0.002980,1.497152,4.386311,...,4.836940,3.021281,1.815659,4.836940,3.287554,4.263869,6.776898,2.493009,4.283889,6.776898
IQ106623823,2014.0,Q4,1.497152,4.386311,1.504518,13.308442,5.883463,-0.014463,-0.007515,-0.000149,5.047800,8.238844,...,9.691305,5.183944,4.131623,9.691305,8.966478,6.675391,15.255244,8.559823,6.695421,15.255244
IQ106623823,2015.0,Q4,5.047800,8.238844,4.435671,28.874548,13.286644,-0.004449,0.598315,-0.013814,10.287455,10.362958,...,15.255244,8.559823,6.695421,15.255244,10.281290,18.720058,24.093039,5.352950,18.740088,24.093039
IQ106623823,2016.0,Q4,10.287455,10.362958,8.165426,38.532112,20.650414,-0.095230,2.103971,-0.018058,12.860130,10.944211,...,24.093039,5.352950,18.740088,24.093039,8.753013,15.480383,20.385597,4.885185,15.500413,20.385597
IQ106623823,2017.0,Q4,12.860130,10.944211,11.955837,47.906470,23.804340,-0.126542,0.903810,-0.000483,9.656718,14.559392,...,20.385597,4.885185,15.500413,20.385597,8.150318,20.124046,26.039179,5.895102,20.144077,26.039179
IQ108539514,2015.0,Q4,-4.697062,7.873389,-4.452902,3.176327,3.176327,-0.044888,-0.347263,-0.103103,-5.156717,8.825083,...,4.214927,2.050475,0.820184,4.214927,0.368286,-29.746332,3.743672,3.658454,0.085218,3.743672
IQ108539514,2016.0,Q4,-5.156717,8.825083,-4.642786,3.668366,3.668366,-0.156615,-0.574327,-0.060396,-5.641239,10.696273,...,3.743672,3.658454,0.085218,3.743672,0.388795,-31.688515,1.516911,3.863248,-4.214742,1.516911


In [5]:
# ****************************
# Calculate percentage changes
# ****************************

for feature in feat_IS+feat_BS:
    data_set[feature+'_PERC+1'] = (data_set[feature+'+1']-data_set[feature+'+0'])/data_set[feature+'+0']
    data_set[feature+'_PERC+0'] = (data_set[feature+'+0']-data_set[feature+'-1'])/data_set[feature+'-1']

data_set=data_set.replace(np.nan, 0)
data_set=data_set.replace(np.inf, 0)
data_set.describe()

Unnamed: 0,IQ_EBIT-1,IQ_TOTAL_OTHER_OPER-1,IQ_EARNING_CO-1,IQ_TOTAL_REV-1,IQ_GP-1,IQ_OTHER_OPER-1,IQ_INC_TAX-1,IQ_NET_INTEREST_EXP-1,IQ_EBIT+0,IQ_TOTAL_OTHER_OPER+0,...,IQ_RE_PERC+1,IQ_RE_PERC+0,IQ_TOTAL_ASSETS_PERC+1,IQ_TOTAL_ASSETS_PERC+0,IQ_TOTAL_CL_PERC+1,IQ_TOTAL_CL_PERC+0,IQ_TOTAL_EQUITY_PERC+1,IQ_TOTAL_EQUITY_PERC+0,IQ_TOTAL_LIAB_EQUITY_PERC+1,IQ_TOTAL_LIAB_EQUITY_PERC+0
count,393.0,393.0,393.0,393.0,393.0,393.0,393.0,393.0,393.0,393.0,...,393.0,393.0,393.0,393.0,393.0,393.0,393.0,393.0,393.0,393.0
mean,2.436117,13.700068,1.307412,38.998188,16.136184,4.383049,0.549431,-0.097204,2.901559,15.130652,...,1.627749,11.264522,0.205275,0.554357,0.252752,0.60872,0.154423,0.452259,0.205275,0.554357
std,12.738459,24.927098,8.243452,109.577633,33.539058,13.999794,3.255375,2.84733,13.481713,29.010443,...,12.426,195.554577,0.513568,4.032915,0.65684,3.114355,3.312298,4.376862,0.513568,4.032915
min,-32.40335,0.135089,-38.471886,0.179212,0.14894,-4.835851,-6.169369,-23.881874,-17.57276,0.69013,...,-53.443964,-359.051095,-0.848384,-0.651211,-0.861276,-0.839204,-50.458354,-50.458354,-0.848384,-0.651211
25%,-0.163072,3.262925,-0.075083,4.568804,3.346213,0.059519,-0.155232,-0.005021,-0.17028,3.604986,...,-0.207162,-0.301722,-0.025675,-0.024403,-0.072486,-0.072486,-0.064209,-0.06371,-0.025675,-0.024403
50%,0.261072,5.9203,0.301718,9.628278,6.040035,0.807794,0.002637,0.008852,0.311397,7.214753,...,0.111622,0.102028,0.102377,0.111661,0.130652,0.130652,0.091883,0.093393,0.102377,0.111661
75%,1.275034,12.244316,1.195184,20.575626,13.470788,3.371642,0.228912,0.109767,1.572083,13.690802,...,0.549096,0.548538,0.302454,0.321799,0.37344,0.417203,0.29213,0.328499,0.302454,0.321799
max,110.28938,241.38919,64.89781,739.3381,274.14816,163.69472,28.467388,15.652977,113.530655,305.57776,...,180.708839,3840.142857,4.430074,75.694828,5.864289,48.803173,23.531032,53.883537,4.430074,75.694828


In [6]:
# ****************************
# Create filters for data segmentation
# ****************************
res = data_set
target_name = 'IQ_TOTAL_REV+0'

mask1 = res[target_name] < 5
mask2 = (res[target_name] >= 5 ) & (res[target_name] < 10 )
mask3 = (res[target_name] >= 10 ) & (res[target_name] < 50 )
mask4 = (res[target_name] >= 50)

In [7]:
predictors = ['IQ_TOTAL_OTHER_OPER_PERC+0','IQ_TOTAL_REV_PERC+0','IQ_GP_PERC+0',
             'IQ_AR_PERC+0','IQ_RE_PERC+0','IQ_TOTAL_ASSETS_PERC+0', 'IQ_EBIT_PERC+0','IQ_EBIT+0']
target     =  'IQ_EBIT_PERC+1'

In [9]:
masks      = [mask1,mask2,mask3,mask4]
rf = RandomForestRegressor(n_estimators=100,random_state = 0)
columns = ['Segment','MAE','MSE']
segment_summary = pd.DataFrame(columns=columns)

for idx,mask in enumerate(masks):
    segment     = idx+1
    #
    #
    res = data_set[mask].copy()
    X_train, X_test, y_train, y_test = train_test_split(np.array(res[predictors]), np.array(res[target]), test_size=0.33, random_state=42)
    X_previous=X_test[:,7]
    X_train=np.delete(X_train, -1, axis=1)
    X_test=np.delete(X_test, -1,axis=1)
    X_previous
    X_previous=pd.DataFrame(X_previous)  
    #
    rf.fit(X_train, y_train)
    predictions = rf.predict(X_test)
    #
    real_val=(pd.DataFrame(y_test+1))*X_previous
    predicted_real=(pd.DataFrame(predictions)+1)*(X_previous)
    #
    export_graphviz(rf.estimators_[0], out_file='tree_from_forest_private_seg_'+str(idx)+'.dot')
    #
    MSE = mse(real_val,predicted_real)
    MAE = mae(real_val,predicted_real)
    row = pd.Series([segment,MAE,MSE],index=columns)
    segment_summary = segment_summary.append(row,ignore_index=True)

In [10]:
segment_summary

Unnamed: 0,Segment,MAE,MSE
0,1.0,0.445999,0.481006
1,2.0,2.693081,35.433159
2,3.0,4.828477,128.786835
3,4.0,10.478514,275.193638


In [11]:
# ****************************
# Hyperparameter optimization
# ****************************
#
#  Varies selected hyper-parameters and collects them to the dictionary
#
n_estimator_range = [100,200,400,800,1600]
criteria = ['mse','mae']
max_depths = [1,3,5,7,9,11]

opti_results ={}
for n_est in n_estimator_range:
    for max_depth in max_depths:
        for criterion in criteria:
            for idx,mask in enumerate(masks):
                segment = idx+1
                #
                #
                res = data_set[mask].copy()
                X_train, X_test, y_train, y_test = train_test_split(np.array(res[predictors]), np.array(res[target]), test_size=0.33, random_state=42)
                X_previous = X_test[:,7]
                X_train = np.delete(X_train, -1, axis=1)
                X_test = np.delete(X_test, -1,axis=1)
                X_previous = pd.DataFrame(X_previous)  
                #
                rf = RandomForestRegressor(n_estimators=n_est,
                                           criterion=criterion,
                                           max_depth=max_depth,
                                           random_state = 0)
                rf.fit(X_train, y_train)
                predictions = rf.predict(X_test)
                #
                real_val=(pd.DataFrame(y_test+1))*X_previous
                predicted_real=(pd.DataFrame(predictions)+1)*(X_previous)
                #
                #export_graphviz(rf.estimators_[0], out_file='tree_from_forest_private_seg_'+str(idx)+'.dot')
                #
                MSE = mse(real_val,predicted_real)
                MAE = mae(real_val,predicted_real)
                row = pd.Series([segment,MAE,MSE])
                #
                key_tuple = (segment,n_est,max_depth,criterion)
                #              
                opti_results[key_tuple] = row
                print("{} : {}".format(key_tuple,row) )             



(1, 100, 1, 'mse') : 0    1.000000
1    0.293306
2    0.185965
dtype: float64
(2, 100, 1, 'mse') : 0     2.000000
1     2.112899
2    22.540247
dtype: float64
(3, 100, 1, 'mse') : 0      3.000000
1      4.056217
2    104.771435
dtype: float64
(4, 100, 1, 'mse') : 0      4.000000
1     10.483961
2    298.255232
dtype: float64
(1, 100, 1, 'mae') : 0    1.000000
1    0.285560
2    0.156004
dtype: float64
(2, 100, 1, 'mae') : 0     2.000000
1     1.916466
2    20.799685
dtype: float64
(3, 100, 1, 'mae') : 0     3.000000
1     3.568355
2    77.724270
dtype: float64
(4, 100, 1, 'mae') : 0      4.000000
1     10.949657
2    267.108637
dtype: float64
(1, 100, 3, 'mse') : 0    1.000000
1    0.429558
2    0.450072
dtype: float64
(2, 100, 3, 'mse') : 0     2.000000
1     2.447933
2    27.961026
dtype: float64
(3, 100, 3, 'mse') : 0      3.000000
1      4.269334
2    116.580103
dtype: float64
(4, 100, 3, 'mse') : 0      4.000000
1     10.687665
2    265.706241
dtype: float64
(1, 100, 3, 'mae') : 0

(2, 400, 1, 'mae') : 0     2.000000
1     1.863631
2    19.408668
dtype: float64
(3, 400, 1, 'mae') : 0     3.000000
1     3.366094
2    70.153618
dtype: float64
(4, 400, 1, 'mae') : 0      4.000000
1     10.900419
2    262.966485
dtype: float64
(1, 400, 3, 'mse') : 0    1.000000
1    0.383187
2    0.298387
dtype: float64
(2, 400, 3, 'mse') : 0     2.000000
1     2.394869
2    26.557416
dtype: float64
(3, 400, 3, 'mse') : 0      3.000000
1      4.241123
2    107.525493
dtype: float64
(4, 400, 3, 'mse') : 0      4.000000
1     10.508403
2    262.702237
dtype: float64
(1, 400, 3, 'mae') : 0    1.000000
1    0.404150
2    0.312978
dtype: float64
(2, 400, 3, 'mae') : 0     2.000000
1     2.201253
2    23.622599
dtype: float64
(3, 400, 3, 'mae') : 0     3.000000
1     3.796658
2    93.700221
dtype: float64
(4, 400, 3, 'mae') : 0      4.000000
1     11.200872
2    288.301873
dtype: float64
(1, 400, 5, 'mse') : 0    1.000000
1    0.413033
2    0.355346
dtype: float64
(2, 400, 5, 'mse') : 0   

(3, 1600, 3, 'mse') : 0      3.000000
1      4.276445
2    108.500333
dtype: float64
(4, 1600, 3, 'mse') : 0      4.000000
1     10.231008
2    256.002539
dtype: float64
(1, 1600, 3, 'mae') : 0    1.000000
1    0.417728
2    0.360635
dtype: float64
(2, 1600, 3, 'mae') : 0     2.000000
1     2.150802
2    23.402632
dtype: float64
(3, 1600, 3, 'mae') : 0     3.000000
1     3.857516
2    97.533148
dtype: float64
(4, 1600, 3, 'mae') : 0      4.000000
1     10.865509
2    276.613398
dtype: float64
(1, 1600, 5, 'mse') : 0    1.000000
1    0.430707
2    0.387479
dtype: float64
(2, 1600, 5, 'mse') : 0     2.000000
1     2.464223
2    30.623678
dtype: float64
(3, 1600, 5, 'mse') : 0      3.000000
1      4.493946
2    113.118356
dtype: float64
(4, 1600, 5, 'mse') : 0      4.000000
1      9.691063
2    247.613884
dtype: float64
(1, 1600, 5, 'mae') : 0    1.000000
1    0.455544
2    0.455719
dtype: float64
(2, 1600, 5, 'mae') : 0     2.000000
1     2.355579
2    28.559675
dtype: float64
(3, 1600, 

In [12]:
# Print the results sorted by error for each segment:
columns     = ['Segment','Hyper','MAE','MSE']
opti_sum_pd = pd.DataFrame(columns=columns)

for k,v in sorted(opti_results.items(),key = lambda kv: (kv[1][1],kv[1][2])):
    row = pd.Series([k[0],k[1:],v[1],v[2]],index = columns)
    opti_sum_pd = opti_sum_pd.append(row,ignore_index=True)
    
for segment in opti_sum_pd['Segment'].unique():
    print(opti_sum_pd[opti_sum_pd['Segment']==segment])
    

   Segment            Hyper       MAE       MSE
0        1    (400, 1, mse)  0.277409  0.165287
1        1    (100, 1, mae)  0.285560  0.156004
2        1    (800, 1, mae)  0.289899  0.165105
3        1    (400, 1, mae)  0.290883  0.164601
4        1   (1600, 1, mae)  0.290950  0.169135
5        1    (200, 1, mae)  0.292372  0.165363
6        1    (100, 1, mse)  0.293306  0.185965
7        1    (800, 1, mse)  0.294658  0.176796
8        1    (200, 1, mse)  0.295789  0.174024
9        1   (1600, 1, mse)  0.301524  0.186038
10       1    (100, 3, mae)  0.342828  0.214507
11       1    (100, 5, mae)  0.352426  0.217747
12       1    (100, 9, mae)  0.359130  0.230076
13       1    (100, 7, mae)  0.362947  0.229375
14       1    (200, 3, mae)  0.371233  0.245949
15       1   (100, 11, mae)  0.376701  0.223139
16       1    (400, 3, mse)  0.383187  0.298387
17       1    (800, 3, mse)  0.385462  0.316431
18       1   (1600, 3, mse)  0.398367  0.352796
19       1   (400, 11, mse)  0.400469  0