# Random Forest with Hyperparameter Tuning and Features Seclection

In [1]:
import time

import numpy as np
import pandas as pd
import pyspark
import sys

In [2]:
import pyspark.sql.functions as fn

In [3]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [4]:
# Check spark app name
spark.sparkContext.appName

'PySparkShell'

In [5]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [6]:
# print runtime versions
# Python version
sys.version

'3.8.10 (default, Jun  2 2021, 10:49:15) \n[GCC 9.4.0]'

In [7]:
# Spark version
spark.version

'3.1.2'

### Cleaning Data

In [8]:
import databricks.koalas as ks

In [9]:
df = ks.read_csv('data/AF_Stock_TW_2603.TW.csv')

In [10]:
df = df.drop('key_0', axis='columns')

In [11]:
tar = ks.DataFrame(df['close'].shift(periods=20).rename('label'))

In [12]:
ks.set_option('compute.ops_on_diff_frames', True)
df = ks.concat([df, tar], axis=1)

In [13]:
# 保留前 100 row 空值小於等於 60% 的 column
cond = df.iloc[:101,:].isnull().sum()/100 <= 0.6
df = df[cond[cond == True].index.to_numpy()]

In [14]:
# 看 58 row 以後的空值
df.iloc[58:,:].isnull().any().sum()

0

In [15]:
# 取 58 row 以後當新的 df # index reset
df = df.iloc[58:,:].reset_index(drop=True)

In [16]:
df.shape

(2508, 184)

In [17]:
sdf = df.to_spark()

In [18]:
colTrain = list(df.columns[:-1])

In [19]:
# Vector assembled set of features 
# (assemble only the columns you want to MinMaxScale)
assembler = VectorAssembler(inputCols=colTrain, outputCol="features")
output = assembler.transform(sdf)

In [20]:
# select data for building model
model_df = output.select(['features','label'])

In [21]:
model_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)



### Split Data - Train & Test sets

In [22]:
# use Random Forest to train on the training set
train_df, test_df = model_df.randomSplit([0.70, 0.30], seed=42)

In [23]:
train_df.count(), len(train_df.columns)

(1741, 2)

In [24]:
test_df.count(), len(test_df.columns)

(767, 2)

### Build Random Forest Model

In [25]:
rf = RandomForestRegressor()
rf_model = rf.fit(train_df)

In [26]:
# predict on the test set
model_predictions = rf_model.transform(test_df)

In [27]:
# print prediction
model_predictions.show(10)

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|[19.2769391502943...|19.276939392089844| 19.18613926767244|
|[19.6624788274294...|18.660076141357425| 19.10773054039324|
|[19.8937997357087...|19.276939392089844| 19.18613926767244|
|[19.8937998275340...|17.619121551513672|19.019794481807274|
|[20.4335541213312...|18.852846145629883| 19.72018766766709|
|[21.2046327145946...|19.778139114379883|20.249421125173686|
|[21.2817385819022...|19.084169387817383| 19.78803131285374|
|[21.5901708137132...| 19.35404586791992|20.057975469364557|
|(183,[0,1,2,3,4,5...| 19.97090721130371|20.249421125173686|
|[19.4311537671171...|20.317893981933597| 20.30185747590754|
+--------------------+------------------+------------------+
only showing top 10 rows



### Evaluate Model

In [28]:
evaluatorRMSE = RegressionEvaluator().setLabelCol('label').setPredictionCol("prediction").setMetricName("rmse")

In [29]:
RMSE = evaluatorRMSE.evaluate(model_predictions)
RMSE

1.282201095799959

In [30]:
evaluatorR2 = RegressionEvaluator().setLabelCol('label').setPredictionCol("prediction").setMetricName("r2")

In [31]:
R2 = evaluatorR2.evaluate(model_predictions)
R2

0.9788788645820307

### Hyperparameter Tuning

In [32]:
evaluator = RegressionEvaluator()
rf = RandomForestRegressor()

In [33]:
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [10,20])
             .addGrid(rf.maxBins, [20,30])
             .addGrid(rf.numTrees, [5])
             .build())

In [34]:
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, 
                    evaluator=evaluator, numFolds=5)

In [35]:
cv_model = cv.fit(train_df)

In [36]:
best_rf_model = cv_model.bestModel

In [37]:
f'Best Param(maxDepth): {best_rf_model._java_obj.getMaxDepth()}'

'Best Param(maxDepth): 20'

In [38]:
f'Best Param(maxBins): {best_rf_model._java_obj.getMaxBins()}'

'Best Param(maxBins): 30'

In [39]:
best_rf_model

RandomForestRegressionModel: uid=RandomForestRegressor_4e4168c47f84, numTrees=5, numFeatures=183

### Evaluate Tuned Model

In [40]:
# Generate predictions for entire dataset
model_predictions = best_rf_model.transform(test_df)

In [41]:
RMSE = evaluatorRMSE.evaluate(model_predictions)
RMSE

0.8749532201014778

In [42]:
R2 = evaluatorR2.evaluate(model_predictions)
R2

0.9901649937263322

In [48]:
# FEATURE IMPORTANCES
best_rf_model.featureImportances

SparseVector(183, {0: 0.0, 1: 0.0001, 2: 0.0, 3: 0.0, 4: 0.0003, 5: 0.0043, 6: 0.0001, 7: 0.0007, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.1355, 12: 0.0, 13: 0.0002, 14: 0.0, 15: 0.0002, 16: 0.1652, 17: 0.0, 18: 0.0, 19: 0.0, 20: 0.0, 21: 0.015, 22: 0.0, 23: 0.0, 24: 0.0011, 25: 0.0, 26: 0.0, 27: 0.0049, 29: 0.0, 30: 0.0003, 33: 0.0003, 35: 0.0142, 36: 0.001, 37: 0.0014, 38: 0.0, 39: 0.0, 40: 0.0006, 41: 0.0, 42: 0.0, 43: 0.0, 44: 0.0001, 45: 0.0001, 46: 0.0001, 47: 0.0, 48: 0.0007, 49: 0.0002, 50: 0.0001, 51: 0.0002, 52: 0.0002, 53: 0.0, 54: 0.0, 55: 0.0002, 56: 0.0001, 57: 0.0, 58: 0.0, 59: 0.0001, 60: 0.0008, 61: 0.0, 62: 0.0, 63: 0.0, 64: 0.0, 65: 0.0013, 66: 0.0001, 67: 0.0002, 68: 0.0, 69: 0.0, 70: 0.0, 71: 0.0007, 72: 0.0001, 73: 0.0, 74: 0.0, 75: 0.0, 76: 0.0, 77: 0.0, 78: 0.0, 79: 0.0003, 80: 0.2938, 81: 0.0, 82: 0.0007, 83: 0.0, 84: 0.0, 85: 0.0001, 86: 0.0003, 87: 0.1565, 88: 0.0, 89: 0.045, 90: 0.0, 93: 0.0, 95: 0.0, 100: 0.0, 102: 0.0, 104: 0.0, 105: 0.0, 106: 0.0, 107: 0.0003, 108:

In [51]:
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = ks.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))

In [54]:
ExtractFeatureImp(best_rf_model.featureImportances, model_df, "features").head(10)



Unnamed: 0,idx,name,score
80,80,ma,0.293831
16,16,max_y,0.165241
87,87,sma,0.156475
11,11,max_x,0.135539
174,174,weekofyear,0.073459
89,89,trima,0.044998
175,175,dayofyear,0.026751
171,171,year,0.016796
21,21,sum,0.014963
35,35,adx,0.014246


In [None]:
# Feature Selection Using Feature Importance Score - Creating a PySpark Estimator
# https://www.timlrx.com/blog/feature-selection-using-feature-importance-score-creating-a-pyspark-estimator