In [1]:
from pyspark.sql import *
from pyspark.ml import *

from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml import Pipeline

from builtins import round

from user_definition import *



In [2]:
# Build (or reuse) the notebook's Spark session with explicit driver and
# executor memory settings, then expose its SparkContext as `sc`.
ss = (
    SparkSession.builder
    .config("spark.driver.memory", "15g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)
sc = ss.sparkContext

# 1. Read training and validation dataset

In [7]:
def _read_cached(folder):
    """Read a parquet folder, repartition it into 200 partitions, and mark it cached."""
    return ss.read.parquet(folder).repartition(200).cache()


# train_folder / valid_folder are not defined in this notebook; presumably
# they come from the `user_definition` star import.
df_train = _read_cached(train_folder)
df_valid = _read_cached(valid_folder)

# Report the row count of each split, each followed by a blank line.
print(df_train.count(), end = '\n\n')
print(df_valid.count(), end = '\n\n')

4706083

1177746



# 2. Apply random forest classifier

In [8]:
# Shared evaluator, constructed with default settings; it is reused by the
# gradient boosted tree section further down.
bceval = BinaryClassificationEvaluator()

rf = RandomForestClassifier()

# Use the shared `n_fold` setting rather than a hard-coded fold count so this
# cross-validation stays consistent with the GBT cross-validation below.
# (n_fold / num_trees / n_digits are presumably provided by user_definition.)
cv = CrossValidator().setEstimator(rf).setEvaluator(bceval).setNumFolds(n_fold)

# One candidate model per value in `num_trees`.
paraGrid = ParamGridBuilder().addGrid(rf.numTrees, num_trees).build()

cv.setEstimatorParamMaps(paraGrid)

# Cross-validate on the training split.
rf_cv_model = cv.fit(df_train)

# Score the winning model on the validation split.
roc = bceval.evaluate(rf_cv_model.bestModel.transform(df_valid))

# The text before the first underscore of str(rf) is printed as the
# classifier name (the recorded output shows "RandomForestClassifier").
print(str(rf).split('_')[0])
# getNumTrees is accessed without parentheses here; the recorded output shows
# it yields the tree count directly.
print(rf_cv_model.bestModel.getNumTrees)
print(f'{roc:.{n_digits}f}')

RandomForestClassifier
100
0.799


# 3. Apply a gradient boosted tree classifier

In [9]:
# Gradient boosted trees, tuned over tree depth with the shared evaluator.
gbt = GBTClassifier()

# One candidate model per value in `max_depth`.
paraGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, max_depth)
    .build()
)

# Same configuration as the setter chain, expressed as constructor keywords.
cv_3 = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paraGrid,
    evaluator=bceval,
    numFolds=n_fold,
)

# Cross-validate on the training split.
gbt_cv_model = cv_3.fit(df_train)

# Score the winning model on the validation split.
best_gbt = gbt_cv_model.bestModel
roc = bceval.evaluate(best_gbt.transform(df_valid))

# Report: classifier name, selected depth, and the validation score.
print(str(gbt).split('_')[0])
print(best_gbt.getMaxDepth())
print(f'{roc:.{n_digits}f}')

GBTClassifier
10
0.931


In [8]:
# Shut down the Spark session created at the top of the notebook.
ss.stop()