In [1]:
import time
import os
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.regression import GBTRegressor

In [3]:
def get_spark_session(worker_cores=1):
    """
    Create a fresh local-mode SparkSession with the requested number of worker threads.

    Any session previously stored in the module-level ``spark`` variable is stopped
    first; otherwise ``getOrCreate`` would return that existing session and the new
    ``local[N]`` master setting would not be applied.

    Parameters
    ----------
    worker_cores : int or str, default 1
        Value interpolated into the ``local[...]`` master URL.

    Returns
    -------
    SparkSession
        The newly built session. It is also stored in the global ``spark``.
    """
    global spark

    # Tolerate both "never created" and an explicit ``spark = None`` placeholder;
    # only a real, previously stored session gets stopped.
    previous_session = globals().get('spark')
    if previous_session is not None:
        previous_session.stop()

    spark = (SparkSession.builder
             .appName("spark_model_training")
             .master(f"local[{worker_cores}]")
             .getOrCreate())

    return spark

def load_data(spark, file_path):
    """
    Read the Parquet dataset located at ``file_path`` through the given Spark
    session and return the resulting DataFrame.
    """
    return spark.read.parquet(file_path)

def preprocess_data(df, feature_columns):
    """
    Combine the listed feature columns into one column named 'features'.

    The work is delegated to a VectorAssembler; the DataFrame returned by its
    ``transform`` call is handed back to the caller.
    """
    return VectorAssembler(
        inputCols=feature_columns,
        outputCol='features',
    ).transform(df)



def train_gbt_with_cv(processed_df, label_col='Impact', sample_fraction=0.5,
                      num_folds=3, seed=42):
    """
    Cross-validate a Spark GBTRegressor and report its MAE and fitting time.

    Parameters
    ----------
    processed_df : DataFrame
        Input frame that already contains a 'features' column and the label column.
    label_col : str, default 'Impact'
        Name of the regression target column.
    sample_fraction : float, default 0.5
        Fraction of rows kept for training. The default is 0.5 because the kernel
        was being killed by memory pressure on the full data. Must be in (0, 1];
        a value of 1 skips sampling and trains on ``processed_df`` as given.
    num_folds : int, default 3
        Number of cross-validation folds.
    seed : int, default 42
        Seed used for row sampling, the regressor, and the fold assignment, so that
        repeated runs are comparable instead of relying on library defaults.

    Returns
    -------
    tuple of (float, float)
        Mean of the cross-validated MAE over all parameter maps, and the wall-clock
        seconds spent inside ``CrossValidator.fit``.

    Raises
    ------
    ValueError
        If ``sample_fraction`` is not in the interval (0, 1].
    """
    if not 0 < sample_fraction <= 1:
        raise ValueError(
            f"sample_fraction must be in (0, 1], got {sample_fraction!r}"
        )

    # Down-sample to limit memory use; skip the extra pass when all rows are wanted.
    if sample_fraction < 1:
        training_df = processed_df.sample(fraction=sample_fraction, seed=seed)
    else:
        training_df = processed_df

    # Explicit seed: the regressor subsamples rows (subsamplingRate below).
    gbt = GBTRegressor(featuresCol='features',
                       labelCol=label_col,
                       seed=seed)

    # MAE is used instead of MAPE because RegressionEvaluator does not offer MAPE.
    evaluator = RegressionEvaluator(labelCol=label_col,
                                    predictionCol='prediction',
                                    metricName='mae')

    # A single parameter combination only (no real grid search).
    # NOTE(review): the original note was ambiguous; presumably a wider grid was
    # dropped for memory reasons — confirm.
    paramGrid = (ParamGridBuilder()
                 .addGrid(gbt.maxDepth, [6])  # Maximum depth of each tree
                 .addGrid(gbt.maxIter, [500])  # Number of boosting iterations
                 .addGrid(gbt.stepSize, [0.1])  # Learning rate (step size)
                 .addGrid(gbt.subsamplingRate, [0.7])  # Row subsampling rate per tree
                 .addGrid(gbt.featureSubsetStrategy, ['onethird'])  # Feature subsampling
                 .build())

    # Explicit seed: fold assignment is random.
    crossval = CrossValidator(estimator=gbt,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=num_folds,
                              seed=seed)

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during a long fit.
    start_time = time.perf_counter()
    cv_model = crossval.fit(training_df)
    total_training_time = time.perf_counter() - start_time

    # avgMetrics holds one averaged metric per parameter map; convert to a plain float.
    avg_mae_cv = float(np.mean(cv_model.avgMetrics))

    return avg_mae_cv, total_training_time

def run_experiment(worker_cores, file_path):
    """
    Execute one benchmark run: start Spark with ``worker_cores`` local threads,
    load and filter the dataset at ``file_path``, assemble the features, and
    cross-validate the GBT model.

    Returns a ``(avg_mae_cv, total_training_time)`` tuple as produced by
    ``train_gbt_with_cv``.
    """
    session = get_spark_session(worker_cores)
    print("worker cores: ", session.sparkContext.defaultParallelism)

    # Drop the encoded-author column, keep only rows with a non-zero target,
    # then remove the index column carried over in the Parquet file.
    without_author = load_data(session, file_path).drop('main_author_encoded')
    nonzero_impact = without_author.filter(without_author.Impact != 0)
    model_input = nonzero_impact.drop('__index_level_0__')

    # Every remaining column except the target is used as a feature.
    feature_names = list(model_input.columns)
    feature_names.remove('Impact')

    assembled = preprocess_data(model_input, feature_names)

    mae, elapsed = train_gbt_with_cv(assembled)
    return mae, elapsed



In [4]:


# Parquet file holding the preprocessed dataset (path relative to the working directory)
file_path = "data_preprocessed.parquet"

# Local-mode thread counts to benchmark; results maps each count to (MAE, seconds)
worker_configs = [1, 2, 4]
results = dict()

# Train once per thread count and report each outcome as soon as it is available
for workers in worker_configs:
    results[workers] = run_experiment(workers, file_path)
    avg_mae_cv, total_training_time = results[workers]
    print(f"Workers: {workers}, MAE: {avg_mae_cv}, Training Time: {total_training_time}")

print("Experiment Results:", results)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/02 18:00:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


worker cores:  1


24/10/02 18:00:20 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/10/02 18:00:28 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
24/10/02 18:00:45 WARN DAGScheduler: Broadcasting large task binary with size 1000.0 KiB
24/10/02 18:00:45 WARN DAGScheduler: Broadcasting large task binary with size 1000.3 KiB
24/10/02 18:00:45 WARN DAGScheduler: Broadcasting large task binary with size 1001.2 KiB
24/10/02 18:00:46 WARN DAGScheduler: Broadcasting large task binary with size 1002.7 KiB
24/10/02 18:00:46 WARN DAGScheduler: Broadcasting large task binary with size 1005.8 KiB
24/10/02 18:00:46 WARN DAGScheduler: Broadcasting large task binary with size 1011.8 KiB
24/10/02

Workers: 1, MAE: 46.39529514276564, Training Time: 1184.555340051651
worker cores:  2


24/10/02 18:20:29 WARN DAGScheduler: Broadcasting large task binary with size 1000.2 KiB
24/10/02 18:20:29 WARN DAGScheduler: Broadcasting large task binary with size 1000.6 KiB
24/10/02 18:20:29 WARN DAGScheduler: Broadcasting large task binary with size 1001.5 KiB
24/10/02 18:20:29 WARN DAGScheduler: Broadcasting large task binary with size 1002.9 KiB
24/10/02 18:20:29 WARN DAGScheduler: Broadcasting large task binary with size 1006.0 KiB
24/10/02 18:20:29 WARN DAGScheduler: Broadcasting large task binary with size 1012.1 KiB
24/10/02 18:20:29 WARN DAGScheduler: Broadcasting large task binary with size 1012.0 KiB
24/10/02 18:20:29 WARN DAGScheduler: Broadcasting large task binary with size 1012.5 KiB
24/10/02 18:20:30 WARN DAGScheduler: Broadcasting large task binary with size 1013.5 KiB
24/10/02 18:20:30 WARN DAGScheduler: Broadcasting large task binary with size 1014.9 KiB
24/10/02 18:20:30 WARN DAGScheduler: Broadcasting large task binary with size 1018.0 KiB
24/10/02 18:20:30 WAR

Workers: 2, MAE: 46.39529514276564, Training Time: 1356.3587908744812
worker cores:  4


24/10/02 18:43:05 WARN DAGScheduler: Broadcasting large task binary with size 1000.6 KiB
24/10/02 18:43:06 WARN DAGScheduler: Broadcasting large task binary with size 1006.8 KiB
24/10/02 18:43:06 WARN DAGScheduler: Broadcasting large task binary with size 1006.7 KiB
24/10/02 18:43:06 WARN DAGScheduler: Broadcasting large task binary with size 1007.3 KiB
24/10/02 18:43:06 WARN DAGScheduler: Broadcasting large task binary with size 1008.2 KiB
24/10/02 18:43:06 WARN DAGScheduler: Broadcasting large task binary with size 1009.6 KiB
24/10/02 18:43:06 WARN DAGScheduler: Broadcasting large task binary with size 1012.7 KiB
24/10/02 18:43:06 WARN DAGScheduler: Broadcasting large task binary with size 1018.9 KiB
24/10/02 18:43:06 WARN DAGScheduler: Broadcasting large task binary with size 1019.0 KiB
24/10/02 18:43:06 WARN DAGScheduler: Broadcasting large task binary with size 1019.6 KiB
24/10/02 18:43:06 WARN DAGScheduler: Broadcasting large task binary with size 1020.5 KiB
24/10/02 18:43:06 WAR

Workers: 4, MAE: 46.59486072405618, Training Time: 1548.2583179473877
Experiment Results: {1: (46.39529514276564, 1184.555340051651), 2: (46.39529514276564, 1356.3587908744812), 4: (46.59486072405618, 1548.2583179473877)}


24/10/02 19:08:30 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB


In [5]:
# Re-display the collected results dict, keyed by worker count, without re-running the experiments
print("Experiment Results:", results)

Experiment Results: {1: (46.39529514276564, 1184.555340051651), 2: (46.39529514276564, 1356.3587908744812), 4: (46.59486072405618, 1548.2583179473877)}
