### **Import Libraries**

In [0]:
from pyspark.sql.functions import *
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, MinMaxScaler, Imputer
from pyspark.ml import Pipeline
import matplotlib.pyplot as plt
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, GBTClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
import matplotlib.pyplot as plt
import seaborn as sns
import shutil
import os


### Connect to the storage

In [0]:
 # Azure storage access info
blob_account_name = "bgupb202402juanbarriento"
blob_container_name = "marketplace"
blob_sas_token = 'sv=2022-11-02&ss=bfqt&srt=sco&sp=rwdlacupiytfx&se=2024-12-01T01:56:16Z&st=2024-10-16T17:56:16Z&spr=https&sig=cM7vK4vvrL9YH31ZYI%2BX%2BmssMUTdRieP%2BMGcVNPoagA%3D'

# Allow SPARK to read from Blob remotely
wasbs_path = f'wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/'
spark.conf.set( f'fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net' , blob_sas_token)
print('Remote blob path: ' + wasbs_path)

Remote blob path: wasbs://marketplace@bgupb202402juanbarriento.blob.core.windows.net/


### Function: get_train_validation_data

The function `get_train_validation_data` splits a PySpark DataFrame into three datasets: **training**, **validation**, and **test**.


In [0]:
#Split function
def get_train_validation_data(df, seed_value = 2024, splits=[0.8,0.1,0.1]):
    df_splits = df.randomSplit(splits, seed=seed_value)

    return df_splits

### Read the features file

In [0]:
#Read Dataframe from features layer
features_df = spark.read.format("delta").load(f"{wasbs_path}features/features_df").drop("user_id")

display(features_df.head(50))

year,month,target,session_per_user_year_month,session_duration,avg_cart_abandone_rate,mean_price,convertion_rate
2020,2,0,1,276,0.75,372.67,0.1666666666666666
2020,4,0,3,1984968,0.6333333333333333,209.75,0.3076923076923077
2020,2,0,3,138781,0.6666666666666666,1209.27,0.25
2020,4,1,2,724,0.3333333333333333,239.415,0.25
2020,4,0,4,1433147,0.25,200.64,0.1333333333333333
2020,4,0,15,1271339,0.5,46.31,0.0181818181818181
2020,4,0,7,1640083,0.3333333333333333,900.9,0.0769230769230769
2020,2,0,27,353354,0.6666666666666666,539.5,0.0333333333333333
2020,2,0,5,185419,-0.0666666666666666,268.574,0.8333333333333334
2020,4,0,3,261411,0.5714285714285714,81.33333333333333,0.0566037735849056


In [0]:
# Calculate the percentage of null values in each column to see if imputation is possible
total_rows = features_df.count()

null_percentages = features_df.select([
    ((count(when(col(c).isNull(), c)) / total_rows) * 100
     ).alias(f'{c}_null_percentage')
    for c in features_df.columns
])

null_percentages.show()

+--------------------+---------------------+----------------------+-------------------------------------------+--------------------------------+--------------------------------------+--------------------------+-------------------------------+
|year_null_percentage|month_null_percentage|target_null_percentage|session_per_user_year_month_null_percentage|session_duration_null_percentage|avg_cart_abandone_rate_null_percentage|mean_price_null_percentage|convertion_rate_null_percentage|
+--------------------+---------------------+----------------------+-------------------------------------------+--------------------------------+--------------------------------------+--------------------------+-------------------------------+
|                 0.0|                  0.0|                   0.0|                                        0.0|                             0.0|                     5.545206817574712|                       0.0|             0.3846463892871199|
+--------------------+------

The percentage of nulls is not above 10%, so it is possible to impute the missing values 

In [0]:
# To see the mean and the stddev of the numerical columns to see if standardization is required
display(features_df.describe())

summary,year,month,target,session_per_user_year_month,session_duration,avg_cart_abandone_rate,mean_price,convertion_rate
count,2530636.0,2530636.0,2530636.0,2530636.0,2530636.0,2390307.0,2530636.0,2520902.0
mean,2019.589545473944,6.129528703456364,0.1274134249255918,6.746052770923989,843492.1126720713,0.3343035963086783,308.54116379957253,0.201038263664188
stddev,0.4919163584564385,4.32670786183029,0.3334355829956678,16.304288814472443,819985.7042185947,0.362526655923931,317.7891405053492,0.2389409526576253
min,2019.0,1.0,0.0,1.0,0.0,-11.0,0.42,8.869179600886918e-05
max,2020.0,12.0,1.0,9002.0,2676683.0,1.0,2574.07,28.0


Because some columns have a large standard deviation, I believe standardization would be appropriate

###Data Preparation

In [0]:
#Specify numerical and the categorical variables for the encoder
continuous_cols = [
  'session_per_user_year_month', 'session_duration', 
  'avg_cart_abandone_rate','mean_price','convertion_rate'
  ]

ohe_numeric_cols  = ['year', 'month']

In [0]:
# inputs and outputs for the encoder and the imputer
ohe_input_cols = ohe_numeric_cols
ohe_output_cols = [f"{col_name}_ohe" for col_name in ohe_numeric_cols]

continuous_input_cols = continuous_cols
continuous_output_cols = [f"{col}_imputed" for col in continuous_cols]
continuous_output_scaler_cols = [f"{col}_scaled" for col in continuous_cols]


In [0]:
# Create the encoder and the imputer
ohe = OneHotEncoder(inputCols=ohe_input_cols,
                     outputCols=ohe_output_cols)
                     
num_imputer = Imputer(inputCols=continuous_input_cols, 
                      outputCols=continuous_output_cols)


In [0]:
# vectorizer for the numeric columns
vectorizer_num = VectorAssembler(inputCols=continuous_output_cols, 
                                 outputCol="num_features",handleInvalid="keep")
                                 
scaler = MinMaxScaler(inputCol="num_features", 
                      outputCol="scaled_features")

In [0]:
# unified the numeric and categorical columns
feature_cols = continuous_output_cols + ohe_output_cols
feature_cols

['session_per_user_year_month_imputed',
 'session_duration_imputed',
 'avg_cart_abandone_rate_imputed',
 'mean_price_imputed',
 'convertion_rate_imputed',
 'year_ohe',
 'month_ohe']

In [0]:
#vectorizer for all the data
final_vectorizer = VectorAssembler(inputCols=["scaled_features"] + ohe_output_cols, outputCol="features")

#### Data Splits

In [0]:
#Divide the dataset in 3 parts: train, test, validation

train_df, test_df, validation_df = get_train_validation_data(features_df)

In [0]:
#Check the size of the datasets

print(train_df.count())
print(test_df.count())
print(validation_df.count())

2025297
252425
252914


In [0]:
display(train_df)

In [0]:
train_df.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- target: integer (nullable = true)
 |-- session_per_user_year_month: long (nullable = true)
 |-- session_duration: long (nullable = true)
 |-- avg_cart_abandone_rate: double (nullable = true)
 |-- mean_price: double (nullable = true)
 |-- convertion_rate: double (nullable = true)



##Pipeline and Transformation

In [0]:
# Create the pipeline and fit the model on the training data
feature_pipelene = Pipeline(stages=[ohe, num_imputer, vectorizer_num,scaler,final_vectorizer])

feature_model = feature_pipelene.fit(train_df)
train_transform_df = feature_model.transform(train_df)

In [0]:
display(train_transform_df)

In [0]:
# Put the dataframe on Cache for faster training
train_transform_df = train_transform_df.cache()

In [0]:
#Some precautions because of some problems with the training of the models
train_transform_df = train_transform_df.withColumn("target", col("target").cast("double"))

In [0]:
train_transform_df.select("target").distinct().show()

+------+
|target|
+------+
|   0.0|
|   1.0|
+------+



# Modeling

###Support Vector Machine

In [0]:
from pyspark.ml.classification import LinearSVC

# Define the model of Support Vector Machine
svm = LinearSVC(labelCol="target", featuresCol="features", maxIter=10)

# Train
svm_model = svm.fit(train_transform_df)

# Predict over the train dataset
predictionssvc = svm_model.transform(train_transform_df)

####Evaluation

In [0]:
svm_full_model= feature_model.copy()

In [0]:
svm_full_model.stages.append(svm_model)

In [0]:
# run the svc model on validation data
validation_scored_svm = svm_full_model.transform(validation_df)

In [0]:
display(validation_scored_svm)

In [0]:
evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(validation_scored_svm)
print(f"Accuracy: {accuracy}")

# F1
evaluator.setMetricName("f1")
f1_score = evaluator.evaluate(validation_scored_svm)
print(f"F1 Score: {f1_score}")

# precision
evaluator.setMetricName("weightedPrecision")
precision = evaluator.evaluate(validation_scored_svm)
print(f"Precision: {precision}")

# recall
evaluator.setMetricName("weightedRecall")
recall = evaluator.evaluate(validation_scored_svm)
print(f"Recall: {recall}")

#auc
print(f"Area under ROC: {auc}")

Accuracy: 0.8724348988193615
F1 Score: 0.8130546932952187
Precision: 0.8121987440206856
Recall: 0.8724348988193615
Area under ROC: 0.7873422545265054


### Gradient Boosted Tree

In [0]:
# Define the model of Gradient Boosting
gbt = GBTClassifier(labelCol="target", featuresCol="features", maxIter=10)

# Train
gbt_model = gbt.fit(train_transform_df)

# Predict over the train dataset
predictions = gbt_model.transform(train_transform_df)

# Evaluate the model with AUC
evaluator = BinaryClassificationEvaluator(labelCol="target")
auc = evaluator.evaluate(predictions)
print("AUC: ", auc)

####Evaluation

In [0]:
gbt_full_model= feature_model.copy()

In [0]:
gbt_full_model.stages.append(gbt_model)

In [0]:
# run the model on validation data
validation_scored_gbt = gbt_full_model.transform(validation_df)

In [0]:
display(validation_scored_gbt)

In [0]:
evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(validation_scored_gbt)
print(f"Accuracy: {accuracy}")

# F1
evaluator.setMetricName("f1")
f1_score = evaluator.evaluate(validation_scored_gbt)
print(f"F1 Score: {f1_score}")

# precision
evaluator.setMetricName("weightedPrecision")
precision = evaluator.evaluate(validation_scored_gbt)
print(f"Precision: {precision}")

# recall
evaluator.setMetricName("weightedRecall")
recall = evaluator.evaluate(validation_scored_gbt)
print(f"Recall: {recall}")

#auc
print(f"Area under ROC: {auc}")

Accuracy: 0.8745146571561875
F1 Score: 0.827327073775794
Precision: 0.8402315334742553
Recall: 0.8745146571561875
Area under ROC: 0.7873422545265054


### Hyperparameter Tuning

In [0]:
# Define the model of Gradient Boosting
gbt = GBTClassifier(labelCol="target", featuresCol="features")

# Definir el evaluador (usamos AUC para clasificación binaria)
evaluator = BinaryClassificationEvaluator(labelCol="target")

# Build the parameter for the grid search
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxIter, [1, 5, 10])  # Number of iterations 
             .addGrid(gbt.maxDepth, [5, 10]) #Maximun depth of the trees 
             .build())

# CrossValidation with the grid search
cv = CrossValidator(estimator=gbt,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3,  # Number of folds for cross-validation
                    seed=123)  # seed

# Train the model with CrossValidator
cvModel = cv.fit(train_transform_df)

# Get the best model
bestModel = cvModel.bestModel

# Make predictions on the train dataset
predictions = bestModel.transform(train_transform_df)

# Evaluar el mejor modelo con AUC with the train dataset
auc = evaluator.evaluate(predictions)
print("Mejor AUC: ", auc)

# Check the parameters of the best model
print("Mejores parámetros: ", cvModel.bestModel.extractParamMap())

###Evaluation

In [0]:
gbt_grid_full_model= feature_model.copy()

In [0]:
gbt_grid_full_model.stages.append(bestModel)

In [0]:

# run the model on validation data
validation_scored_gbt_grid = gbt_grid_full_model.transform(validation_df)

In [0]:
display(validation_scored_gbt_grid)

In [0]:
evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(validation_scored_gbt_grid)
print(f"Accuracy: {accuracy}")

# F1
evaluator.setMetricName("f1")
f1_score = evaluator.evaluate(validation_scored_gbt_grid)
print(f"F1 Score: {f1_score}")

# precision
evaluator.setMetricName("weightedPrecision")
precision = evaluator.evaluate(validation_scored_gbt_grid)
print(f"Precision: {precision}")

# recall
evaluator.setMetricName("weightedRecall")
recall = evaluator.evaluate(validation_scored_gbt_grid)
print(f"Recall: {recall}")

#auc
print(f"Area under ROC: {auc}")

Accuracy: 0.8747597997738361
F1 Score: 0.8322117004034857
Precision: 0.8400107538515055
Recall: 0.8747597997738362
Area under ROC: 0.7873422545265054


### Save the models

In [0]:
# Save the svm model in the container
svm_model.write().overwrite().save(f"{wasbs_path}models/svm_model_month_sales_growth_prediction")

In [0]:
# Save the gbt model in the container
gbt_model.write().overwrite().save(f"{wasbs_path}models/gbt_model_month_sales_growth_prediction")

In [0]:
# Save the gbt model from the grid search in the container
bestModel.write().overwrite().save(f"{wasbs_path}models/gbt_model_grid_month_sales_growth_prediction")