In [61]:
import os

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as fn
from pyspark.conf import SparkConf

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import MinMaxScaler

import pandas as pd
import numpy as np
# Show every column when a pandas DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [62]:
# Read data from Postgres using PySpark.

# Path of the PostgreSQL JDBC driver jar; it is registered for both the driver
# and the executors below.
JDBC_JAR = "./postgresql_jdbc/postgresql-42.6.0.jar"

# Connection settings. Each value can be overridden with an environment variable
# so that real credentials do not have to be stored in the notebook; the defaults
# are the values of the original local development setup.
JDBC_URL = os.environ.get("BDM_JDBC_URL", "jdbc:postgresql://localhost:5432/bdm_joint")
JDBC_USER = os.environ.get("BDM_DB_USER", "bdm")
JDBC_PASSWORD = os.environ.get("BDM_DB_PASSWORD", "test123")
JDBC_DRIVER = "org.postgresql.Driver"

# NOTE: despite its name, `sc` is a SparkSession (not a SparkContext).
sc = (
    SparkSession.builder
    .config("spark.jars", JDBC_JAR)
    .config("spark.driver.extraClassPath", JDBC_JAR)
    .config("spark.executor.extraClassPath", JDBC_JAR)
    .getOrCreate()
)

# SQLContext takes a SparkContext as its first argument; the session is passed
# explicitly as the second one. Kept only for backward compatibility — nothing
# in the visible cells uses `sqlContext`.
sqlContext = SQLContext(sc.sparkContext, sc)


def read_table(table_name):
    """Load one Postgres table through JDBC and return it as a Spark DataFrame.

    Parameters
    ----------
    table_name : str
        Name of the table, passed as the JDBC ``dbtable`` option.

    Returns
    -------
    pyspark.sql.DataFrame
        DataFrame produced by ``sc.read...load()`` for the configured
        URL, user, password and driver.
    """
    return (
        sc.read
        .format("jdbc")
        .option("url", JDBC_URL)
        .option("dbtable", table_name)
        .option("user", JDBC_USER)
        .option("password", JDBC_PASSWORD)
        .option("driver", JDBC_DRIVER)
        .load()
    )


city_distances = read_table("city_distances")
flights = read_table("flights")
products = read_table("products")
requests = read_table("requests")
travels = read_table("travels")
users = read_table("users")



## 1. Exploration

In [63]:
# Print the column names, types and nullability of the city_distances table.
city_distances.printSchema()

root
 |-- country1: string (nullable = true)
 |-- name1: string (nullable = true)
 |-- country2: string (nullable = true)
 |-- name2: string (nullable = true)
 |-- distance_km: double (nullable = true)
 |-- currency: string (nullable = true)
 |-- lpg_price: double (nullable = true)
 |-- diesel_price: double (nullable = true)
 |-- gasoline_price: double (nullable = true)



In [64]:
# Fetch a single Row as a quick look at the actual values.
city_distances.take(1)

[Row(country1='Spain', name1='Madrid', country2='Spain', name2='Alacant', distance_km=360.1276166, currency='euro', lpg_price=28.03233368, diesel_price=27.72982648, gasoline_price=36.82268867)]

### 1.1 Show the tables as pandas DataFrames

In [55]:
# Display city_distances as a pandas DataFrame.
# NOTE(review): toPandas() brings the whole table to the driver; fine for small tables only.
city_distances.toPandas()

Unnamed: 0,country1,name1,country2,name2,distance_km,currency,lpg_price,diesel_price,gasoline_price
0,Spain,Madrid,Spain,Alacant,360.127617,euro,28.032334,27.729826,36.822689
1,Spain,Madrid,Spain,M谩laga,414.957672,euro,32.300305,31.951741,42.429007
2,Spain,Madrid,Spain,Sevilla,390.504192,euro,30.396846,30.068823,39.928663
3,Spain,Madrid,Spain,Palma,550.581189,euro,42.85724,42.394752,56.296376
4,Spain,Madrid,Spain,Barcelona,506.922016,euro,39.45881,39.032995,51.832269
5,Spain,Alacant,Spain,M谩laga,391.476854,euro,30.472558,30.143718,40.028117
6,Spain,Alacant,Spain,Sevilla,495.95999,euro,38.605526,38.188919,50.711413
7,Spain,Alacant,Spain,Palma,304.141921,euro,23.674407,23.418928,31.098207
8,Spain,Alacant,Spain,Barcelona,407.25238,euro,31.700525,31.358433,41.641149
9,Spain,M谩laga,Spain,Sevilla,158.35812,euro,12.326596,12.193575,16.191959


In [56]:
# Display flights as a pandas DataFrame.
# NOTE(review): toPandas() brings the whole table to the driver; fine for small tables only.
flights.toPandas()

Unnamed: 0,flightNumber,departureAirportFsCode,arrivalAirportFsCode,departureTime,arrivalTime,stops,departureTerminal,arrivalTerminal
0,1007,MAD,BCN,2023-04-23 18:25:00,2023-04-23 19:45:00,0,4.0,1.0
1,5005,MAD,BCN,2023-04-23 18:25:00,2023-04-23 19:45:00,0,4.0,1.0
2,3546,MAD,BCN,2023-04-23 18:25:00,2023-04-23 19:45:00,0,4.0,1.0
3,7703,MAD,BCN,2023-04-23 15:10:00,2023-04-23 16:35:00,0,2.0,1.0
4,9459,MAD,BCN,2023-04-23 15:10:00,2023-04-23 16:35:00,0,2.0,1.0
...,...,...,...,...,...,...,...,...
3929,3150,AGP,PMI,2023-04-28 21:05:00,2023-04-28 22:35:00,0,,
3930,5259,AGP,PMI,2023-04-28 21:05:00,2023-04-28 22:35:00,0,,
3931,5677,AGP,PMI,2023-04-29 21:35:00,2023-04-29 23:05:00,0,,
3932,3148,AGP,PMI,2023-04-29 13:35:00,2023-04-29 15:05:00,0,,


In [57]:
# Display products as a pandas DataFrame.
# NOTE(review): toPandas() brings the whole table to the driver; fine for small tables only.
products.toPandas()

Unnamed: 0,product_id,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english,product_name
0,1e9e8ef04dbcff4541ed26657ea517e5,225,16,10,14,perfumery,Acqua di Gio by Giorgio Armani
1,6a2fb4dd53d2cdb88e0432f1284a004c,400,27,5,20,perfumery,Alien by Thierry Mugler
2,0d009643171aee696f4733340bc2fdd0,422,21,16,18,perfumery,Black Opium by Yves Saint Laurent
3,b1eae565a61935e0011ee7682fef9dc9,267,17,13,17,perfumery,Bleu de Chanel by Chanel
4,8da90b37f0fb171b4877c124f965b1f6,377,18,13,15,perfumery,Bright Crystal by Versace
...,...,...,...,...,...,...,...
120,91e6a181c1ebe734ab459822cee89763,850,48,16,25,watches_gifts,Tissot T-Touch
121,af8215adef2df1e9e66642f159e81e55,400,20,11,19,watches_gifts,Casio G-Shock
122,35255921c732e0e9319097469d856cd5,400,18,11,15,watches_gifts,Timex Weekender
123,25a675783383c8e0aa707b7e9063c925,253,18,11,13,watches_gifts,Fossil Grant


In [58]:
# Display requests as a pandas DataFrame.
# NOTE(review): toPandas() brings the whole table to the driver; fine for small tables only.
requests.toPandas()

Unnamed: 0,requestId,initializationUserId,collectionUserId,travellerId,productId,dateToDeliver,dateDelivered,requestDate,pickUpAddress,collectionAddress,deliveryFee,Satisfactory,dhl_fee
0,0,24,26,0.0,d68bd4dedccc5545b1ff6629de8fb021,2023-08-07 08:16:00,2023-08-03 08:55:00,2023-07-28 08:16:00,24,26,4.686061,t,17.05
1,1,28,33,15.0,b864b103180d5a6a3f830e67f93e8735,2023-09-16 22:08:00,2023-09-10 21:55:00,2023-09-06 22:08:00,28,33,3.880162,t,16.78
2,2,27,23,15.0,be1be330cf34043d1c40d6114cbd11b5,2023-08-22 04:37:00,2023-08-14 10:21:00,2023-08-11 04:37:00,27,23,2.644626,t,17.05
3,3,22,38,3.0,062e425d963f3eac04709162c3ce6782,2023-06-24 16:55:00,2023-06-24 15:34:00,2023-06-18 16:55:00,22,38,3.459768,t,16.78
4,4,29,37,19.0,dbb399a8be7395d5b136d49fcdce13df,2023-09-09 11:56:00,2023-09-01 17:27:00,2023-08-29 11:56:00,29,37,3.633162,t,16.78
...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,215,24,31,,c20fbbd4ccf2f7da70e150689a6a905f,2023-04-26 09:51:00,,2023-04-16 09:51:00,24,31,,,16.78
216,216,21,38,,dbb399a8be7395d5b136d49fcdce13df,2023-04-18 12:16:00,,2023-04-11 12:16:00,21,38,,,17.05
217,217,23,38,,483df9288035696f34225a7047591d13,2023-04-22 03:02:00,,2023-04-11 03:02:00,23,38,,,17.05
218,218,39,24,,3488d2ce36e718097c1509444289ef7f,2023-04-26 17:32:00,,2023-04-15 17:32:00,39,24,,,16.78


In [59]:
# Display travels as a pandas DataFrame.
# NOTE(review): toPandas() brings the whole table to the driver; fine for small tables only.
travels.toPandas()

Unnamed: 0,userId,departureAirportFsCode,arrivalAirportFsCode,departureTime,arrivalTime,extraLuggage
0,0,Madrid,Barcelona,2023-04-22 20:00:00,2023-04-22 21:20:00,4
1,0,Barcelona,Madrid,2023-04-25 12:40:00,2023-04-25 14:05:00,7
2,1,Madrid,Barcelona,2023-04-19 15:10:00,2023-04-19 16:35:00,4
3,1,Barcelona,Madrid,2023-04-22 18:05:00,2023-04-22 19:30:00,3
4,2,Madrid,Barcelona,2023-04-19 20:00:00,2023-04-19 21:20:00,7
5,2,Barcelona,Madrid,2023-04-23 18:05:00,2023-04-23 19:30:00,4
6,3,Palma,Málaga,2023-04-21 06:45:00,2023-04-21 08:20:00,5
7,3,Málaga,Palma,2023-04-24 20:40:00,2023-04-24 22:05:00,3
8,4,Madrid,Barcelona,2023-04-19 16:30:00,2023-04-19 17:50:00,6
9,4,Barcelona,Madrid,2023-04-23 11:35:00,2023-04-23 13:05:00,4


In [60]:
# Display users as a pandas DataFrame.
# NOTE(review): toPandas() brings the whole table to the driver; fine for small tables only.
users.toPandas()

Unnamed: 0,user_id,gender,nationality,dob,is_traveller,address,city
0,0,Female,Spain,1963-09-25,t,0,Palma
1,1,Male,Spain,1996-12-12,t,1,Palma
2,2,Female,Spain,1976-07-13,t,2,Barcelona
3,3,Male,Spain,1981-03-04,t,3,Madrid
4,4,Female,Spain,1968-03-16,t,4,Madrid
5,5,Male,Spain,1993-06-26,t,5,Palma
6,6,Female,Spain,1966-08-14,t,6,Palma
7,7,Male,Spain,2003-09-18,t,7,Palma
8,8,Female,Spain,1997-08-20,t,8,Málaga
9,9,Male,Spain,1967-12-09,t,9,Barcelona


## 2. Feature Engineering

### 2.1 Min Max Scaling

In [6]:
# Numeric predictors of the delivery fee, packed into a single 'features' vector.
# NOTE(review): the `requests` table displayed in the exploration section does not
# show columns such as 'Distance' or 'product_weight_g'; presumably `requests` was
# enriched by a join in a cell that is no longer in the notebook — confirm before
# running on a fresh kernel.
feature_columns = [
    'Distance',
    'lpg_price',
    "diesel_price",
    "gasoline_price",
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
    "dhl_fee",
]
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Keep only the assembled vector, the label and the two columns used for the split.
VA_requests = vectorAssembler.transform(requests).select(
    ["features", "deliveryFee", "Satisfactory", "travellerId"]
)
VA_requests.take(3)

[Row(features=DenseVector([695.5684, 54.143, 53.5588, 71.1212, 700.0, 20.0, 20.0, 20.0, 17.05]), deliveryFee=4.025129718646807, Satisfactory=True, travellerId=4.0),
 Row(features=DenseVector([414.9577, 32.3003, 31.9517, 42.429, 500.0, 20.0, 12.0, 15.0, 16.78]), deliveryFee=3.810601927385716, Satisfactory=True, travellerId=12.0),
 Row(features=DenseVector([770.3568, 59.9646, 59.3175, 78.7682, 377.0, 18.0, 13.0, 15.0, 16.78]), deliveryFee=4.059784018164918, Satisfactory=True, travellerId=4.0)]

In [7]:
# Rescale every component of 'features' into the 'scaled' column with a min-max scaler.
# NOTE(review): the scaler is fitted on all rows of VA_requests, i.e. before the
# train/test split further down, so the test rows contribute to the min/max values.
mms = MinMaxScaler(inputCol="features", outputCol="scaled")

mms_fit = mms.fit(VA_requests)
transformed_data = mms_fit.transform(VA_requests)
transformed_data.toPandas()

Unnamed: 0,features,deliveryFee,Satisfactory,travellerId,scaled
0,"[695.5684257, 54.14304626, 53.55876878, 71.121...",4.025130,True,4.0,"[0.8676431001135353, 0.8676431002078994, 0.867..."
1,"[414.9576716, 32.30030516, 31.95174072, 42.429...",3.810602,True,12.0,"[0.37103150698417825, 0.37103150708283666, 0.3..."
2,"[770.3567915, 59.96457265, 59.31747294, 78.768...",4.059784,True,4.0,"[1.0, 1.0, 1.0, 1.0, 0.017564591502390287, 0.0..."
3,"[695.5684257, 54.14304626, 53.55876878, 71.121...",6.621363,True,3.0,"[0.8676431001135353, 0.8676431002078994, 0.867..."
4,"[506.9220158, 39.45880971, 39.03299521, 51.832...",4.589901,True,15.0,"[0.533785638167558, 0.5337856382009548, 0.5337..."
...,...,...,...,...,...
215,"[770.3567915, 59.96457265, 59.31747294, 78.768...",4.648742,True,,"[1.0, 1.0, 1.0, 1.0, 0.013428586775527742, 0.0..."
216,"[770.3567915, 59.96457265, 59.31747294, 78.768...",5.144249,True,,"[1.0, 1.0, 1.0, 1.0, 0.018800021485738838, 0.0..."
217,"[695.5684257, 54.14304626, 53.55876878, 71.121...",5.204829,True,,"[0.8676431001135353, 0.8676431002078994, 0.867..."
218,"[770.3567915, 59.96457265, 59.31747294, 78.768...",6.647297,True,,"[1.0, 1.0, 1.0, 1.0, 0.06714293387763871, 0.07..."


### 2.2 Train-test split

In [8]:
# Training set: requests that already have a traveller assigned AND whose fee was
# marked as satisfactory by the customer.
# Test set: requests that have no traveller assigned yet.
#
# The two conditions are built separately and combined with explicit parentheses.
# In Python `&` binds tighter than `==`, so an unparenthesised `a & b == True`
# compares the whole conjunction with True; spelling the sub-conditions out
# selects the same rows without relying on that precedence rule.
has_traveller = transformed_data.travellerId.isNotNull()
fee_is_satisfactory = (transformed_data.Satisfactory == True)  # Column comparison, not a Python bool

training_set = transformed_data.filter(has_traveller & fee_is_satisfactory)
testing_set = transformed_data.filter(transformed_data.travellerId.isNull())

In [9]:
# Preview the first five rows of the training set.
training_set.show(5)

+--------------------+-----------------+------------+-----------+--------------------+
|            features|      deliveryFee|Satisfactory|travellerId|              scaled|
+--------------------+-----------------+------------+-----------+--------------------+
|[695.5684257,54.1...|4.025129718646807|        true|        4.0|[0.86764310011353...|
|[414.9576716,32.3...|3.810601927385716|        true|       12.0|[0.37103150698417...|
|[770.3567915,59.9...|4.059784018164918|        true|        4.0|[1.0,1.0,1.0,1.0,...|
|[695.5684257,54.1...|6.621363124128837|        true|        3.0|[0.86764310011353...|
|[506.9220158,39.4...|4.589901468619425|        true|       15.0|[0.53378563816755...|
+--------------------+-----------------+------------+-----------+--------------------+
only showing top 5 rows



In [10]:
# The test set keeps its recorded deliveryFee (the actual fee).
# These are the requests whose fee the model has to predict; the predictions are
# then compared against those actual values.
testing_set.show(5)

+--------------------+------------------+------------+-----------+--------------------+
|            features|       deliveryFee|Satisfactory|travellerId|              scaled|
+--------------------+------------------+------------+-----------+--------------------+
|[695.5684257,54.1...|  5.33256521955057|        true|       null|[0.86764310011353...|
|[506.9220158,39.4...|3.5762171241116745|       false|       null|[0.53378563816755...|
|[506.9220158,39.4...|   3.0061236140675|        true|       null|[0.53378563816755...|
|[770.3567915,59.9...| 5.602522124038321|        true|       null|[1.0,1.0,1.0,1.0,...|
|[205.3060396,15.9...| 2.935999887636217|        true|       null|[0.0,0.0,0.0,0.0,...|
+--------------------+------------------+------------+-----------+--------------------+
only showing top 5 rows



## 3. Regression Model

### 3.1 Simple Regression Model

In [31]:
# Baseline: plain linear regression of deliveryFee on the min-max scaled features.
# NOTE(review): in the rows shown earlier the scaled distance and the three fuel-price
# features coincide, so these four predictors look (almost) perfectly collinear and
# their individual coefficients should not be interpreted separately — confirm.
lr = LinearRegression(labelCol='deliveryFee', featuresCol='scaled')
lr_model = lr.fit(training_set)

lr_coefficients = np.round(lr_model.coefficients, 4)
lr_intercept = np.round(lr_model.intercept, 4)
print("Coeficient of the linear model: ", lr_coefficients)
print("Intercept of the linear model:", lr_intercept)

Coeficient of the linear model:  [ 0.5295  0.5295  0.5295  0.5295 11.1815  0.3464 -1.5163 -0.5457  2.8323]
Intercept of the linear model: 2.7274


In [43]:
# Goodness of fit of the baseline model on the data it was trained on.
lr_train_summary = lr_model.summary
lr_train_rmse = np.round(lr_train_summary.rootMeanSquaredError, 4)
lr_train_mae = np.round(lr_train_summary.meanAbsoluteError, 4)
lr_train_r2 = np.round(lr_train_summary.r2, 4)

print("RMSE error of the linear regression model on the training set: ", lr_train_rmse)
print("MAE error of the linear regression model on the training set: ", lr_train_mae)
print("R Square of the linear regression model on the training set: ", lr_train_r2)

RMSE error of the linear regression model on the training set:  1.0053
MAE error of the linear regression model on the training set:  0.8235
R Square of the linear regression model on the training set:  0.8691


In [44]:
# Evaluate the baseline model on the test set once and reuse the resulting summary
# for all three metrics (the original called evaluate() separately per metric).
lr_test_summary = lr_model.evaluate(testing_set)

print("RMSE error of the linear regression model on the test set: ",np.round(lr_test_summary.rootMeanSquaredError,4))
print("MAE error of the linear regression model on the test set: ",np.round(lr_test_summary.meanAbsoluteError,4))
print("R Square of the linear regression model on the test set: ",np.round(lr_test_summary.r2,4))

RMSE error of the linear regression model on the test set:  0.9563
MAE error of the linear regression model on the test set:  0.8536
R Square of the linear regression model on the test set:  0.802


### 3.2 Lasso Regression Model

In [18]:
# Lasso estimator: linear regression with elasticNetParam=1 (pure L1 penalty).
# The regularization strength (regParam) is chosen by the cross-validation grid below.
lasso_regression = LinearRegression(
    featuresCol="scaled",
    labelCol="deliveryFee",
    elasticNetParam=1,
)

In [45]:
# 5-fold cross-validation over the regularization strength of the lasso model.
# All candidate values are written as floats, matching the type of regParam.
grid_cv_lasso = (
    ParamGridBuilder()
    .addGrid(lasso_regression.regParam, [0.001, 0.01, 0.1, 1.0, 10.0])
    .build()
)

# Candidates are compared by RMSE between 'prediction' and 'deliveryFee'.
evaluator_lasso = RegressionEvaluator(predictionCol="prediction", labelCol="deliveryFee", metricName="rmse")

# A fixed seed makes the random fold assignment — and therefore the selected
# model — reproducible across notebook runs.
cross_validator_lasso = CrossValidator(estimator=lasso_regression,
                                       estimatorParamMaps=grid_cv_lasso,
                                       evaluator=evaluator_lasso,
                                       numFolds=5,
                                       seed=42)

cv_lasso = cross_validator_lasso.fit(training_set)
best_lasso_regression = cv_lasso.bestModel

In [37]:
# Parameters of the lasso model selected by cross-validation.
lasso_coefficients = np.round(best_lasso_regression.coefficients, 4)
lasso_intercept = np.round(best_lasso_regression.intercept, 4)

print("Coeficient of the best lasso model: ", lasso_coefficients)
print("Intercept of the best lasso model:", lasso_intercept)

Coeficient of the best lasso model:  [ 0.5202  0.5202  0.5202  0.5202  9.594   0.2763 -1.1783 -0.1595  4.0743]
Intercept of the best lasso model: 2.7011


In [42]:
# Goodness of fit of the selected lasso model on the training set.
lasso_train_summary = best_lasso_regression.summary
lasso_train_rmse = np.round(lasso_train_summary.rootMeanSquaredError, 4)
lasso_train_mae = np.round(lasso_train_summary.meanAbsoluteError, 4)
lasso_train_r2 = np.round(lasso_train_summary.r2, 4)

print("RMSE error of the lasso model on the training set: ", lasso_train_rmse)
print("MAE error of the lasso model on the training set: ", lasso_train_mae)
print("R Square of the lasso model on the training set: ", lasso_train_r2)

RMSE error of the lasso model on the training set:  1.0067
MAE error of the lasso model on the training set:  0.8247
R Square of the lasso model on the training set:  0.8687


In [47]:
# Evaluate the selected lasso model on the test set once and reuse the resulting
# summary for all three metrics (the original called evaluate() separately per metric).
lasso_test_summary = best_lasso_regression.evaluate(testing_set)

print("RMSE error of the lasso model on the test set: ",np.round(lasso_test_summary.rootMeanSquaredError,4))
print("MAE error of the lasso model on the test set: ",np.round(lasso_test_summary.meanAbsoluteError,4))
print("R Square of the lasso model on the test set: ",np.round(lasso_test_summary.r2,4))

RMSE error of the lasso model on the test set:  0.9554
MAE error of the lasso model on the test set:  0.8512
R Square of the lasso model on the test set:  0.8024


### 3.3 Ridge Regression Model

In [38]:
# Ridge estimator: linear regression with elasticNetParam=0 (pure L2 penalty).
# The regularization strength (regParam) is chosen by the cross-validation grid below.
ridge_regression = LinearRegression(
    featuresCol="scaled",
    labelCol="deliveryFee",
    elasticNetParam=0,
)

In [46]:
# 5-fold cross-validation over the regularization strength of the ridge model.
# All candidate values are written as floats, matching the type of regParam.
grid_cv_ridge = (
    ParamGridBuilder()
    .addGrid(ridge_regression.regParam, [0.001, 0.01, 0.1, 1.0, 10.0])
    .build()
)

# Candidates are compared by RMSE between 'prediction' and 'deliveryFee'.
evaluator_ridge = RegressionEvaluator(predictionCol="prediction", labelCol="deliveryFee", metricName="rmse")

# A fixed seed makes the random fold assignment — and therefore the selected
# model — reproducible across notebook runs.
cross_validator_ridge = CrossValidator(estimator=ridge_regression,
                                       estimatorParamMaps=grid_cv_ridge,
                                       evaluator=evaluator_ridge,
                                       numFolds=5,
                                       seed=42)

cv_ridge = cross_validator_ridge.fit(training_set)
best_ridge_regression = cv_ridge.bestModel

In [40]:
# Parameters of the ridge model selected by cross-validation.
ridge_coefficients = np.round(best_ridge_regression.coefficients, 4)
ridge_intercept = np.round(best_ridge_regression.intercept, 4)

print("Coeficient of the best ridge model: ", ridge_coefficients)
print("Intercept of the best ridge model:", ridge_intercept)

Coeficient of the best ridge model:  [ 0.5293  0.5293  0.5293  0.5293 10.9319  0.3519 -1.4903 -0.519   3.062 ]
Intercept of the best ridge model: 2.7267


In [41]:
# Goodness of fit of the selected ridge model on the training set.
ridge_train_summary = best_ridge_regression.summary
ridge_train_rmse = np.round(ridge_train_summary.rootMeanSquaredError, 4)
ridge_train_mae = np.round(ridge_train_summary.meanAbsoluteError, 4)
ridge_train_r2 = np.round(ridge_train_summary.r2, 4)

print("RMSE error of the ridge model on the training set: ", ridge_train_rmse)
print("MAE error of the ridge model on the training set: ", ridge_train_mae)
print("R Square of the ridge model on the training set: ", ridge_train_r2)

RMSE error of the ridge model on the training set:  1.0053
MAE error of the ridge model on the training set:  0.8235
R Square of the ridge model on the training set:  0.869


In [48]:
# Evaluate the selected ridge model on the test set once and reuse the resulting
# summary for all three metrics (the original called evaluate() separately per metric).
ridge_test_summary = best_ridge_regression.evaluate(testing_set)

print("RMSE error of the ridge model on the test set: ",np.round(ridge_test_summary.rootMeanSquaredError,4))
print("MAE error of the ridge model on the test set: ",np.round(ridge_test_summary.meanAbsoluteError,4))
print("R Square of the ridge model on the test set: ",np.round(ridge_test_summary.r2,4))

RMSE error of the ridge model on the test set:  0.9555
MAE error of the ridge model on the test set:  0.8525
R Square of the ridge model on the test set:  0.8023
