In [7]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as fn
from pyspark.conf import SparkConf

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

from pyspark.ml.feature import MinMaxScaler
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
#Read Data From Postgres using PySpark
sc = SparkSession.builder\
    .config("spark.jars", "./postgresql_jdbc/postgresql-42.6.0.jar")\
    .config("spark.driver.extraClassPath", "./postgresql_jdbc/postgresql-42.6.0.jar")\
    .config("spark.executor.extraClassPath", "./postgresql_jdbc/postgresql-42.6.0.jar")\
    .getOrCreate()

sqlContext = SQLContext(sc)

requests = sc.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/formatted_zone") \
    .option("dbtable", "new_requests") \
    .option("user", "postgres") \
    .option("password", "995507288") \
    .option("driver", "org.postgresql.Driver") \
    .load()





## 1. Exploration

In [3]:
requests.printSchema()

root
 |-- initializationUserId: long (nullable = true)
 |-- collectionUserId: long (nullable = true)
 |-- travellerId: double (nullable = true)
 |-- productId: string (nullable = true)
 |-- dateToDeliver: string (nullable = true)
 |-- dateDelivered: string (nullable = true)
 |-- requestDate: string (nullable = true)
 |-- pickUpAddress: long (nullable = true)
 |-- collectionAddress: long (nullable = true)
 |-- description: double (nullable = true)
 |-- deliveryFee: double (nullable = true)
 |-- startCity: string (nullable = true)
 |-- endCity: string (nullable = true)
 |-- Distance: double (nullable = true)
 |-- lpg_price: double (nullable = true)
 |-- diesel_price: double (nullable = true)
 |-- gasoline_price: double (nullable = true)
 |-- product_weight_g: long (nullable = true)
 |-- product_length_cm: long (nullable = true)
 |-- product_height_cm: long (nullable = true)
 |-- product_width_cm: long (nullable = true)
 |-- dhl_fee: double (nullable = true)
 |-- Satisfactory: boolean (nu

In [4]:
requests.take(1)

[Row(initializationUserId=26, collectionUserId=38, travellerId=4.0, productId='5773723f7819ffb88c5c758f8f8c183c', dateToDeliver='07/24/2023 05:52 AM', dateDelivered='07/20/2023 09:36 PM', requestDate='07/18/2023 05:52 AM', pickUpAddress=26, collectionAddress=38, description=None, deliveryFee=4.025129718646807, startCity='Palma', endCity='Málaga', Distance=695.5684257, lpg_price=54.14304626, diesel_price=53.55876878, gasoline_price=71.12117596, product_weight_g=700, product_length_cm=20, product_height_cm=20, product_width_cm=20, dhl_fee=17.05, Satisfactory=True)]

In [5]:
requests.toPandas()

Unnamed: 0,initializationUserId,collectionUserId,travellerId,productId,dateToDeliver,dateDelivered,requestDate,pickUpAddress,collectionAddress,description,deliveryFee,startCity,endCity,Distance,lpg_price,diesel_price,gasoline_price,product_weight_g,product_length_cm,product_height_cm,product_width_cm,dhl_fee,Satisfactory
0,26,38,4.0,5773723f7819ffb88c5c758f8f8c183c,07/24/2023 05:52 AM,07/20/2023 09:36 PM,07/18/2023 05:52 AM,26,38,,4.025130,Palma,Málaga,695.568426,54.143046,53.558769,71.121176,700,20,20,20,17.05,True
1,32,24,12.0,2d6676e04aac85ce66dcf8d0529f4012,10/03/2023 07:34 AM,09/26/2023 03:13 AM,09/21/2023 07:34 AM,32,24,,3.810602,Málaga,Madrid,414.957672,32.300305,31.951741,42.429007,500,20,12,15,16.78,True
2,37,38,4.0,8da90b37f0fb171b4877c124f965b1f6,06/15/2023 07:14 PM,06/07/2023 10:21 PM,06/02/2023 07:14 PM,37,38,,4.059784,Barcelona,Málaga,770.356791,59.964573,59.317473,78.768212,377,18,13,15,16.78,True
3,31,23,3.0,60cf4effb6947283fa638d570bd07318,08/02/2023 02:05 PM,07/25/2023 09:06 AM,07/22/2023 02:05 PM,31,23,,6.621363,Málaga,Palma,695.568426,54.143046,53.558769,71.121176,4500,80,70,28,22.48,True
4,22,39,15.0,fc37e09bb4dadd1eef5e92e954b1c72e,09/12/2023 07:17 AM,09/07/2023 12:01 PM,09/02/2023 07:17 AM,22,39,,4.589901,Madrid,Barcelona,506.922016,39.458810,39.032995,51.832269,500,30,6,23,16.78,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,25,35,,fc5c33cddd49638580cfe9fec90aa943,04/23/2023 08:33 AM,,04/15/2023 08:33 AM,25,35,,4.648742,Málaga,Barcelona,770.356791,59.964573,59.317473,78.768212,300,16,8,11,16.78,True
216,38,27,,1175dc374bce5c9d93c4d4281d8d85d7,04/18/2023 05:29 AM,,04/11/2023 05:29 AM,38,27,,5.144249,Málaga,Barcelona,770.356791,59.964573,59.317473,78.768212,400,18,18,18,16.78,True
217,26,31,,6a2fb4dd53d2cdb88e0432f1284a004c,04/20/2023 06:28 PM,,04/14/2023 06:28 PM,26,31,,5.204829,Palma,Málaga,695.568426,54.143046,53.558769,71.121176,400,27,5,20,17.05,True
218,36,25,,dc582e9ac5036846acfeeb3093b17aa7,04/19/2023 09:46 AM,,04/12/2023 09:46 AM,36,25,,6.647297,Barcelona,Málaga,770.356791,59.964573,59.317473,78.768212,1300,22,14,14,16.78,True


## 2. Regression

### 2.1 Min Max Scaling

In [24]:
vectorAssembler = VectorAssembler(inputCols = ['Distance', 'lpg_price',"diesel_price","gasoline_price",
                                               "product_weight_g", "product_length_cm",
                                               "product_height_cm","product_width_cm",
                                              "dhl_fee"], outputCol = 'features')

VA_requests = vectorAssembler.transform(requests)
VA_requests = VA_requests.select(["features","deliveryFee","Satisfactory","travellerId"])
VA_requests.take(3)

[Row(features=DenseVector([695.5684, 54.143, 53.5588, 71.1212, 700.0, 20.0, 20.0, 20.0, 17.05]), deliveryFee=4.025129718646807, Satisfactory=True, travellerId=4.0),
 Row(features=DenseVector([414.9577, 32.3003, 31.9517, 42.429, 500.0, 20.0, 12.0, 15.0, 16.78]), deliveryFee=3.810601927385716, Satisfactory=True, travellerId=12.0),
 Row(features=DenseVector([770.3568, 59.9646, 59.3175, 78.7682, 377.0, 18.0, 13.0, 15.0, 16.78]), deliveryFee=4.059784018164918, Satisfactory=True, travellerId=4.0)]

In [25]:
mms = MinMaxScaler(outputCol="scaled")

mms.setInputCol("features")
mms_fit = mms.fit(VA_requests)
transformed_data = mms_fit.transform(VA_requests)
transformed_data.toPandas()

Unnamed: 0,features,deliveryFee,Satisfactory,travellerId,scaled
0,"[695.5684257, 54.14304626, 53.55876878, 71.121...",4.025130,True,4.0,"[0.8676431001135353, 0.8676431002078994, 0.867..."
1,"[414.9576716, 32.30030516, 31.95174072, 42.429...",3.810602,True,12.0,"[0.37103150698417825, 0.37103150708283666, 0.3..."
2,"[770.3567915, 59.96457265, 59.31747294, 78.768...",4.059784,True,4.0,"[1.0, 1.0, 1.0, 1.0, 0.017564591502390287, 0.0..."
3,"[695.5684257, 54.14304626, 53.55876878, 71.121...",6.621363,True,3.0,"[0.8676431001135353, 0.8676431002078994, 0.867..."
4,"[506.9220158, 39.45880971, 39.03299521, 51.832...",4.589901,True,15.0,"[0.533785638167558, 0.5337856382009548, 0.5337..."
...,...,...,...,...,...
215,"[770.3567915, 59.96457265, 59.31747294, 78.768...",4.648742,True,,"[1.0, 1.0, 1.0, 1.0, 0.013428586775527742, 0.0..."
216,"[770.3567915, 59.96457265, 59.31747294, 78.768...",5.144249,True,,"[1.0, 1.0, 1.0, 1.0, 0.018800021485738838, 0.0..."
217,"[695.5684257, 54.14304626, 53.55876878, 71.121...",5.204829,True,,"[0.8676431001135353, 0.8676431002078994, 0.867..."
218,"[770.3567915, 59.96457265, 59.31747294, 78.768...",6.647297,True,,"[1.0, 1.0, 1.0, 1.0, 0.06714293387763871, 0.07..."


### 2.2 Train-test split

In [30]:
training_set = transformed_data.filter(~transformed_data.travellerId.isNull() & transformed_data.Satisfactory == True)
testing_set =  transformed_data.filter(transformed_data.travellerId.isNull())

In [33]:
training_set.show(5)

+--------------------+-----------------+------------+-----------+--------------------+
|            features|      deliveryFee|Satisfactory|travellerId|              scaled|
+--------------------+-----------------+------------+-----------+--------------------+
|[695.5684257,54.1...|4.025129718646807|        true|        4.0|[0.86764310011353...|
|[414.9576716,32.3...|3.810601927385716|        true|       12.0|[0.37103150698417...|
|[770.3567915,59.9...|4.059784018164918|        true|        4.0|[1.0,1.0,1.0,1.0,...|
|[695.5684257,54.1...|6.621363124128837|        true|        3.0|[0.86764310011353...|
|[506.9220158,39.4...|4.589901468619425|        true|       15.0|[0.53378563816755...|
+--------------------+-----------------+------------+-----------+--------------------+
only showing top 5 rows



In [34]:
# The test data has the delivery fee but that's the actual delivery fee. 
# Those are requests that we need to predict the delivery fee and compare it between the actual
testing_set.show(5)

+--------------------+------------------+------------+-----------+--------------------+
|            features|       deliveryFee|Satisfactory|travellerId|              scaled|
+--------------------+------------------+------------+-----------+--------------------+
|[695.5684257,54.1...|  5.33256521955057|        true|       null|[0.86764310011353...|
|[506.9220158,39.4...|3.5762171241116745|       false|       null|[0.53378563816755...|
|[506.9220158,39.4...|   3.0061236140675|        true|       null|[0.53378563816755...|
|[770.3567915,59.9...| 5.602522124038321|        true|       null|[1.0,1.0,1.0,1.0,...|
|[205.3060396,15.9...| 2.935999887636217|        true|       null|[0.0,0.0,0.0,0.0,...|
+--------------------+------------------+------------+-----------+--------------------+
only showing top 5 rows



### 2.3 Simple Regression Model

In [35]:
lr = LinearRegression(featuresCol = 'scaled', labelCol='deliveryFee', maxIter=100)
lr_model = lr.fit(training_set)

print("MSE error of the training set: ",lr_model.summary.rootMeanSquaredError)

MSE error of the training set:  1.0053331130641567


In [36]:
print("MSE error of the test set: ",lr_model.evaluate(testing_set).rootMeanSquaredError)

MSE error of the test set:  0.9563014592267534
