In [25]:
import getpass
import os

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as fn
from pyspark.conf import SparkConf

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
import pandas as pd
# Show every column when a pandas frame is rendered (no column truncation).
pd.options.display.max_columns = None

In [14]:
# Read data from Postgres using PySpark.

# Path of the PostgreSQL JDBC driver jar; it has to be visible to the session,
# the driver and the executors, so the same path is passed to all three settings.
JDBC_JAR = "./postgresql_jdbc/postgresql-42.6.0.jar"

# Never keep the database password in the notebook: read it from the
# POSTGRES_PASSWORD environment variable and fall back to an interactive
# prompt when the variable is missing or empty.
pg_password = os.environ.get("POSTGRES_PASSWORD") or getpass.getpass("Postgres password: ")

# NOTE: despite its name, `sc` is a SparkSession (not a SparkContext); the name
# is kept because later cells refer to it.
sc = (
    SparkSession.builder
    .config("spark.jars", JDBC_JAR)
    .config("spark.driver.extraClassPath", JDBC_JAR)
    .config("spark.executor.extraClassPath", JDBC_JAR)
    .getOrCreate()
)

# SQLContext expects a SparkContext as first argument; pass the session's
# context explicitly and hand over the session itself as the second argument.
sqlContext = SQLContext(sc.sparkContext, sc)

# Load the `new_requests` table of the `formatted_zone` database over JDBC.
requests = (
    sc.read
    .format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/formatted_zone")
    .option("dbtable", "new_requests")
    .option("user", "postgres")
    .option("password", pg_password)
    .option("driver", "org.postgresql.Driver")
    .load()
)





## 1. Exploration

In [15]:
# Print the column names, types and nullability of the table loaded from Postgres.
requests.printSchema()

root
 |-- initializationUserId: long (nullable = true)
 |-- collectionUserId: long (nullable = true)
 |-- travellerId: double (nullable = true)
 |-- productId: string (nullable = true)
 |-- dateToDeliver: string (nullable = true)
 |-- dateDelivered: string (nullable = true)
 |-- requestDate: string (nullable = true)
 |-- pickUpAddress: long (nullable = true)
 |-- collectionAddress: long (nullable = true)
 |-- description: double (nullable = true)
 |-- deliveryFee: double (nullable = true)
 |-- startCity: string (nullable = true)
 |-- endCity: string (nullable = true)
 |-- Distance: double (nullable = true)
 |-- lpg_price: double (nullable = true)
 |-- diesel_price: double (nullable = true)
 |-- gasoline_price: double (nullable = true)
 |-- product_weight_g: long (nullable = true)
 |-- product_length_cm: long (nullable = true)
 |-- product_height_cm: long (nullable = true)
 |-- product_width_cm: long (nullable = true)
 |-- dhl_fee: double (nullable = true)
 |-- Satisfactory: boolean (nullable = true)

In [16]:
# Peek at a single record: restrict to one row, then bring it back as a list of Row objects.
requests.limit(1).collect()

[Row(initializationUserId=26, collectionUserId=38, travellerId=4.0, productId='5773723f7819ffb88c5c758f8f8c183c', dateToDeliver='07/24/2023 05:52 AM', dateDelivered='07/20/2023 09:36 PM', requestDate='07/18/2023 05:52 AM', pickUpAddress=26, collectionAddress=38, description=None, deliveryFee=4.025129718646807, startCity='Palma', endCity='Málaga', Distance=695.5684257, lpg_price=54.14304626, diesel_price=53.55876878, gasoline_price=71.12117596, product_weight_g=700, product_length_cm=20, product_height_cm=20, product_width_cm=20, dhl_fee=17.05, Satisfactory=True)]

In [17]:
# Convert the Spark DataFrame to pandas so the notebook renders it as a table.
# NOTE(review): this converts the whole table, which is acceptable for the few
# hundred rows shown in the output below; consider `.limit(n)` before
# `.toPandas()` if the table grows.
requests.toPandas()

Unnamed: 0,initializationUserId,collectionUserId,travellerId,productId,dateToDeliver,dateDelivered,requestDate,pickUpAddress,collectionAddress,description,deliveryFee,startCity,endCity,Distance,lpg_price,diesel_price,gasoline_price,product_weight_g,product_length_cm,product_height_cm,product_width_cm,dhl_fee,Satisfactory
0,26,38,4.0,5773723f7819ffb88c5c758f8f8c183c,07/24/2023 05:52 AM,07/20/2023 09:36 PM,07/18/2023 05:52 AM,26,38,,4.025130,Palma,Málaga,695.568426,54.143046,53.558769,71.121176,700,20,20,20,17.05,True
1,32,24,12.0,2d6676e04aac85ce66dcf8d0529f4012,10/03/2023 07:34 AM,09/26/2023 03:13 AM,09/21/2023 07:34 AM,32,24,,3.810602,Málaga,Madrid,414.957672,32.300305,31.951741,42.429007,500,20,12,15,16.78,True
2,37,38,4.0,8da90b37f0fb171b4877c124f965b1f6,06/15/2023 07:14 PM,06/07/2023 10:21 PM,06/02/2023 07:14 PM,37,38,,4.059784,Barcelona,Málaga,770.356791,59.964573,59.317473,78.768212,377,18,13,15,16.78,True
3,31,23,3.0,60cf4effb6947283fa638d570bd07318,08/02/2023 02:05 PM,07/25/2023 09:06 AM,07/22/2023 02:05 PM,31,23,,6.621363,Málaga,Palma,695.568426,54.143046,53.558769,71.121176,4500,80,70,28,22.48,True
4,22,39,15.0,fc37e09bb4dadd1eef5e92e954b1c72e,09/12/2023 07:17 AM,09/07/2023 12:01 PM,09/02/2023 07:17 AM,22,39,,4.589901,Madrid,Barcelona,506.922016,39.458810,39.032995,51.832269,500,30,6,23,16.78,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,25,35,,fc5c33cddd49638580cfe9fec90aa943,04/23/2023 08:33 AM,,04/15/2023 08:33 AM,25,35,,4.648742,Málaga,Barcelona,770.356791,59.964573,59.317473,78.768212,300,16,8,11,16.78,True
216,38,27,,1175dc374bce5c9d93c4d4281d8d85d7,04/18/2023 05:29 AM,,04/11/2023 05:29 AM,38,27,,5.144249,Málaga,Barcelona,770.356791,59.964573,59.317473,78.768212,400,18,18,18,16.78,True
217,26,31,,6a2fb4dd53d2cdb88e0432f1284a004c,04/20/2023 06:28 PM,,04/14/2023 06:28 PM,26,31,,5.204829,Palma,Málaga,695.568426,54.143046,53.558769,71.121176,400,27,5,20,17.05,True
218,36,25,,dc582e9ac5036846acfeeb3093b17aa7,04/19/2023 09:46 AM,,04/12/2023 09:46 AM,36,25,,6.647297,Barcelona,Málaga,770.356791,59.964573,59.317473,78.768212,1300,22,14,14,16.78,True


## 2. Regression

In [23]:
# Keep only the requests flagged as satisfactory for the regression.
requests_regression_data = requests.where(requests["Satisfactory"] == True)

# Predictor columns; their order here is the order of the entries in the
# assembled `features` vector.
feature_columns = [
    "Distance",
    "lpg_price",
    "diesel_price",
    "gasoline_price",
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
    "dhl_fee",
]
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Assemble the predictors into one vector column and keep it next to the
# label (`deliveryFee`); every other column is dropped.
VA_requests = (
    vectorAssembler
    .transform(requests_regression_data)
    .select("features", "deliveryFee")
)
VA_requests.take(3)

[Row(features=DenseVector([695.5684, 54.143, 53.5588, 71.1212, 700.0, 20.0, 20.0, 20.0, 17.05]), deliveryFee=4.025129718646807),
 Row(features=DenseVector([414.9577, 32.3003, 31.9517, 42.429, 500.0, 20.0, 12.0, 15.0, 16.78]), deliveryFee=3.810601927385716),
 Row(features=DenseVector([770.3568, 59.9646, 59.3175, 78.7682, 377.0, 18.0, 13.0, 15.0, 16.78]), deliveryFee=4.059784018164918)]

In [24]:
# 80/20 train/test split with a fixed seed for reproducibility.
# Split once and unpack both parts: calling randomSplit twice with identical
# arguments repeats the work and only stays consistent as long as both calls
# are edited together.
training_set, testing_set = VA_requests.randomSplit([0.8, 0.2], seed=42)

In [37]:
# Fit a linear regression that predicts `deliveryFee` from the assembled
# feature vector, with at most 100 solver iterations.
lr = LinearRegression(featuresCol='features', labelCol='deliveryFee', maxIter=100)
lr_model = lr.fit(training_set)

# The reported metric is the root mean squared error (RMSE), not the MSE,
# so the label says so.
print("RMSE of the training set: ", lr_model.summary.rootMeanSquaredError)

MSE error of the training set:  0.9818843088854207


In [36]:
# Evaluate the fitted model on the held-out split. The reported metric is the
# root mean squared error (RMSE), not the MSE, so the label says so.
print("RMSE of the test set: ", lr_model.evaluate(testing_set).rootMeanSquaredError)

MSE error of the test set:  1.1155435912343585
