In [1]:
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from statsmodels.formula.api import ols, glm
import statsmodels.api as sm
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
spark = (
 SparkSession.builder.appName("project 1 LR")
 .config("spark.sql.repl.eagerEval.enabled", True)
 .config("spark.executor.memory","4G")
 .config("spark.driver.memory","2G")
 .config("spark.sql.parquet.cacheMetadata", "true")
 .config("spark.sql.session.timeZone", "Etc/UTC")
 .config('spark.driver.maxResultSize', '2048m')
 .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/21 00:38:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/21 00:38:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/08/21 00:38:19 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/08/21 00:38:19 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
data = spark.read.parquet('../data/curated/merged_sdf.parquet')

                                                                                

## Adjusting the columns to train the model ##

In [4]:
model_feature = ['duration', 'extra', 'Weekend', 'Airport', 'Congestion','tip_amount']
selected_data = data.select(*[col(col_name) for col_name in model_feature])

In [5]:
input_column = ['duration', 'extra', 'Weekend', 'Airport', 'Congestion']
assembler = VectorAssembler(inputCols=input_column, outputCol='features')
assembled_data = assembler.transform(selected_data)

# Split the data into training and testing sets
train_data, test_data = assembled_data.randomSplit([0.8, 0.2], seed=0)

## Model training ##

In [6]:
# Take 3 mins to run
for i in [0.0001, 0.001, 0.01]:
    lm = LinearRegression(
        featuresCol='features', 
        labelCol='tip_amount',
        regParam=i, 
        elasticNetParam=0.5
    ).fit(train_data)
    predictions = lm.transform(test_data)
    evaluator = RegressionEvaluator(labelCol='tip_amount', predictionCol='prediction', metricName='rmse')
    rmse = evaluator.evaluate(predictions)
    print('R_sqr:', lm.summary.r2, ", Root Mean Squared Error:", rmse, ", regParam:", i)



23/08/21 00:38:43 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

R_sqr: 0.4741546893361698 , Root Mean Squared Error: 2.798349130756582 , regParam: 0.0001


                                                                                

R_sqr: 0.47415457081858525 , Root Mean Squared Error: 2.798348026354654 , regParam: 0.001




R_sqr: 0.4741427504745639 , Root Mean Squared Error: 2.7983656080421015 , regParam: 0.01


                                                                                