In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import time
import datetime
# from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("MySparkSession") \
                            .config("spark.master", "local[10]") \
                            .config("spark.driver.cores", "1") \
                            .config("spark.driver.memory", "1g") \
                            .config("spark.memory.offHeap.enabled", "true") \
                            .config("spark.memory.offHeap.size", "1g") \
                            .getOrCreate()


df = spark.read.parquet('../time_series/TS6/TS6-201902_202306.parquet')
df = df.drop("is_holiday", 'TempTime', '__index_level_0__')
df = df.repartition(4)
mapping = {'yellow': 1, 
           'lyft': 2, 
           'uber': 3}

for key, value in mapping.items():
    df = df.withColumn("Name", when(df["Name"] == key, value).otherwise(df["Name"]))


for c in ['Name', 'year', 'month', 'day', 'hour', 'PULocationID']:
    df = df.withColumn(c, col(c).cast('integer'))
feature_variables = df.drop('count', 'countN')

inputcols = feature_variables.columns

assembler = VectorAssembler(inputCols=inputcols, outputCol="features")

df_valid = df.filter((col("year") == 2023) & (col("month") > 3))


df.filter(
    (col("year") == 2022) |
    ((col("year") == 2023) & (col("month") <= 3))
)

DataFrame[Name: int, year: int, month: int, day: int, hour: int, PULocationID: int, weekday: int, count: int, lat: double, lon: double, countN: double]

In [2]:
output = assembler.transform(df)

scaler = StandardScaler(inputCol="features", outputCol="scaled_features",
                        withStd=True, withMean=False)

scaled_data = scaler.fit(output).transform(output)

scaled_data.select('scaled_features', 'countN')

final_data = scaled_data.select('scaled_features', 'countN')

train, test = final_data.randomSplit([0.9, 0.1], seed=42)

In [3]:
print("start時間:", datetime.datetime.now())
start_time = time.time()

gbtr = GBTRegressor(featuresCol='scaled_features', labelCol='countN', maxDepth=10, maxIter=100, stepSize=0.1, subsamplingRate=0.8, cacheNodeIds=True, seed=1, maxMemoryInMB=10240, maxBins=64)

gbtr_model = gbtr.fit(train)

y_pred = gbtr_model.transform(test)

y_pred.select('countN', 'prediction')

end_time = time.time()
execution_time = end_time - start_time
print("執行時間:", execution_time, "秒")

start時間: 2023-09-20 00:37:07.490677
執行時間: 29352.56973004341 秒


In [4]:
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='countN')

r2 = evaluator.evaluate(y_pred, {evaluator.metricName: 'r2'})
mae = evaluator.evaluate(y_pred, {evaluator.metricName: 'mae'})
rmse = evaluator.evaluate(y_pred, {evaluator.metricName: 'rmse'})

print(f'R2: {r2}')
print(f'MAE: {mae}')
print(f'RMSE: {rmse}')

R2: 0.950129150365152
MAE: 6.912925560613276
RMSE: 13.809608929684389


In [5]:
output_df_valid = assembler.transform(df_valid)


scaler = StandardScaler(inputCol="features", outputCol="scaled_features",
                        withStd=True, withMean=False)

scaled_data = scaler.fit(output_df_valid).transform(output_df_valid)

valid_data = scaled_data.select('scaled_features', 'count')



y_pred_2023 = gbtr_model.transform(valid_data)
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='count')

r2 = evaluator.evaluate(y_pred_2023, {evaluator.metricName: 'r2'})
mae = evaluator.evaluate(y_pred_2023, {evaluator.metricName: 'mae'})
rmse = evaluator.evaluate(y_pred_2023, {evaluator.metricName: 'rmse'})

print(f'R2: {r2}')
print(f'MAE: {mae}')
print(f'RMSE: {rmse}')

R2: 0.7138127726500585
MAE: 16.040662946842154
RMSE: 35.51617972982376
