<a href="https://colab.research.google.com/github/VladAndrei25/BigData-ChicagoTaxi-PySpark/blob/main/LA_BigData_ChicagoTaxi_PySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession

# Create the Spark session used by every later cell, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("LA-BigData-ChicagoTaxi")
    .getOrCreate()
)

# Bare last expression: show the session summary as the cell output.
spark


In [None]:
# Mount Google Drive at /content/drive so that files stored there (the trip
# CSV is read from /content/drive/MyDrive/...) are reachable by path.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
from pyspark.sql.types import DoubleType, IntegerType, StructField, StructType

# Location of the exported trip data on the mounted Drive.
CSV_PATH = "/content/drive/MyDrive/bq-results-20260128-103549-1769596661440/bq-results-20260128-103549-1769596661440.csv"

# Explicit schema, listed in the file's column order. Declaring it up front
# replaces inferSchema=True, which needs an extra full pass over the file and
# lets the column types drift with the data. Types match what inference
# previously reported for this file.
taxi_schema = StructType([
    StructField("trip_seconds", IntegerType(), True),
    StructField("trip_miles", DoubleType(), True),
    StructField("fare", DoubleType(), True),
    StructField("tips", DoubleType(), True),
    StructField("tolls", DoubleType(), True),
    StructField("extras", DoubleType(), True),
    StructField("trip_total", DoubleType(), True),
    StructField("hour", IntegerType(), True),
    StructField("dayofweek", IntegerType(), True),
    StructField("pickup_community_area", IntegerType(), True),
    StructField("dropoff_community_area", IntegerType(), True),
])

df = spark.read.csv(
    CSV_PATH,
    header=True,
    schema=taxi_schema,
    # Validate the header names against the schema instead of applying the
    # schema blindly by position, so a reordered export fails loudly.
    enforceSchema=False,
)

# Quick sanity check of the column types and the first few rows.
df.printSchema()
df.show(5)


root
 |-- trip_seconds: integer (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- fare: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- extras: double (nullable = true)
 |-- trip_total: double (nullable = true)
 |-- hour: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- pickup_community_area: integer (nullable = true)
 |-- dropoff_community_area: integer (nullable = true)

+------------+----------+----+----+-----+------+----------+----+---------+---------------------+----------------------+
|trip_seconds|trip_miles|fare|tips|tolls|extras|trip_total|hour|dayofweek|pickup_community_area|dropoff_community_area|
+------------+----------+----+----+-----+------+----------+----+---------+---------------------+----------------------+
|         420|       1.2| 0.0| 0.0|  0.0|   3.0|       3.0|  21|        5|                 NULL|                  NULL|
|         480|       5.5| 0.0| 0.0|  0.0|   0.0|  

In [None]:
# Columns that must be present on a row for it to be kept: the four model
# inputs plus trip_total. A null in any one of them drops the row.
required_cols = ["trip_seconds", "trip_miles", "trip_total", "hour", "dayofweek"]
df_clean = df.na.drop(subset=required_cols)

# Number of rows that survive the null filter.
df_clean.count()


500000

In [None]:
# Per-column summary statistics of the cleaned data, printed as a table.
summary_stats = df_clean.describe()
summary_stats.show()


+-------+------------------+------------------+-----------------+------------------+--------------------+------------------+-----------------+-----------------+------------------+---------------------+----------------------+
|summary|      trip_seconds|        trip_miles|             fare|              tips|               tolls|            extras|       trip_total|             hour|         dayofweek|pickup_community_area|dropoff_community_area|
+-------+------------------+------------------+-----------------+------------------+--------------------+------------------+-----------------+-----------------+------------------+---------------------+----------------------+
|  count|            500000|            500000|           500000|            500000|              403522|            500000|           500000|           500000|            500000|               116166|                115024|
|   mean|        525.158912|  1.95775733999993|8.045245359995764|0.7597235799999992|2.52501722334841

In [None]:
# Number of trips for each value of `hour`, in ascending hour order.
# show() prints only 20 rows by default, which cut the table off after hour 19;
# request 24 rows so every hour of the day (0-23) is displayed.
(
    df_clean
    .groupBy("hour")
    .count()
    .orderBy("hour")
    .show(24)
)


+----+-----+
|hour|count|
+----+-----+
|   0|15677|
|   1|12957|
|   2| 9536|
|   3| 6045|
|   4| 3713|
|   5| 4011|
|   6| 8442|
|   7|17259|
|   8|23003|
|   9|23260|
|  10|22722|
|  11|26448|
|  12|27817|
|  13|27919|
|  14|26790|
|  15|27322|
|  16|28033|
|  17|29143|
|  18|33544|
|  19|31855|
+----+-----+
only showing top 20 rows


In [None]:
# Mean trip_total for each dayofweek value, listed in ascending day order.
avg_total_by_day = df_clean.groupBy("dayofweek").avg("trip_total")
avg_total_by_day.orderBy("dayofweek").show()


+---------+-----------------+
|dayofweek|  avg(trip_total)|
+---------+-----------------+
|        1|9.518343297532272|
|        2|9.670749022313831|
|        3|9.595217970843867|
|        4|9.427067522093214|
|        5|9.634596857408729|
|        6|9.070756885255374|
|        7|8.672649072003612|
+---------+-----------------+



In [None]:
# Correlation between distance travelled and the total charged.
miles_vs_total = df_clean.select("trip_miles", "trip_total")
miles_vs_total.corr("trip_miles", "trip_total")


0.9156996288673264

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


In [None]:
# Predictor columns handed to the models.
feature_cols = ["trip_seconds", "trip_miles", "hour", "dayofweek"]

# Pack the predictor columns into the single vector column "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data_ml = assembler.transform(df_clean)


In [None]:
# 80/20 random split into training and held-out data; the fixed seed keeps the
# split repeatable between runs.
splits = data_ml.randomSplit(weights=[0.8, 0.2], seed=42)
train_data, test_data = splits


In [None]:
# Linear regression of trip_total on the assembled feature vector; every other
# hyperparameter is left at the library default.
lr = LinearRegression(featuresCol="features", labelCol="trip_total")

# Fit on the training split, then score the held-out split. transform() adds
# the model output as a new column on the returned DataFrame.
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)


In [None]:
# Two evaluators that differ only in the metric they report. Both compare the
# "prediction" column against the true trip_total.
evaluator_rmse, evaluator_r2 = (
    RegressionEvaluator(
        labelCol="trip_total",
        predictionCol="prediction",
        metricName=metric,
    )
    for metric in ("rmse", "r2")
)

# Score the linear-regression predictions on the held-out split.
lr_rmse = evaluator_rmse.evaluate(lr_predictions)
lr_r2 = evaluator_r2.evaluate(lr_predictions)

# Cell output: (RMSE, R2).
lr_rmse, lr_r2


(3.791822850265409, 0.838429304067866)

In [None]:
from pyspark.ml.regression import RandomForestRegressor

# Random forest on the same features and label as the linear model:
# 50 trees, each limited to depth 10, with a fixed seed.
rf = RandomForestRegressor(
    numTrees=50,
    maxDepth=10,
    seed=42,
    featuresCol="features",
    labelCol="trip_total",
)

# Fit on the training split and score the held-out split.
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)

# Reuse the shared evaluators, RMSE first and then R2.
rf_rmse, rf_r2 = (
    ev.evaluate(rf_predictions) for ev in (evaluator_rmse, evaluator_r2)
)

# Cell output: (RMSE, R2).
rf_rmse, rf_r2


(3.7279614352264896, 0.8438257823887226)

In [None]:
from pyspark.ml.regression import GBTRegressor

# Gradient-boosted trees on the same features and label as the other models:
# 50 boosting iterations of depth-5 trees, with a fixed seed.
gbt = GBTRegressor(
    maxIter=50,
    maxDepth=5,
    seed=42,
    featuresCol="features",
    labelCol="trip_total",
)

# Fit on the training split and score the held-out split.
gbt_model = gbt.fit(train_data)
gbt_predictions = gbt_model.transform(test_data)

# Reuse the shared evaluators, RMSE first and then R2.
gbt_rmse, gbt_r2 = (
    ev.evaluate(gbt_predictions) for ev in (evaluator_rmse, evaluator_r2)
)

# Cell output: (RMSE, R2).
gbt_rmse, gbt_r2


(3.726251233073477, 0.8439690393532588)

In [None]:
# One row per model with its held-out RMSE and R2, for side-by-side comparison.
results = spark.createDataFrame(
    [
        ("Linear Regression", lr_rmse, lr_r2),
        ("Random Forest", rf_rmse, rf_r2),
        ("Gradient Boosted Trees", gbt_rmse, gbt_r2),
    ],
    ["Model", "RMSE", "R2"],
)

# truncate=False: by default show() cuts strings longer than 20 characters,
# which rendered the last model name as "Gradient Boosted ...".
results.show(truncate=False)


+--------------------+------------------+------------------+
|               Model|              RMSE|                R2|
+--------------------+------------------+------------------+
|   Linear Regression| 3.791822850265409| 0.838429304067866|
|       Random Forest|3.7279614352264896|0.8438257823887226|
|Gradient Boosted ...| 3.726251233073477|0.8439690393532588|
+--------------------+------------------+------------------+

