# SparkML Logistic Regression Model
**Goal:** Build a logistic regression model on the green taxicab trip data set that predicts a trip's payment type (restricted to payment types 1 and 2).


### Create a Spark Session

In [1]:
import os
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Ensure native BLAS can be found
os.environ['LD_LIBRARY_PATH'] = '/usr/lib:/usr/lib/x86_64-linux-gnu'

# Direct JVM to use system BLAS (the same option is applied to driver and executors)
conf = SparkConf()
conf.set("spark.driver.extraJavaOptions",
         "-Dcom.github.fommil.netlib.NativeSystemBLAS.natives=/usr/lib/x86_64-linux-gnu")
conf.set("spark.executor.extraJavaOptions",
         "-Dcom.github.fommil.netlib.NativeSystemBLAS.natives=/usr/lib/x86_64-linux-gnu")

# Create (or reuse) the SparkSession and expose its SparkContext.
# getOrCreate() keeps this cell re-runnable: calling SparkContext(conf=conf) a
# second time in the same kernel raises "Cannot run multiple SparkContexts at once".
# NOTE: when a session already exists it is returned as-is, so the JVM options
# above only take effect on the first run after a kernel restart.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

25/03/27 16:15:23 WARN Utils: Your hostname, codespaces-35d966 resolves to a loopback address: 127.0.0.1; using 10.0.3.120 instead (on interface eth0)
25/03/27 16:15:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/27 16:15:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Load the input data file


In [2]:
# Read the trip-data parquet file into a Spark DataFrame
input_path = '../data/green_tripdata_2024-01.parquet'
raw_df = spark.read.parquet(input_path)

# Inspect what was loaded: column types, the first five rows, summary statistics
raw_df.printSchema()
raw_df.show(5)
raw_df.describe().show()

                                                                                

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- lpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- trip_type: long (nullable = true)
 |-- congestion_surcharge: double (nullable = true)

+--------+--------------------+---------------------+------------------+----------+-----

25/03/27 16:15:33 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 4:>                                                          (0 + 1) / 1]

+-------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+---------+---------------------+------------------+------------------+-------------------+--------------------+
|summary|           VendorID|store_and_fwd_flag|        RatecodeID|     PULocationID|     DOLocationID|   passenger_count|     trip_distance|       fare_amount|             extra|           mta_tax|        tip_amount|       tolls_amount|ehail_fee|improvement_surcharge|      total_amount|      payment_type|          trip_type|congestion_surcharge|
+-------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+---------+---------------------+------------------+--------------

                                                                                

### Clean and filter data

In [3]:
# keep only the rows whose payment_type is 1 or 2
input_df = raw_df.filter(raw_df.payment_type.isin(1, 2))
input_df.show()

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2024-01-01 00:46:55|  2024-01-01 00:58:25|                 N|         1|         236|         239|              1|         1.98|       12.8|  1.0|    0.

### Define Functions to vectorize input data

In [4]:
from pyspark.ml import Pipeline, Estimator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.sql import DataFrame as SparkDataFrame

# Prepare data for machine learning
# from dataframe categorical and numeric columns create label and features

def vectorizeCategories(labelCol: str, categoricalColumns:list[str]) -> list[Estimator]:
  """Build the indexing/encoding stages for the categorical features and the label.

  For every name in ``categoricalColumns`` two stages are produced, in order:
  a StringIndexer writing to ``<name>Index`` and a OneHotEncoder that reads
  that column and writes to ``<name>classVec``. A final StringIndexer maps
  ``labelCol`` to a column named ``label``.

  Args:
    labelCol: name of the column to predict.
    categoricalColumns: names of the categorical feature columns.

  Returns:
    The unfitted stages, in the order they should run inside a Pipeline.
  """
  stages: list[Estimator] = []
  for name in categoricalColumns:
    indexedCol = name + "Index"
    # index the category values, then one-hot encode the resulting index
    indexer = StringIndexer(inputCol=name, outputCol=indexedCol)
    encoder = OneHotEncoder(inputCols=[indexedCol], outputCols=[name + "classVec"])
    stages.extend([indexer, encoder])
  # the label is indexed last and always lands in the "label" column
  stages.append(StringIndexer(inputCol=labelCol, outputCol="label"))
  return stages

def createVectorizePipeline(labelCol: str, categoricalCols:list[str], numericCols:list[str]) -> Pipeline:
  """Assemble an unfitted Pipeline that produces ``label`` and ``features`` columns.

  The pipeline runs the stages from ``vectorizeCategories`` followed by a
  VectorAssembler that concatenates the ``<name>classVec`` column of each
  categorical column with the numeric columns into a single ``features`` vector.

  Args:
    labelCol: name of the column to predict.
    categoricalCols: names of the categorical feature columns.
    numericCols: names of the numeric feature columns, passed to the assembler as-is.

  Returns:
    The unfitted Pipeline.
  """
  stages = vectorizeCategories(labelCol, categoricalCols)
  # encoded categorical vectors come first, numeric columns after them
  encodedCols = [name + "classVec" for name in categoricalCols]
  stages.append(VectorAssembler(inputCols=encodedCols + numericCols, outputCol="features"))
  return Pipeline(stages=stages) # type: ignore


### Select category to predict and input features

In [5]:
# the column the model will learn to predict
label_col = 'payment_type'
# input features, grouped by how the pipeline treats them
categorical_feature_cols = ['PULocationID', 'DOLocationID', 'RatecodeID', 'VendorID']
numeric_feature_cols = ['trip_distance', 'passenger_count', 'fare_amount']
vectorizePipeline = createVectorizePipeline(
    label_col, categorical_feature_cols, numeric_feature_cols)

# fit the pipeline on the filtered data, then apply it to add "label" and "features"
vectorizedModel = vectorizePipeline.fit(input_df)
vectorized_df = vectorizedModel.transform(input_df)

# keep the model inputs together with the original columns they were built from
selectedcols = ["label", "features", label_col, *categorical_feature_cols, *numeric_feature_cols]
mldata_df = vectorized_df.select(selectedcols)

vectorized_df.select("label", "features", label_col).show(5, truncate=False)

+-----+------------------------------------------------------------------+------------+
|label|features                                                          |payment_type|
+-----+------------------------------------------------------------------+------------+
|0.0  |(457,[23,217,448,453,454,455,456],[1.0,1.0,1.0,1.0,1.98,1.0,12.8])|1           |
|0.0  |(457,[8,265,448,453,454,455,456],[1.0,1.0,1.0,1.0,6.54,5.0,30.3]) |1           |
|0.0  |(457,[0,223,448,453,454,455,456],[1.0,1.0,1.0,1.0,3.08,1.0,19.8]) |1           |
|1.0  |(457,[0,229,448,454,455,456],[1.0,1.0,1.0,2.4,1.0,14.2])          |2           |
|0.0  |(457,[0,252,448,453,454,455,456],[1.0,1.0,1.0,1.0,5.14,1.0,22.6]) |1           |
+-----+------------------------------------------------------------------+------------+
only showing top 5 rows



### Train model

In [8]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(
    featuresCol="features", labelCol="label")

# Split the data 80/20 into training and test sets. A fixed seed keeps the
# split the same from run to run on the same input, so the fitted model and
# the metrics computed from it can be compared across runs.
SPLIT_SEED = 42
train_df, test_df = mldata_df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
# select the features and label
train_df = train_df.select("features", "label")
test_df = test_df.select("features", "label")

# fit on the training split, then score the held-out test split
lrModel = lr.fit(train_df)
prediction_df = lrModel.transform(test_df)
prediction_df.show(5)

                                                                                

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(457,[0,208,448,4...|  0.0|[1.13703858405094...|[0.75713550464156...|       0.0|
|(457,[0,208,448,4...|  0.0|[1.13355746771912...|[0.75649481949053...|       0.0|
|(457,[0,208,448,4...|  0.0|[1.14052828416109...|[0.75777661940753...|       0.0|
|(457,[0,208,448,4...|  0.0|[1.14052923791424...|[0.75777679447004...|       0.0|
|(457,[0,208,448,4...|  0.0|[1.13355985210199...|[0.75649525871840...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



                                                                                

### Evaluate model

In [7]:
# Import evaluation metrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
# Area under the ROC curve, computed from the rawPrediction column
binary_evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol="label",
)
auc = binary_evaluator.evaluate(prediction_df)

# Accuracy, weighted precision/recall and F1 from a single evaluator; the
# metric name is switched before each evaluation, in this order
multi_evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
)
accuracy, precision, recall, f1 = (
    multi_evaluator.setMetricName(metric_name).evaluate(prediction_df)
    for metric_name in ("accuracy", "weightedPrecision", "weightedRecall", "f1")
)

# Training diagnostics exposed by the fitted model's summary
training_summary = lrModel.summary
print("Training Summary:")
print(f"Total Iterations: {training_summary.totalIterations}")
print(f"Objective History: {training_summary.objectiveHistory}")

# A sample of the scored test rows
print("\nPrediction DataFrame (5 rows):")
prediction_df.show(5)

# Fitted parameters
print(f"\nCoefficients: {lrModel.coefficients}")
print(f"Intercept: {lrModel.intercept}")

# Metrics computed on prediction_df
print("\nModel evaluation metrics:")
for metric_label, metric_value in (
    ("AUC", auc),
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_label}: {metric_value}")


                                                                                

Training Summary:
Total Iterations: 78
Objective History: [0.6118169123699311, 0.5656090059273127, 0.5393352917920784, 0.5370720724034894, 0.5365175135025272, 0.5361331161275766, 0.5361183902063451, 0.5360490525336632, 0.5360350780020593, 0.5360092827653771, 0.5359901859722301, 0.535981521216976, 0.5359625423245495, 0.5359602745085603, 0.5359584030325288, 0.5359561235042655, 0.5359545364907133, 0.5359537404765848, 0.5359525951217499, 0.5359514505465928, 0.5359509608796575, 0.5359491903841633, 0.5359487115375686, 0.5359481085984688, 0.5359472539256593, 0.5359462096660261, 0.5359446408571802, 0.535943283701738, 0.535941837185255, 0.5359390751448553, 0.5359371154916379, 0.5359347916961873, 0.5359333250066735, 0.5359320412748285, 0.5359313330786118, 0.5359285727147263, 0.5359274382298388, 0.5359263924136635, 0.535925311427409, 0.5359249200130667, 0.5359245886838612, 0.5359243730593968, 0.5359241721534737, 0.5359239340340979, 0.535923710559596, 0.5359234225492184, 0.5359230408605041, 0.5359

                                                                                