# SparkML Logistic Regression Model
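**Goal:** Build a logistic regression model on the green taxi trip data set (`green_tripdata_2024-01.parquet`) to predict payment type. Only payment types 1 and 2 are kept, so this is a binary classification problem.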


### Create a Spark Session

In [1]:
%pip install pyspark
from pyspark.sql import SparkSession

# create a spark session
spark = SparkSession \
    .builder \
    .appName("spark_job") \
    .getOrCreate()

sc = spark.sparkContext

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


25/03/27 04:28:34 WARN Utils: Your hostname, codespaces-35d966 resolves to a loopback address: 127.0.0.1; using 10.0.3.203 instead (on interface eth0)
25/03/27 04:28:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/27 04:28:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Load the input data file


In [2]:
# Relative path to the Parquet file holding the trip records.
input_path = '../data/green_tripdata_2024-01.parquet'

# Read the Parquet file into a Spark DataFrame.
raw_df = spark.read.parquet(input_path)

# Print the first rows as a quick check of the columns and values.
raw_df.show()


                                                                                

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2024-01-01 00:46:55|  2024-01-01 00:58:25|                 N|         1|         236|         239|              1|         1.98|       12.8|  1.0|    0.

### Clean and filter data

In [None]:
# Keep only trips whose payment_type is 1 or 2 so the label has exactly two
# classes, which the binary evaluator used later in this notebook expects.
# The previous placeholder condition '...' is not a valid SQL expression and
# made this cell fail.
input_df = raw_df.filter("payment_type IN (1, 2)")

### Define Functions to vectorize input data

In [None]:
from pyspark.ml import Pipeline, Estimator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.sql import DataFrame as SparkDataFrame

# Prepare data for machine learning
# from dataframe categorical and numeric columns create label and features

def vectorizeCategories(labelCol: str, categoricalColumns:list[str]) -> list[Estimator]:
  """Build the pipeline stages that encode the categorical features and the label.

  For every name in ``categoricalColumns`` two stages are produced, in order:
  a ``StringIndexer`` writing to ``<name>Index`` and a ``OneHotEncoder``
  reading that column and writing to ``<name>classVec``. A final
  ``StringIndexer`` maps ``labelCol`` to a column named ``label``.

  Args:
    labelCol: name of the column to index as the label.
    categoricalColumns: names of the categorical feature columns.

  Returns:
    The stages in the order they should run; the label indexer is last.
  """
  pipelineStages: list[Estimator] = []
  for columnName in categoricalColumns:
    indexedName = f"{columnName}Index"
    # Index the category values, then one-hot encode the resulting index column.
    indexer = StringIndexer(inputCol=columnName, outputCol=indexedName)
    oneHot = OneHotEncoder(inputCols=[indexedName], outputCols=[f"{columnName}classVec"])
    pipelineStages.extend((indexer, oneHot))
  # The label is indexed only (no one-hot encoding) into the "label" column.
  pipelineStages.append(StringIndexer(inputCol=labelCol, outputCol="label"))
  return pipelineStages

def createVectorizePipeline(labelCol: str, categoricalCols:list[str], numericCols:list[str]) -> Pipeline:
  """Create an unfitted Pipeline that produces ``label`` and ``features`` columns.

  The pipeline runs the stages returned by ``vectorizeCategories`` followed by
  a ``VectorAssembler`` that combines every ``<name>classVec`` column (one per
  categorical column) with the numeric columns into a single ``features``
  column.

  Args:
    labelCol: name of the column to predict.
    categoricalCols: names of the categorical feature columns.
    numericCols: names of the numeric feature columns, used as-is.

  Returns:
    A Pipeline with its stages set; it still has to be fitted by the caller.
  """
  encodingStages = vectorizeCategories(labelCol, categoricalCols)
  # Encoded categorical vectors come first, then the raw numeric columns.
  encodedCols = [f"{name}classVec" for name in categoricalCols]
  featureAssembler = VectorAssembler(inputCols=encodedCols + numericCols, outputCol="features")
  pipelineStages:list[Estimator | VectorAssembler] = [*encodingStages, featureAssembler]
  return Pipeline().setStages(pipelineStages) # type: ignore


### Select category to predict and input features

In [None]:
# label column is the feature to predict; the goal of this notebook is payment type
label_col = 'payment_type'
# Example feature selection built from columns present in the loaded data.
# The previous angle-bracket placeholders were not valid Python; adjust these
# lists to experiment with other inputs.
categorical_feature_cols = ['VendorID', 'RatecodeID', 'trip_type']
numeric_feature_cols = ['passenger_count', 'trip_distance', 'fare_amount', 'tip_amount', 'total_amount']
vectorizePipeline = createVectorizePipeline(label_col, categorical_feature_cols, numeric_feature_cols)

# Drop rows with a NULL in any selected column. StringIndexer and
# VectorAssembler are used with their default handleInvalid setting, which is
# expected to raise on NULL values -- TODO confirm whether the filtered data
# actually contains any.
model_input_df = input_df.dropna(subset=[label_col] + categorical_feature_cols + numeric_feature_cols)

# create vector dataframe
vectorizedModel = vectorizePipeline.fit(model_input_df)
vectorized_df = vectorizedModel.transform(model_input_df)
# keep the model columns plus the original columns they were derived from
selectedcols = ["label", "features"] + [label_col] + categorical_feature_cols + numeric_feature_cols
mldata_df = vectorized_df.select(selectedcols)


### Train model

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression over the assembled "features" vector and indexed "label".
lr = LogisticRegression(
    featuresCol="features", labelCol="label")

# split the data 80/20; the fixed seed keeps the split (and therefore the
# reported metrics) stable across reruns on the same input
train_df, test_df = mldata_df.randomSplit([0.8, 0.2], seed=42)
# select the features and label
train_df = train_df.select("features", "label")
test_df = test_df.select("features", "label")

# fit on the training split, then score the held-out test split
lrModel = lr.fit(train_df)
prediction_df = lrModel.transform(test_df)
prediction_df.show()

### Evaluate model

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the test-set predictions against the indexed "label" column.
# "areaUnderROC" is the evaluator's default metric, spelled out here so the
# displayed number is self-explanatory.
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
evaluator.evaluate(prediction_df)