In [None]:
// Load the main MTA hourly subway ridership CSV into a DataFrame.

val filePath = "bdad_proj/MTA_Subway_Hourly_Ridership__Beginning_February_2022_202404.csv"

// Reader settings: first row is a header, quoted values may span several lines,
// column types are inferred, and a double quote is the escape character.
val MTADf = spark.read
  .options(Map(
    "header" -> "true",
    "multiLine" -> "true",
    "inferSchema" -> "true",
    "escape" -> "\""
  ))
  .csv(filePath)

// Render the loaded data in the notebook.
z.show(MTADf)

In [1]:
// Pull the coordinate columns out of the main MTA dataset; these are the points
// that later get matched to zipcodes.

val columnsToSelect = Seq("latitude", "longitude")

// Keep one row per unique (latitude, longitude) pair.
val LatLngDf = {
  val coordinates = MTADf.select(columnsToSelect.head, columnsToSelect.tail: _*)
  coordinates.distinct()
}

z.show(LatLngDf)

In [2]:
// Load the US zipcode reference file (zipcodes with their latitude/longitude).
val filePath_USZip = "bdad_proj/USZipCodes.csv"

val USZipDf = {
  // Same CSV settings as the other inputs: header row, multi-line quoted values,
  // inferred column types, double quote as the escape character.
  val csvReader = spark.read
    .option("escape", "\"")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("header", "true")
  csvReader.csv(filePath_USZip)
}

// Print the row count, then render the data in the notebook.
println(USZipDf.count())

z.show(USZipDf)

In [3]:
// Load the list of New York City zipcodes. It is used to cut the US-wide zipcode
// DataFrame down to the NYC entries only.

val zip_filePath = "bdad_proj/NYCZipCodes.csv"

val ZipDf = spark.read
  .format("csv")
  .option("header", "true")
  .option("multiLine", "true")
  .option("inferSchema", "true")
  .option("escape", "\"")
  .load(zip_filePath)

// Render the data in the notebook, then print the row count.
z.show(ZipDf)
println(ZipDf.count())


In [4]:

// Keep only the US zipcode rows whose ZIP appears in the NYC zipcode list (inner join).
val joinedDF = {
  val zipMatches = USZipDf("ZIP") === ZipDf("ZIPCODES")
  USZipDf.join(ZipDf, zipMatches, "inner")
}

// Only the zipcode and its coordinates are needed downstream.
val result_Zip_DF = joinedDF.select("ZIP", "LAT", "LNG")

// Render the reduced lookup table and print its row count.
z.show(result_Zip_DF)
println(result_Zip_DF.count())

In [5]:
// Drop the columns that are not used further on. Note that this includes the
// ridership and transfers counts as well as the geographic helper columns.

val filteredMTADf = {
  val unusedColumns = Seq(
    "transit_mode",
    "ridership",
    "transfers",
    "Counties",
    "NYS Municipal Boundaries",
    "Georeference",
    "New York Zip Codes"
  )
  MTADf.drop(unusedColumns: _*)
}
z.show(filteredMTADf.limit(50))

In [6]:
// Parse the raw timestamp strings, then derive a separate date column and a
// separate time-of-day column from the parsed value.

import org.apache.spark.sql.functions.{to_date, date_format, to_timestamp}


val filteredMTADf_1 = {
  // Pattern used to parse the transit_timestamp column.
  val inputPattern = "MM/dd/yyyy hh:mm:ss a"
  // Pattern used to render the time-of-day column (12-hour clock with AM/PM).
  val timePattern = "hh:mm:ss a"

  val withParsedTimestamp =
    filteredMTADf.withColumn("transit_timestamp", to_timestamp($"transit_timestamp", inputPattern))
  val withDate =
    withParsedTimestamp.withColumn("transit_date", to_date($"transit_timestamp"))
  withDate.withColumn("transit_time", date_format($"transit_timestamp", timePattern))
}

z.show(filteredMTADf_1.limit(100))
            

In [7]:
// Using the Haversine formula to generate the zipcodes associated with the unique lats and longs present in the main dataset, using the US zipcodes dataset as a reference

import org.apache.spark.sql.expressions.Window

/**
 * Great-circle distance between two points, computed with the haversine formula.
 *
 * The result is scaled by 6371 (Earth's mean radius in kilometres), so it is a
 * distance in kilometres. The value is returned with its fractional part intact;
 * truncating it to a whole number would make any sub-kilometre threshold
 * (such as a 0.5 radius) behave like a 1 km threshold.
 *
 * @param lat1 latitude of the first point, in degrees
 * @param lon1 longitude of the first point, in degrees
 * @param lat2 latitude of the second point, in degrees
 * @param lon2 longitude of the second point, in degrees
 * @return the distance between the two points, in kilometres
 */
def haversine(lat1: Double, lon1: Double, lat2: Double, lon2: Double): Double = {
    val earthRadiusKm = 6371.0

    val sinHalfLat = math.sin(math.toRadians(lat1 - lat2) / 2)
    val sinHalfLng = math.sin(math.toRadians(lon1 - lon2) / 2)

    val rawA = sinHalfLat * sinHalfLat +
        math.cos(math.toRadians(lat1)) *
        math.cos(math.toRadians(lat2)) *
        sinHalfLng * sinHalfLng

    // Clamp to [0, 1]: floating-point rounding can push the value marginally out of
    // range, which would make sqrt(1 - a) return NaN.
    val a = math.min(1.0, math.max(0.0, rawA))

    val c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    earthRadiusKm * c
}

// `struct` is needed for the nearest-by-distance aggregation below.
import org.apache.spark.sql.functions.struct

// Spark UDF wrapper around `haversine`. Despite its name (kept so existing references
// keep working) it returns the distance between two coordinates, not a zipcode.
val getNearestZipCodes = udf((lat1: Double, lon1: Double, lat2: Double, lon2: Double) =>
    haversine(lat1, lon1, lat2, lon2)
)

// Maximum allowed distance between a station coordinate and a zipcode coordinate,
// in the same unit that `haversine` returns.
val radius = 0.5

// The NYC zipcode lookup is marked for broadcast so the cross join below does not
// shuffle the coordinate rows.
val broadcastUSZipDf = broadcast(result_Zip_DF)

// Pair every distinct station coordinate with every NYC zipcode coordinate, keep the
// pairs within `radius`, and pick the closest zipcode for each station coordinate.
// min over struct(distance, ZIP) compares distance first and ZIP second, so the
// result is the zipcode with the smallest distance (ties go to the smaller ZIP).
// A plain min(ZIP) would return the numerically smallest zipcode instead of the nearest.
val result = LatLngDf.crossJoin(broadcastUSZipDf)
    .withColumn("distance", getNearestZipCodes($"LAT", $"LNG", $"latitude", $"longitude"))
    .filter($"distance" < radius)
    .groupBy($"latitude", $"longitude")
    .agg(min(struct($"distance", $"ZIP")).getField("ZIP").as("nearest_zipcode"))



result.show()




In [8]:
// Left-joining the cleaned MTA data with the per-coordinate zipcode lookup, so every
// MTA row is kept even when no zipcode was found for its coordinates (nearest_zipcode
// is null for those rows).
// The join is expressed on column names rather than an equality expression so the
// output carries a single latitude/longitude pair instead of two identically named
// copies. Those join columns appear first in the output and are dropped in a later cell.
val joinedDF = filteredMTADf_1.join(result, Seq("latitude", "longitude"), "left")

// Show the resulting DataFrame
z.show(joinedDF)


    

In [9]:
// The coordinates have served their purpose (zipcode lookup), so remove them
// before the dataset is saved for later analytics.

val dfWithoutDuplicates = {
  val coordinateColumns = Seq("latitude", "longitude")
  joinedDF.drop(coordinateColumns: _*)
}
z.show(dfWithoutDuplicates)

In [10]:
// Saving the final cleaned dataset into hdfs

val outputPath = "hdfs:///user/vp2359_nyu_edu/bdad_proj/ZipCodedLatsLongs.csv"

// NOTE(review): no save mode is set, so Spark's default applies; re-running this cell
// against an existing output path will presumably fail - confirm that is intended.
dfWithoutDuplicates.write
.option("header", "true") // Include column headers
.option("escape", "\"") // Same escape character the notebook uses when reading CSVs, so quoted values round-trip
.csv(outputPath)

In [11]:
// Load the saved, cleaned dataset back into Zeppelin by reading the part files
// under the output directory.

val MTA_Final_Df = spark.read
  .options(Map(
    "header" -> "true",
    "multiLine" -> "true",
    "inferSchema" -> "true",
    "escape" -> "\""
  ))
  .csv("bdad_proj/ZipCodedLatsLongs.csv/part-*.csv")

In [12]:

// Give the zipcode column a shorter name for the analysis that follows.
val MTA_Final = {
  val oldName = "nearest_zipcode"
  val newName = "zipcode"
  MTA_Final_Df.withColumnRenamed(oldName, newName)
}
z.show(MTA_Final.limit(10))

In [13]:
// Profiling dataset

// Number of rows per borough (a row count, not a sum of ridership values).
val groupByBorough = {
  val rowsByBorough = MTA_Final.groupBy("borough")
  rowsByBorough.count()
}
z.show(groupByBorough)

In [14]:
// Total number of rows in the final dataset

locally {
  val totalRows = MTA_Final.count()
  println(totalRows)
}