In [0]:
// Location of the cleaned weather CSV.
val weatherFilePath = "weather-clean.csv"

// Read the weather CSV with a header row, multi-line field support, schema inference,
// and the double quote as the escape character.
val weatherDf = spark.read
  .options(Map(
    "header" -> "true",
    "multiLine" -> "true",
    "inferSchema" -> "true",
    "escape" -> "\""
  ))
  .csv(weatherFilePath)

weatherDf.printSchema()
weatherDf.show(5, false)

// Cleaned bus and subway ridership tables, stored as Parquet.
val busDF = spark.read.parquet("bus-clean.parquet")
val subwayDF = spark.read.parquet("subway-clean.parquet")

// Print the schema and a five-row preview of each ridership table, bus first.
Seq(busDF, subwayDF).foreach { ridership =>
  ridership.printSchema()
  ridership.show(5, false)
}

In [1]:
// Weather data with a single `datetime` timestamp column and cleaned-up column names.
val weatherDfWithTimestamp = {
  // Columns whose header in the CSV carries one leading space; each is renamed to the
  // same name without that space.
  val spacePrefixedColumns = Seq(
    "HeavyFog", "Thunder", "IcePellet", "Hail", "Glaze", "Dust", "Haze",
    "BlowingSnow", "Tornado", "HighWind", "BlowingSpray", "Mist", "Drizzle",
    "FreezingDrizzle", "Rain", "FreezingRain", "Snow", "UnknownPrecipitation",
    "GroundFog", "IceFog", "DailyPrecipitationValue", "DailySnowDepthValue",
    "DailySnowfallValue"
  )

  // Combine "Date" and "Hour_of_day" into one string, parse it as a timestamp, then
  // drop the helper column and the two source columns.
  val stamped = weatherDf
    .withColumn("datetime_string", concat(col("Date"), lit(" "), col("Hour_of_day")))
    .withColumn("datetime", to_timestamp(col("datetime_string"), "yyyy/M/d H"))
    .drop("datetime_string", "Date", "Hour_of_day")

  spacePrefixedColumns.foldLeft(stamped) { (df, name) =>
    df.withColumnRenamed(s" $name", name)
  }
}

weatherDfWithTimestamp.printSchema()
weatherDfWithTimestamp.show(5, false)

In [2]:
// Weather flag columns, in priority order: when several flags are set for the same
// hour, the one listed first becomes that hour's label.
val weatherTypes = Seq(
  "Fog", "HeavyFog", "Thunder", "IcePellet", "Hail", "Glaze", "Dust", "Haze",
  "BlowingSnow", "Tornado", "HighWind", "BlowingSpray", "Mist", "Drizzle",
  "FreezingDrizzle", "Rain", "FreezingRain", "Snow", "UnknownPrecipitation",
  "GroundFog", "IceFog"
)

// One row per weather record: timestamp, a single weather-type label, and the
// temperature / precipitation measures used by the later analyses.
val weathertypeDF = {
  // Each expression yields the flag's own name when the flag equals 1, otherwise null.
  val flagLabels = weatherTypes.map(flag => when(col(flag) === 1, flag))

  weatherDfWithTimestamp.select(
    col("datetime"),
    // First non-null label wins; the result is null when no flag equals 1.
    coalesce(flagLabels: _*).as("HourlyWeatherType"),
    col("HourlyDryBulbTemperature"),
    col("HourlyPrecipitationValue"),
    col("DailyPrecipitationValue")
  )
}
weathertypeDF.printSchema()
weathertypeDF.show(5, truncate = false)

// Bus ridership joined with the weather table on the shared `datetime` column.
// Inner join: timestamps present on only one side are dropped. Defined once and
// reused by every bus view below instead of repeating the join expression.
val busWithWeather = busDF
  .withColumnRenamed("transit_timestamp", "datetime")
  .join(weathertypeDF, Seq("datetime"), "inner")

// Hourly view: temperature (as double) next to ridership.
val Bus_Hourly_Temp_Ridership_DF = busWithWeather
  .select(
    col("datetime"),
    col("HourlyDryBulbTemperature").cast("double"),
    col("total_ridership")
  )

Bus_Hourly_Temp_Ridership_DF.printSchema()
Bus_Hourly_Temp_Ridership_DF.show(5, truncate = false)


// Hourly view: precipitation (as double) next to ridership.
val Bus_Hourly_Prec_Ridership_DF = busWithWeather
  .select(
    col("datetime"),
    col("HourlyPrecipitationValue").cast("double"),
    col("total_ridership")
  )

Bus_Hourly_Prec_Ridership_DF.printSchema()
Bus_Hourly_Prec_Ridership_DF.show(5, truncate = false)


// Hourly view: weather-type label next to ridership.
val Bus_Hourly_Weath_Ridership_DF = busWithWeather
  .select(
    col("datetime"),
    col("HourlyWeatherType"),
    col("total_ridership")
  )

Bus_Hourly_Weath_Ridership_DF.printSchema()
Bus_Hourly_Weath_Ridership_DF.show(5, false)


// Daily view: mean temperature and summed ridership per calendar date.
val Bus_Daily_Temp_Ridership_DF = busWithWeather
  .groupBy(to_date(col("datetime")).as("date"))
  .agg(
    avg(col("HourlyDryBulbTemperature").cast("double")).as("avg_temperature"),
    sum(col("total_ridership")).as("total_ridership")
  )
  .orderBy(col("date").asc)

Bus_Daily_Temp_Ridership_DF.printSchema()
Bus_Daily_Temp_Ridership_DF.show(5, truncate = false)


// Daily view: maximum daily precipitation value and summed ridership per calendar date.
// The precipitation column is cast to double before `max`, like the other weather
// measures in this cell: the CSV schema is inferred, and if the column is read as a
// string an uncast `max` compares text ("9.5" > "10.2") rather than numbers.
val Bus_Daily_Prec_Ridership_DF = busWithWeather
  .groupBy(to_date(col("datetime")).as("date"))
  .agg(
    max(col("DailyPrecipitationValue").cast("double")).as("daily_precipitation"),
    sum(col("total_ridership")).as("total_ridership")
  )
  .orderBy(col("date").asc)

Bus_Daily_Prec_Ridership_DF.printSchema()
Bus_Daily_Prec_Ridership_DF.show(5, truncate = false)



// Subway (metro) dataset

// Subway ridership joined with the weather table on the shared `datetime` column.
// Inner join: timestamps present on only one side are dropped. Defined once and
// reused by every metro view below instead of repeating the join expression.
val metroWithWeather = subwayDF
  .withColumnRenamed("transit_timestamp", "datetime")
  .join(weathertypeDF, Seq("datetime"), "inner")

// Hourly view: temperature (as double) next to ridership.
val Metro_Hourly_Temp_Ridership_DF = metroWithWeather
  .select(
    col("datetime"),
    col("HourlyDryBulbTemperature").cast("double"),
    col("total_ridership")
  )

Metro_Hourly_Temp_Ridership_DF.printSchema()
Metro_Hourly_Temp_Ridership_DF.show(5, truncate = false)


// Hourly view: precipitation (as double) next to ridership.
val Metro_Hourly_Prec_Ridership_DF = metroWithWeather
  .select(
    col("datetime"),
    col("HourlyPrecipitationValue").cast("double"),
    col("total_ridership")
  )

Metro_Hourly_Prec_Ridership_DF.printSchema()
Metro_Hourly_Prec_Ridership_DF.show(5, truncate = false)


// Hourly view: weather-type label next to ridership.
val Metro_Hourly_Weath_Ridership_DF = metroWithWeather
  .select(
    col("datetime"),
    col("HourlyWeatherType"),
    col("total_ridership")
  )

Metro_Hourly_Weath_Ridership_DF.printSchema()
Metro_Hourly_Weath_Ridership_DF.show(5, truncate = false)


// Daily view: mean temperature and summed ridership per calendar date.
val Metro_Daily_Temp_Ridership_DF = metroWithWeather
  .groupBy(to_date(col("datetime")).as("date"))
  .agg(
    avg(col("HourlyDryBulbTemperature").cast("double")).as("avg_temperature"),
    sum(col("total_ridership")).as("total_ridership")
  )
  .orderBy(col("date").asc)

Metro_Daily_Temp_Ridership_DF.printSchema()
Metro_Daily_Temp_Ridership_DF.show(5, truncate = false)


// Daily view: maximum daily precipitation value and summed ridership per calendar date.
// The precipitation column is cast to double before `max`, like the other weather
// measures in this cell: the CSV schema is inferred, and if the column is read as a
// string an uncast `max` compares text ("9.5" > "10.2") rather than numbers.
val Metro_Daily_Prec_Ridership_DF = metroWithWeather
  .groupBy(to_date(col("datetime")).as("date"))
  .agg(
    max(col("DailyPrecipitationValue").cast("double")).as("daily_precipitation"),
    sum(col("total_ridership")).as("total_ridership")
  )
  .orderBy(col("date").asc)

Metro_Daily_Prec_Ridership_DF.printSchema()
Metro_Daily_Prec_Ridership_DF.show(5, truncate = false)

In [3]:
// Bus EDA
// Descriptive statistics and value distribution of the hourly dry-bulb temperature
// in the bus/weather hourly table.
import org.apache.spark.sql.DataFrame

// Single-row summary: min, max, mean, standard deviation, skewness and kurtosis.
val temperatureStats = {
  val temperature = col("HourlyDryBulbTemperature")
  Bus_Hourly_Temp_Ridership_DF.select(
    min(temperature).as("min_temperature"),
    max(temperature).as("max_temperature"),
    avg(temperature).as("avg_temperature"),
    stddev(temperature).as("stddev_temperature"),
    skewness(temperature).as("skewness_temperature"),
    kurtosis(temperature).as("kurtosis_temperature")
  )
}

temperatureStats.show(20, truncate = false)

// Number of rows observed at each distinct temperature, coldest first.
val temperatureDistribution = Bus_Hourly_Temp_Ridership_DF
  .groupBy("HourlyDryBulbTemperature")
  .count()
  .sort(col("HourlyDryBulbTemperature").asc)

temperatureDistribution.show(20, truncate = false)
println("temperatureDistribution DataFrame:")
z.show(temperatureDistribution)

In [4]:
// Bus EDA
// Ridership compared across weather types: mean, minimum, maximum and number of
// ridership values per HourlyWeatherType, highest mean first. Rows with a null
// weather type form their own group.
val weatherRidershipAnalysis = {
  val ridership = col("total_ridership")
  Bus_Hourly_Weath_Ridership_DF
    .groupBy("HourlyWeatherType")
    .agg(
      avg(ridership).as("avg_ridership"),
      min(ridership).as("min_ridership"),
      max(ridership).as("max_ridership"),
      count(ridership).as("count")
    )
    .sort(desc("avg_ridership"))
}

weatherRidershipAnalysis.printSchema()
weatherRidershipAnalysis.show(truncate = false)

println("weatherRidershipAnalysis DataFrame:")
z.show(weatherRidershipAnalysis)

In [5]:
// Pass the hourly bus temperature/ridership table to the notebook display (`z.show`)
// so ridership can be charted by hour and date, e.g. as a line chart.
Console.println("Bus_Hourly_Temp_Ridership_DF DataFrame:")
z.show(Bus_Hourly_Temp_Ridership_DF)

In [6]:
// Pass the daily bus temperature/ridership table to the notebook display (`z.show`)
// so ridership can be charted by date, e.g. as a line chart.
Console.println("Bus_Daily_Temp_Ridership_DF DataFrame:")
z.show(Bus_Daily_Temp_Ridership_DF)

In [7]:
// Metro EDA
// Ridership compared across weather types: mean, minimum, maximum and number of
// ridership values per HourlyWeatherType, highest mean first. Rows with a null
// weather type form their own group.
val weatherRidershipAnalysis = {
  val ridership = col("total_ridership")
  Metro_Hourly_Weath_Ridership_DF
    .groupBy("HourlyWeatherType")
    .agg(
      avg(ridership).as("avg_ridership"),
      min(ridership).as("min_ridership"),
      max(ridership).as("max_ridership"),
      count(ridership).as("count")
    )
    .sort(desc("avg_ridership"))
}

weatherRidershipAnalysis.printSchema()
weatherRidershipAnalysis.show(truncate = false)

println("weatherRidershipAnalysis DataFrame:")
z.show(weatherRidershipAnalysis)

In [8]:
// Pass the hourly metro temperature/ridership table to the notebook display (`z.show`)
// so ridership can be charted by hour and date, e.g. as a line chart.
Console.println("Metro_Hourly_Temp_Ridership_DF DataFrame:")
z.show(Metro_Hourly_Temp_Ridership_DF)

In [9]:
// Pass the daily metro temperature/ridership table to the notebook display (`z.show`)
// so ridership can be charted by date, e.g. as a line chart.
Console.println("Metro_Daily_Temp_Ridership_DF DataFrame:")
z.show(Metro_Daily_Temp_Ridership_DF)

In [10]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.functions._

// Bus: single-feature linear regression of hourly ridership on dry-bulb temperature,
// followed by the correlation between the same two columns.

// Keep only rows usable for fitting: the feature must be an actual number (isNotNull
// alone lets NaN through) and the label must be present. The correlation at the end
// is computed over these same rows.
val filteredData = Bus_Hourly_Temp_Ridership_DF.filter(
  col("HourlyDryBulbTemperature").isNotNull &&
    !isnan(col("HourlyDryBulbTemperature")) &&
    col("total_ridership").isNotNull
)

// "skip" drops any row the assembler still considers invalid. The previous "keep"
// setting emits NaN feature values instead, which would carry into the fitted model.
val assembler = new VectorAssembler()
  .setInputCols(Array("HourlyDryBulbTemperature"))
  .setOutputCol("features")
  .setHandleInvalid("skip")

// Rename ridership to "label" so the estimator and evaluator below find it.
val dataWithFeatures = assembler.transform(filteredData)
  .select(col("total_ridership").alias("label"), col("features"))

// 80/20 train/test split with a fixed seed.
val Array(trainingData, testData) = dataWithFeatures.randomSplit(Array(0.8, 0.2), seed = 42)

val lr = new LinearRegression()

val lrModel = lr.fit(trainingData)

// Score the held-out split and report R-squared.
val predictions = lrModel.transform(testData)
val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("r2")

val r2 = evaluator.evaluate(predictions)
println(s"R-squared = $r2")

println(s"Coefficients: ${lrModel.coefficients}")
println(s"Intercept: ${lrModel.intercept}")

val correlation = filteredData.stat.corr("HourlyDryBulbTemperature", "total_ridership")
println(s"Correlation coefficient: $correlation")

In [11]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.functions._
// Bus: single-feature linear regression of hourly ridership on hourly precipitation,
// followed by the correlation between the same two columns.

// Keep only rows usable for fitting: the feature must be an actual number (isNotNull
// alone lets NaN through) and the label must be present. The correlation at the end
// is computed over these same rows.
val filteredData = Bus_Hourly_Prec_Ridership_DF.filter(
  col("HourlyPrecipitationValue").isNotNull &&
    !isnan(col("HourlyPrecipitationValue")) &&
    col("total_ridership").isNotNull
)

// "skip" drops any row the assembler still considers invalid. The previous "keep"
// setting emits NaN feature values instead, which would carry into the fitted model.
val assembler = new VectorAssembler()
  .setInputCols(Array("HourlyPrecipitationValue"))
  .setOutputCol("features")
  .setHandleInvalid("skip")

// Rename ridership to "label" so the estimator and evaluator below find it.
val dataWithFeatures = assembler.transform(filteredData)
  .select(col("total_ridership").alias("label"), col("features"))

// 80/20 train/test split with a fixed seed.
val Array(trainingData, testData) = dataWithFeatures.randomSplit(Array(0.8, 0.2), seed = 42)

val lr = new LinearRegression()

val lrModel = lr.fit(trainingData)

// Score the held-out split and report R-squared.
val predictions = lrModel.transform(testData)
val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("r2")

val r2 = evaluator.evaluate(predictions)
println(s"R-squared = $r2")

println(s"Coefficients: ${lrModel.coefficients}")
println(s"Intercept: ${lrModel.intercept}")

val correlation = filteredData.stat.corr("HourlyPrecipitationValue", "total_ridership")
println(s"Correlation coefficient: $correlation")

In [12]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.functions._
// Metro: single-feature linear regression of hourly ridership on dry-bulb temperature,
// followed by the correlation between the same two columns.

// Keep only rows usable for fitting: the feature must be an actual number (isNotNull
// alone lets NaN through) and the label must be present. The correlation at the end
// is computed over these same rows.
val filteredData = Metro_Hourly_Temp_Ridership_DF.filter(
  col("HourlyDryBulbTemperature").isNotNull &&
    !isnan(col("HourlyDryBulbTemperature")) &&
    col("total_ridership").isNotNull
)

// "skip" drops any row the assembler still considers invalid. The previous "keep"
// setting emits NaN feature values instead, which would carry into the fitted model.
val assembler = new VectorAssembler()
  .setInputCols(Array("HourlyDryBulbTemperature"))
  .setOutputCol("features")
  .setHandleInvalid("skip")

// Rename ridership to "label" so the estimator and evaluator below find it.
val dataWithFeatures = assembler.transform(filteredData)
  .select(col("total_ridership").alias("label"), col("features"))

// 80/20 train/test split with a fixed seed.
val Array(trainingData, testData) = dataWithFeatures.randomSplit(Array(0.8, 0.2), seed = 42)

val lr = new LinearRegression()

val lrModel = lr.fit(trainingData)

// Score the held-out split and report R-squared.
val predictions = lrModel.transform(testData)
val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("r2")

val r2 = evaluator.evaluate(predictions)
println(s"R-squared = $r2")

println(s"Coefficients: ${lrModel.coefficients}")
println(s"Intercept: ${lrModel.intercept}")

val correlation = filteredData.stat.corr("HourlyDryBulbTemperature", "total_ridership")
println(s"Correlation coefficient: $correlation")

In [13]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.functions._
// Metro: single-feature linear regression of hourly ridership on hourly precipitation,
// followed by the correlation between the same two columns.

// Keep only rows usable for fitting: the feature must be an actual number (isNotNull
// alone lets NaN through) and the label must be present. The correlation at the end
// is computed over these same rows.
val filteredData = Metro_Hourly_Prec_Ridership_DF.filter(
  col("HourlyPrecipitationValue").isNotNull &&
    !isnan(col("HourlyPrecipitationValue")) &&
    col("total_ridership").isNotNull
)
// z.show(filteredData)

// "skip" drops any row the assembler still considers invalid. The previous "keep"
// setting emits NaN feature values instead, which would carry into the fitted model.
val assembler = new VectorAssembler()
  .setInputCols(Array("HourlyPrecipitationValue"))
  .setOutputCol("features")
  .setHandleInvalid("skip")

// Rename ridership to "label" so the estimator and evaluator below find it.
val dataWithFeatures = assembler.transform(filteredData)
  .select(col("total_ridership").alias("label"), col("features"))

// 80/20 train/test split with a fixed seed.
val Array(trainingData, testData) = dataWithFeatures.randomSplit(Array(0.8, 0.2), seed = 42)

val lr = new LinearRegression()

val lrModel = lr.fit(trainingData)

// Score the held-out split and report R-squared.
val predictions = lrModel.transform(testData)
val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("r2")

val r2 = evaluator.evaluate(predictions)
println(s"R-squared = $r2")

println(s"Coefficients: ${lrModel.coefficients}")
println(s"Intercept: ${lrModel.intercept}")

val correlation = filteredData.stat.corr("HourlyPrecipitationValue", "total_ridership")
println(s"Correlation coefficient: $correlation")

In [14]:
import org.apache.spark.sql.functions._
// Bus: ridership statistics per weather type, lowest mean first. Hours with a null
// weather type are labelled "Clear" before grouping.
val weatherTypeStats = {
  val ridership = col("total_ridership")
  Bus_Hourly_Weath_Ridership_DF
    .na.fill("Clear", Seq("HourlyWeatherType"))
    .groupBy(col("HourlyWeatherType"))
    .agg(
      avg(ridership).as("mean_ridership"),
      min(ridership).as("min_ridership"),
      max(ridership).as("max_ridership"),
      stddev(ridership).as("stddev_ridership")
    )
    .orderBy(col("mean_ridership").asc)
}

println("Weather type statistics:")
weatherTypeStats.show()

In [15]:
import org.apache.spark.sql.functions._
// Metro: ridership statistics per weather type, lowest mean first. Hours with a null
// weather type are labelled "Clear" before grouping.
val weatherTypeStats = {
  val ridership = col("total_ridership")
  Metro_Hourly_Weath_Ridership_DF
    .na.fill("Clear", Seq("HourlyWeatherType"))
    .groupBy(col("HourlyWeatherType"))
    .agg(
      avg(ridership).as("mean_ridership"),
      min(ridership).as("min_ridership"),
      max(ridership).as("max_ridership"),
      stddev(ridership).as("stddev_ridership")
    )
    .orderBy(col("mean_ridership").asc)
}

println("Weather type statistics:")
weatherTypeStats.show()