In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count as pyspark_count, when, isnan


In [5]:
spark = SparkSession.builder \
    .appName("Traffic Data Transformation") \
    .config("spark.hadoop.fs.defaultFS", "file:///") \
    .getOrCreate()


In [6]:
data_path='Automated_Traffic_Volume_Counts_20241218.csv'
data_df=spark.read.option('header','true').csv(data_path)

In [7]:
data_df.printSchema()

root
 |-- RequestID: string (nullable = true)
 |-- Boro: string (nullable = true)
 |-- Yr: string (nullable = true)
 |-- M: string (nullable = true)
 |-- D: string (nullable = true)
 |-- HH: string (nullable = true)
 |-- MM: string (nullable = true)
 |-- Vol: string (nullable = true)
 |-- SegmentID: string (nullable = true)
 |-- WktGeom: string (nullable = true)
 |-- street: string (nullable = true)
 |-- fromSt: string (nullable = true)
 |-- toSt: string (nullable = true)
 |-- Direction: string (nullable = true)



In [8]:
data_df.show(5)

+---------+------+----+---+---+---+---+---+---------+--------------------+--------------+--------------------+--------+---------+
|RequestID|  Boro|  Yr|  M|  D| HH| MM|Vol|SegmentID|             WktGeom|        street|              fromSt|    toSt|Direction|
+---------+------+----+---+---+---+---+---+---------+--------------------+--------------+--------------------+--------+---------+
|    32970|Queens|2021|  4| 30|  2|  0|  0|   149701|POINT (997407.099...|PULASKI BRIDGE|Newtown Creek Sho...|Dead end|       NB|
|    32970|Queens|2021|  4| 30|  2| 15|  1|   149701|POINT (997407.099...|PULASKI BRIDGE|Newtown Creek Sho...|Dead end|       NB|
|    32970|Queens|2021|  4| 30|  2| 30|  0|   149701|POINT (997407.099...|PULASKI BRIDGE|Newtown Creek Sho...|Dead end|       NB|
|    32970|Queens|2021|  4| 30|  2| 45|  0|   149701|POINT (997407.099...|PULASKI BRIDGE|Newtown Creek Sho...|Dead end|       NB|
|    32970|Queens|2021|  4| 30|  3|  0|  1|   149701|POINT (997407.099...|PULASKI BRIDGE|N

In [9]:
data_count=data_df.count()
print('Total Records:',data_count)

Total Records: 1712605


In [10]:
from pyspark.sql.functions import col, concat_ws


In [11]:
data_df=data_df.withColumn('Date',concat_ws('-',col('Yr'),col('M'),col('D')))


In [12]:
data_df=data_df.withColumn('Time',concat_ws(':',col("HH"),col("MM")))

In [13]:
data_df = data_df.drop("Yr", "M", "D", "HH", "MM",'Data') 

In [14]:
data_df.show(5)

+---------+------+---+---------+--------------------+--------------+--------------------+--------+---------+---------+----+
|RequestID|  Boro|Vol|SegmentID|             WktGeom|        street|              fromSt|    toSt|Direction|     Date|Time|
+---------+------+---+---------+--------------------+--------------+--------------------+--------+---------+---------+----+
|    32970|Queens|  0|   149701|POINT (997407.099...|PULASKI BRIDGE|Newtown Creek Sho...|Dead end|       NB|2021-4-30| 2:0|
|    32970|Queens|  1|   149701|POINT (997407.099...|PULASKI BRIDGE|Newtown Creek Sho...|Dead end|       NB|2021-4-30|2:15|
|    32970|Queens|  0|   149701|POINT (997407.099...|PULASKI BRIDGE|Newtown Creek Sho...|Dead end|       NB|2021-4-30|2:30|
|    32970|Queens|  0|   149701|POINT (997407.099...|PULASKI BRIDGE|Newtown Creek Sho...|Dead end|       NB|2021-4-30|2:45|
|    32970|Queens|  1|   149701|POINT (997407.099...|PULASKI BRIDGE|Newtown Creek Sho...|Dead end|       NB|2021-4-30| 3:0|
+-------

In [15]:
data_df=data_df.orderBy(col("Date").desc())

In [16]:
data_df.show(10)

+---------+--------+---+---------+--------------------+------------------+---------------+--------------------+---------+--------+-----+
|RequestID|    Boro|Vol|SegmentID|             WktGeom|            street|         fromSt|                toSt|Direction|    Date| Time|
+---------+--------+---+---------+--------------------+------------------+---------------+--------------------+---------+--------+-----+
|    37697|Brooklyn|215|    28962|POINT (990590.819...|   FLATBUSH AVENUE|Atlantic Avenue|Eastern Parkway Line|       SB|2024-6-9|11:15|
|    37697|Brooklyn|199|    28962|POINT (990590.819...|   FLATBUSH AVENUE|Atlantic Avenue|Eastern Parkway Line|       NB|2024-6-9|  0:0|
|    37699|  Queens|126|    75814|POINT (1012773.57...|NORTHERN BOULEVARD|      69 Street|           70 Street|       EB|2024-6-9|  0:0|
|    37697|Brooklyn|108|    28962|POINT (990590.819...|   FLATBUSH AVENUE|Atlantic Avenue|Eastern Parkway Line|       NB|2024-6-9| 2:30|
|    37697|Brooklyn|240|    28962|POINT (

In [17]:
null_counts = data_df.select([count(when(col(c).isNull() | isnan(c), c)).alias(c) for c in data_df.columns])


NameError: name 'count' is not defined

In [61]:

null_counts = data_df.select([pyspark_count(when(col(c).isNull() | isnan(c), c)).alias(c) for c in data_df.columns])


In [62]:
null_counts.show()

+---------+----+---+---------+-------+------+------+----+---------+----+----+
|RequestID|Boro|Vol|SegmentID|WktGeom|street|fromSt|toSt|Direction|Date|Time|
+---------+----+---+---------+-------+------+------+----+---------+----+----+
|        0|   0|  0|        0|      0|     0|     0|1246|        0|   0|   0|
+---------+----+---+---------+-------+------+------+----+---------+----+----+



In [63]:
data_df=data_df.filter(col("toSt").isNotNull())

In [68]:
import os
import shutil
import datetime

In [83]:
data_df.coalesce(1) \
  .write \
  .option("header", "true") \
  .csv("C:\Projects\Advance_Traffic_prediction_NY\local")

  .csv("C:\Projects\Advance_Traffic_prediction_NY\local")
  .csv("C:\Projects\Advance_Traffic_prediction_NY\local")


NameError: name 'df' is not defined