In [3]:
from pyspark.sql import SparkSession
spark= SparkSession.builder.appName('Practice').getOrCreate()

In [5]:
df= spark.read.csv('JC-202503-citibike-tripdata (1).csv', header= True)
df.show()

+----------------+-------------+--------------------+--------------------+------------------+----------------+--------------------+--------------+------------------+------------------+------------------+------------------+-------------+
|         ride_id|rideable_type|          started_at|            ended_at|start_station_name|start_station_id|    end_station_name|end_station_id|         start_lat|         start_lng|           end_lat|           end_lng|member_casual|
+----------------+-------------+--------------------+--------------------+------------------+----------------+--------------------+--------------+------------------+------------------+------------------+------------------+-------------+
|29DAF43DD84B4B7A|electric_bike|2025-03-20 18:58:...|2025-03-20 19:00:...|   6 St & Grand St|           HB302|Mama Johnson Fiel...|         HB404|40.744397833095604|-74.03450086712837| 40.74313993965626|-74.04004096984863|       member|
|B11B4220F7195025|electric_bike|2025-03-29 11:01:...

In [6]:
df.printSchema()

root
 |-- ride_id: string (nullable = true)
 |-- rideable_type: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- ended_at: string (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- start_lat: string (nullable = true)
 |-- start_lng: string (nullable = true)
 |-- end_lat: string (nullable = true)
 |-- end_lng: string (nullable = true)
 |-- member_casual: string (nullable = true)



In [25]:
from pyspark.sql.types import StructType, StructField, DecimalType, IntegerType, StringType, TimestampType,DateType

from pyspark.sql.functions import lit

In [26]:
schema= StructType(
                   [
                    StructField('ride_id', StringType(),True),
                    StructField('rideable_type',StringType(),True),
                    StructField('started_at', TimestampType(),True),
                    StructField('ended_at',TimestampType(),True),
                    StructField('start_station_name',StringType(),True),
                    StructField('start_station_id',StringType(),True),
                    StructField('end_station_name',StringType(),True),
                    StructField('end_station_id',StringType(),True),
                    StructField('start_lat',DecimalType(),True),
                    StructField('start_lng',DecimalType(),True),
                    StructField('end_lat',DecimalType(),True),
                    StructField('end_lng',DecimalType(),True),
                    StructField('member_casual', StringType(), True)

                   ]
)

In [27]:
df= spark.read.csv('JC-202503-citibike-tripdata (1).csv', schema=schema, header= True)

In [28]:
df.show()
df.printSchema()

+----------------+-------------+--------------------+--------------------+------------------+----------------+--------------------+--------------+---------+---------+-------+-------+-------------+
|         ride_id|rideable_type|          started_at|            ended_at|start_station_name|start_station_id|    end_station_name|end_station_id|start_lat|start_lng|end_lat|end_lng|member_casual|
+----------------+-------------+--------------------+--------------------+------------------+----------------+--------------------+--------------+---------+---------+-------+-------+-------------+
|29DAF43DD84B4B7A|electric_bike|2025-03-20 18:58:...|2025-03-20 19:00:...|   6 St & Grand St|           HB302|Mama Johnson Fiel...|         HB404|       41|      -74|     41|    -74|       member|
|B11B4220F7195025|electric_bike|2025-03-29 11:01:...|2025-03-29 11:11:...|  Heights Elevator|           JC059|        Jersey & 3rd|         JC074|       41|      -74|     41|    -74|       member|
|18D5B30305F602

In [29]:
df.write.format('csv').mode('overwrite').save('bronze_jc_citibike')

In [30]:
df1= spark.read.format('csv').option('header','True').load('/content/JC-202503-citibike-tripdata (1).csv')

df1.show()

+----------------+-------------+--------------------+--------------------+------------------+----------------+--------------------+--------------+------------------+------------------+------------------+------------------+-------------+
|         ride_id|rideable_type|          started_at|            ended_at|start_station_name|start_station_id|    end_station_name|end_station_id|         start_lat|         start_lng|           end_lat|           end_lng|member_casual|
+----------------+-------------+--------------------+--------------------+------------------+----------------+--------------------+--------------+------------------+------------------+------------------+------------------+-------------+
|29DAF43DD84B4B7A|electric_bike|2025-03-20 18:58:...|2025-03-20 19:00:...|   6 St & Grand St|           HB302|Mama Johnson Fiel...|         HB404|40.744397833095604|-74.03450086712837| 40.74313993965626|-74.04004096984863|       member|
|B11B4220F7195025|electric_bike|2025-03-29 11:01:...

In [61]:
from pyspark.sql.functions import to_date, col, date_format, to_timestamp,count,aggregate,min,max,avg


In [43]:
df2= df1.withColumn('started_at', to_timestamp('started_at'))\
        .withColumn('ended_at', to_timestamp('ended_at'))\
      .withColumn('trip_start_date', to_date(col('started_at')))\
    .withColumn('trip_duration_mins', (col('ended_at').cast('long')-col('started_at').cast('long'))/60)\
    .withColumn('trip_duration_mins', col('trip_duration_mins').cast(DecimalType(10,2)))



In [44]:
df2.show()

+----------------+-------------+--------------------+--------------------+------------------+----------------+--------------------+--------------+------------------+------------------+------------------+------------------+-------------+---------------+------------------+
|         ride_id|rideable_type|          started_at|            ended_at|start_station_name|start_station_id|    end_station_name|end_station_id|         start_lat|         start_lng|           end_lat|           end_lng|member_casual|trip_start_date|trip_duration_mins|
+----------------+-------------+--------------------+--------------------+------------------+----------------+--------------------+--------------+------------------+------------------+------------------+------------------+-------------+---------------+------------------+
|29DAF43DD84B4B7A|electric_bike|2025-03-20 18:58:...|2025-03-20 19:00:...|   6 St & Grand St|           HB302|Mama Johnson Fiel...|         HB404|40.744397833095604|-74.03450086712837|

In [47]:
df2.select('ride_id', 'trip_start_date','started_at', 'ended_at','start_station_name','end_station_name','trip_duration_mins').show()

+----------------+---------------+--------------------+--------------------+------------------+--------------------+------------------+
|         ride_id|trip_start_date|          started_at|            ended_at|start_station_name|    end_station_name|trip_duration_mins|
+----------------+---------------+--------------------+--------------------+------------------+--------------------+------------------+
|29DAF43DD84B4B7A|     2025-03-20|2025-03-20 18:58:...|2025-03-20 19:00:...|   6 St & Grand St|Mama Johnson Fiel...|              2.25|
|B11B4220F7195025|     2025-03-29|2025-03-29 11:01:...|2025-03-29 11:11:...|  Heights Elevator|        Jersey & 3rd|              9.73|
|18D5B30305F602B9|     2025-03-01|2025-03-01 16:05:...|2025-03-01 16:07:...|      Jersey & 3rd|       Hamilton Park|              2.18|
|532EB2D9DB68567D|     2025-03-21|2025-03-21 18:44:...|2025-03-21 18:51:...|      Jersey & 3rd|     Jersey & 6th St|              6.75|
|EA7C9C945D7D57AA|     2025-03-20|2025-03-20 11:

In [54]:
df2.write.format('csv').mode('overwrite').option('header', True).save('silver_jc_citibike')

In [55]:
df3= spark.read.format('csv').option('header','True').load('/content/silver_jc_citibike')

In [56]:
df3.show()

+----------------+-------------+--------------------+--------------------+------------------+----------------+--------------------+--------------+------------------+------------------+------------------+------------------+-------------+---------------+------------------+
|         ride_id|rideable_type|          started_at|            ended_at|start_station_name|start_station_id|    end_station_name|end_station_id|         start_lat|         start_lng|           end_lat|           end_lng|member_casual|trip_start_date|trip_duration_mins|
+----------------+-------------+--------------------+--------------------+------------------+----------------+--------------------+--------------+------------------+------------------+------------------+------------------+-------------+---------------+------------------+
|29DAF43DD84B4B7A|electric_bike|2025-03-20T18:58:...|2025-03-20T19:00:...|   6 St & Grand St|           HB302|Mama Johnson Fiel...|         HB404|40.744397833095604|-74.03450086712837|

In [68]:
df_final1= df3.groupBy('ride_id','trip_start_date')\
              .agg(
               (count('*').alias('total_trips')),\
              max('trip_duration_mins').alias('max_trip_duration_mins'),\
              min('trip_duration_mins').alias('min_trip_duration_mins'),\
              avg('trip_duration_mins').alias('avg_trip_duration_mins')
              )

In [69]:
df_final1.show()

+----------------+---------------+-----------+----------------------+----------------------+----------------------+
|         ride_id|trip_start_date|total_trips|max_trip_duration_mins|min_trip_duration_mins|avg_trip_duration_mins|
+----------------+---------------+-----------+----------------------+----------------------+----------------------+
|000107582539997A|     2025-03-21|          1|                  3.43|                  3.43|                  3.43|
|00018BF241FE344E|     2025-03-19|          1|                 13.23|                 13.23|                 13.23|
|0004A6F6F9FDFDCE|     2025-03-28|          1|                  2.88|                  2.88|                  2.88|
|00064DC406AD90B7|     2025-03-01|          1|                  4.62|                  4.62|                  4.62|
|00066C5FD84AF884|     2025-03-09|          1|                  9.52|                  9.52|                  9.52|
|0006D9B000E102A9|     2025-03-22|          1|                  8.47|   

In [70]:
df_final2= df3.groupBy('trip_start_date','start_station_name').agg(
                      count('*').alias('total_trips'),
                      avg('trip_duration_mins').alias('avg_trip_duration_mins')


)

In [71]:
df_final2.show()

+---------------+--------------------+-----------+----------------------+
|trip_start_date|  start_station_name|total_trips|avg_trip_duration_mins|
+---------------+--------------------+-----------+----------------------+
|     2025-03-19|Baldwin at Montgo...|         31|     7.542580645161291|
|     2025-03-15|             Hilltop|         13|     8.046153846153846|
|     2025-03-16|     Christ Hospital|         13|    10.386923076923075|
|     2025-03-24|     Christ Hospital|         11|     7.692727272727273|
|     2025-03-10|Hoboken Ave at Mo...|         38|    10.066052631578948|
|     2025-03-04|        Jersey & 3rd|         14|    2.7850000000000006|
|     2025-03-07|    Marin Light Rail|         35|     8.744285714285716|
|     2025-03-17|        Glenwood Ave|         11|     9.038181818181819|
|     2025-03-13|Hoboken Ave at Mo...|         39|     6.483589743589744|
|     2025-03-10|       Fairmount Ave|          8|                4.1475|
|     2025-03-26|Baldwin at Montgo...|

In [72]:
df_final1.write.format('csv').mode('overwrite').save('gold_final1_jc_citibike')

In [73]:
df_final2.write.format('csv').mode('overwrite').save('gold_final2_jc_citibike')