# Importation des librairies et démarrage de la session

In [6]:
from pyspark.sql import SparkSession
import pyspark
import pyspark.sql.functions as F
spark = SparkSession.builder.getOrCreate()

# Importation des données et formatage des colonnes

In [62]:
#Importation des deux datasets en csv et suppression des colonnes "month" et "year" de trips car doublon d'une autre colonne
df_trips = spark.read.csv("data/Austin bikes/austin_bikeshare_trips.csv", header=True, inferSchema=True).drop("month","year")
df_stations = spark.read.csv("data/Austin bikes/austin_bikeshare_stations.csv", header=True, inferSchema=True)

#Changement de types de données (pour la plupart, passer de double à integer)
df_trips = df_trips.withColumn("bikeid", df_trips["bikeid"].cast("integer"))\
    .withColumn("trip_id", df_trips["trip_id"].cast("integer"))\
    .withColumn("start_station_id", df_trips["start_station_id"].cast("integer"))\
    .withColumn("end_station_id", df_trips["end_station_id"].cast("integer"))\
    .withColumn("start_datetime", F.to_timestamp(df_trips["start_time"])).drop("start_time")

#Création d'une colonne "date" en plus de la colonne "datetime" pour avoir les deux
df_trips = df_trips.withColumn("date", F.to_date(df_trips["start_datetime"]))

In [63]:
df_trips.show(5)

+------+-------------+----------------+--------------+--------------------+----------------+--------------------+--------------------+----------+-------------------+----------+
|bikeid|checkout_time|duration_minutes|end_station_id|    end_station_name|start_station_id|  start_station_name|     subscriber_type|   trip_id|     start_datetime|      date|
+------+-------------+----------------+--------------+--------------------+----------------+--------------------+--------------------+----------+-------------------+----------+
|     8|     19:12:00|              41|          2565|Trinity & 6th Street|            2536|    Waller & 6th St.|             Walk Up|1310148290|2015-03-19 19:12:00|2015-03-19|
|   141|      2:06:04|               6|          2570|South Congress & ...|            2494|      2nd & Congress|            Local365|  12617682|2016-10-30 02:06:04|2016-10-30|
|   578|     16:28:27|              13|          2498|Convention Center...|            2538|Bullock Museum @ ...|  

In [64]:
df_trips.printSchema()

root
 |-- bikeid: integer (nullable = true)
 |-- checkout_time: string (nullable = true)
 |-- duration_minutes: integer (nullable = true)
 |-- end_station_id: integer (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- start_station_id: integer (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- subscriber_type: string (nullable = true)
 |-- trip_id: integer (nullable = true)
 |-- start_datetime: timestamp (nullable = true)
 |-- date: date (nullable = true)



# Traitements (Group by)

In [79]:
#Groupement en faisant la moyenne de la durée des trajets par jour
df_avg_duration_per_day = df_trips.groupBy("date").mean("duration_minutes")
df_avg_duration_per_day.show(5)

+----------+---------------------+
|      date|avg(duration_minutes)|
+----------+---------------------+
|2014-09-26|   23.970178926441353|
|2016-03-01|   24.844282238442823|
|2015-05-19|   20.847031963470318|
|2014-11-12|   13.945121951219512|
|2015-03-09|   138.27272727272728|
+----------+---------------------+
only showing top 5 rows



In [80]:
#Groupement en comptant le nombre de trajets par jour
df_trips_per_start_station = df_trips.groupBy("date","start_station_name").count()
df_trips_per_start_station.show(5)

+----------+--------------------+-----+
|      date|  start_station_name|count|
+----------+--------------------+-----+
|2016-08-30|East 11th St. & S...|    9|
|2016-06-23|San Jacinto & 8th...|    8|
|2014-07-29|         5th & Bowie|   29|
|2017-02-23|UT West Mall @ Gu...|    8|
|2017-07-16|City Hall / Lavac...|   23|
+----------+--------------------+-----+
only showing top 5 rows



In [81]:
#Groupement en comptant le nombre de trajets par jour
df_trips_per_end_station = df_trips.groupBy("date","end_station_name").count()
df_trips_per_end_station.show(5)

+----------+--------------------+-----+
|      date|    end_station_name|count|
+----------+--------------------+-----+
|2017-02-01|Convention Center...|   11|
|2014-05-24|South Congress & ...|   42|
|2015-04-03|South Congress & ...|   12|
|2015-03-11|Rainey St @ Cummings|   19|
|2015-04-01|Republic Square @...|    7|
+----------+--------------------+-----+
only showing top 5 rows



# Output

In [86]:
#On exporte les datasets modifiés et créés en csv
df_trips.repartition(1).write.mode('overwrite').csv('data/Output/df_trips.csv')

df_avg_duration_per_day.repartition(1).write.mode('overwrite').csv('data/Output/df_avg_duration_per_day.csv')

df_trips_per_start_station.repartition(1).write.mode('overwrite').csv('data/Output/df_trips_per_start_station.csv')

df_trips_per_end_station.repartition(1).write.mode('overwrite').csv('data/Output/df_trips_per_end_station.csv')

df_stations.repartition(1).write.mode('overwrite').csv('data/Output/df_stations.csv')