In [29]:
import ConnectionConfig as cc
cc.setupEnvironment()

In [30]:
spark = cc.startLocalCluster("Analyzing")  # default 4
spark.getActiveSession()

In [31]:
#EXTRACT
dim_weather = spark.read.format("delta").load("spark-warehouse/dimWeather")
dim_date = spark.read.format("delta").load("spark-warehouse/dimDate")           
fact_rides = spark.read.format("delta").load("spark-warehouse/factRideS1")

In [32]:
#TRANSFORM
#TEMP VIEW MAKEN  
dim_weather.createOrReplaceTempView("dimWeather")
dim_date.createOrReplaceTempView("dimDate")
fact_rides.createOrReplaceTempView("factRide")

## Wat zijn de drukke momenten (op dagbasis) in de week t.o.v. het weekend? 

Om de drukste momenten van de week te vergelijken met het weekend, kunnen we de gegevens per uur samenvoegen en groeperen op weekdag en weekend.

In [33]:
#Hier is de SQL-query om het gemiddelde aantal ritten per uur voor weekdagen en weekenden te berekenen:
busiest_times_query = spark.sql("""
WITH ride_counts AS (
    SELECT 
           CASE 
            WHEN date_format(dd.calendarDate, 'EEEE') IN ('Saturday', 'Sunday') THEN 'Weekend'
            ELSE 'Weekday'
           END as day_type,
           COUNT(fr.rideSK) as ride_count
    FROM factRide fr
    LEFT JOIN dimDate dd ON fr.dateSK = dd.dateSK
    GROUP BY day_type, dd.calendarDate
)
SELECT 
    day_type,
    AVG(ride_count) as avg_ride_count
FROM ride_counts
GROUP BY day_type
ORDER BY avg_ride_count DESC
""")

#Show the result
df_busiest_times = busiest_times_query.show(10)

# De avondspits (20:00u) is de drukste tijd, zowel weekdagen als in het weekend. In het weekend zijn er echter meer ritten.

+--------+------------------+
|day_type|    avg_ride_count|
+--------+------------------+
| Weekend|3499.0626506024096|
| Weekday|3137.1277617675314|
+--------+------------------+



## Hebben datumparameters invloed op de afgelegde afstand?

Om te bepalen of datumparameters van invloed zijn op de afgelegde afstand, moeten we de relatie analyseren tussen verschillende datumkenmerken (zoals dag van de week, maand, seizoen) en de afgelegde afstand tijdens ritten.

In [34]:
## SQL query to analyze the impact of date parameters on ride distance (or duration in this case)
distance_analysis_query = spark.sql("""
WITH detailed_rides AS (
    SELECT 
        fr.rideSK,
        fr.ride_duration,
        dd.dateSK,
        date_format(dd.calendarDate, 'EEEE') as day_of_week,
        date_format(dd.calendarDate, 'MMMM') as month,
        CASE
          WHEN month(dd.calendarDate) IN (12, 1, 2) THEN 'Winter'
          WHEN month(dd.calendarDate) IN (3, 4, 5) THEN 'Spring'
          WHEN month(dd.calendarDate) IN (6, 7, 8) THEN 'Summer'
          WHEN month(dd.calendarDate) IN (9, 10, 11) THEN 'Autumn'
        END as season
    FROM factRide fr
    LEFT JOIN dimDate dd ON fr.dateSK = dd.dateSK
)
SELECT 
    day_of_week,
    month,
    season,
    AVG(ride_duration) as avg_ride_duration
FROM detailed_rides
GROUP BY day_of_week, month, season
ORDER BY avg_ride_duration DESC
""")

# Show the result
df_distance_analysis = distance_analysis_query.show(10)

#De gegevens geven aan dat ritten doorgaans langer duren in de herfstmaanden september en oktober en de zomermaand augustus. Dit suggereert dat datumparameters, met name specifieke maanden en seizoenen, wel degelijk invloed hebben op de afgelegde afstand.

+-----------+---------+------+-----------------+
|day_of_week|    month|season|avg_ride_duration|
+-----------+---------+------+-----------------+
|   Saturday|September|Autumn|608.4465562810946|
|  Wednesday|     July|Summer|604.3780098105066|
|     Monday|September|Autumn|601.8979079497908|
|    Tuesday|  October|Autumn| 601.441706186041|
|     Friday|  October|Autumn| 600.796260396473|
|   Thursday|September|Autumn|598.7352161355997|
|    Tuesday|     June|Summer|598.7132004936556|
|     Sunday|  October|Autumn|597.8110277412412|
|     Sunday|September|Autumn|596.7837614391196|
|    Tuesday| December|Winter|596.5815187431092|
+-----------+---------+------+-----------------+
only showing top 10 rows



## Heeft weer invloed op ritten?

In [35]:
# SQL query to analyze the impact of weather on rides, filtering out negative durations
weather_analysis_query = spark.sql("""
WITH weather_rides AS (
    SELECT
        fr.rideSK,
        fr.ride_duration,
        dw.temperature_condition,
        dw.weather_condition
    FROM factRide fr
    JOIN dimWeather dw ON fr.weatherSK = dw.weatherSK
    WHERE fr.ride_duration >= 0
)
SELECT
    temperature_condition, 
    weather_condition, 
    COUNT(rideSK) AS rides_per_weather, 
    AVG(ride_duration) AS avg_duration_per_weather
FROM weather_rides
GROUP BY
    temperature_condition, weather_condition
ORDER BY avg_duration_per_weather DESC
""")

# Show the result
df_weather_analysis = weather_analysis_query.show()

#Ja weer heeft invloed op ritten, kijk naar avg_duration_per_weather bij Sunny is minder dan bij Cloudy en Rainy

+---------------------+-----------------+-----------------+------------------------+
|temperature_condition|weather_condition|rides_per_weather|avg_duration_per_weather|
+---------------------+-----------------+-----------------+------------------------+
|         <15 and >-10|           Cloudy|           289645|       653.4118731550691|
|                  <15|            Rainy|           282407|       649.3295916885913|
|                 >=15|            Sunny|           297477|       644.5499416761631|
|                  any|          unknown|          3848295|       579.7117931447564|
+---------------------+-----------------+-----------------+------------------------+



## Hoe varieert het aantal ritten op verschillende weekdagen en onder verschillende weersomstandigheden?

In [36]:
# SQL query to analyze the number of rides across different weekdays under various weather conditions
weekly_weather_ride_query = spark.sql("""
WITH rides_with_weather_date AS (
    SELECT
        fr.rideSK,
        date_format(dd.calendarDate, 'EEEE') AS day_of_week, -- Extracting day of week
        dw.weather_condition
    FROM factRide fr
    JOIN dimWeather dw ON fr.weatherSK = dw.weatherSK
    JOIN dimDate dd ON fr.dateSK = dd.dateSK
)
SELECT
    day_of_week,
    weather_condition,
    COUNT(rideSK) AS rides_per_day_weather
FROM rides_with_weather_date
GROUP BY
    day_of_week, weather_condition
ORDER BY day_of_week, rides_per_day_weather DESC
""")

# Show the result
weekly_weather_ride_query.show()

#Friday,Sunny,34242        Saturday, Sunny ,38515
#Friday,Cloudy,32854        Saturday,Cloudy,37451
#Friday,Rainy,31939         Saturday,Rainy,36399

+-----------+-----------------+---------------------+
|day_of_week|weather_condition|rides_per_day_weather|
+-----------+-----------------+---------------------+
|     Friday|          unknown|               387897|
|     Friday|            Sunny|                34242|
|     Friday|           Cloudy|                32854|
|     Friday|            Rainy|                31939|
|     Monday|          unknown|               665385|
|     Monday|            Sunny|                49533|
|     Monday|           Cloudy|                48124|
|     Monday|            Rainy|                47266|
|   Saturday|          unknown|               504048|
|   Saturday|            Sunny|                38515|
|   Saturday|           Cloudy|                37451|
|   Saturday|            Rainy|                36399|
|     Sunday|          unknown|               689895|
|     Sunday|            Sunny|                49630|
|     Sunday|           Cloudy|                48738|
|     Sunday|            Rai

## Hoe is de verdeling van ritten die onder verschillende weersomstandigheden zijn gestart?

In [37]:
# SQL query to analyze the distribution of rides started at different times of the day under various weather conditions
rides_time_of_day_weather_query = spark.sql("""
WITH rides_with_weather_time AS (
    SELECT
        fr.rideSK,
        dw.weather_condition
    FROM factRide fr
    JOIN dimWeather dw ON fr.weatherSK = dw.weatherSK
    WHERE fr.ride_duration >= 0
)
SELECT
    weather_condition,
    COUNT(rideSK) AS rides_per_hour_weather
FROM rides_with_weather_time
GROUP BY weather_condition
ORDER BY rides_per_hour_weather DESC
""")

# Show the result
rides_time_of_day_weather_query.show(50)

+-----------------+----------------------+
|weather_condition|rides_per_hour_weather|
+-----------------+----------------------+
|          unknown|               3848295|
|            Sunny|                297477|
|           Cloudy|                289645|
|            Rainy|                282407|
+-----------------+----------------------+

