# Getting the data

In [0]:
# List the files available in the bike-sharing sample dataset directory.
bike_sharing_dir = "/databricks-datasets/bikeSharing/data-001/"
display(dbutils.fs.ls(bike_sharing_dir))

In [0]:
# Path of the hourly bike-sharing CSV inside the sample dataset directory.
file_path = "/databricks-datasets/bikeSharing/data-001/hour.csv"

# Read the CSV with the first row as the header and with schema inference enabled.
df_raw = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load(file_path)
)


In [0]:
# Quick look at the first ten rows of the raw frame.
df_raw.show(n=10)

# Data cleaning

In [0]:
# Print the column names and the types produced by schema inference on load.
df_raw.printSchema()

In [0]:
# Show the rows ordered by `dteday` in descending order.
df_raw.orderBy(df_raw["dteday"].desc()).show()

In [0]:
# Show the distinct values found in the `cnt` column.
distinct_counts = df_raw.select("cnt").distinct()
distinct_counts.show()

# Overview

- 'instant', -> id
- 'dteday', -> date
- 'season', -> 1,2,3,4
- 'yr', -> 0,1, year 0=2011 1=2012
- 'mnth', -> month
- 'hr', -> hour
- 'holiday', -> 0,1 (no, yes)
- 'weekday', -> 0-6 (day of the week)
- 'workingday', -> 0,1 (no, yes)
- 'weathersit', -> 1-4
- 'temp', -> 0-1 (float), temperature normalized
- 'atemp', -> 0-1 (float), feels_like_temperature normalized
- 'hum', -> 0-1 (float), humidity normalized
- 'windspeed', -> 0-1 (float), windspeed normalized
- 'casual', -> int, count of rentals by casual users
- 'registered', -> int, count of rentals by registered users
- 'cnt' -> int, total count of rentals (casual + registered)

In [0]:
# Mapping from the dataset's abbreviated column names to descriptive ones.
column_renames = {
    "instant": "id",
    "dteday": "date",
    "yr": "year",
    "mnth": "month",
    "hr": "hour",
    "holiday": "is_holiday",
    "workingday": "is_workingday",
    "temp": "temperature_normalized",
    "atemp": "feels_like_temperature_normalized",
    "hum": "humidity_normalized",
    "windspeed": "windspeed_normalized",
    "weathersit": "weather_condition",
    "casual": "count_rentals_casual",
    "registered": "count_rentals_registered",
    "cnt": "count_rentals_total",
}

# Apply the renames one by one, always starting from the untouched raw frame
# so that re-running this cell yields the same result.
df_raw_renamed = df_raw
for old_name, new_name in column_renames.items():
    df_raw_renamed = df_raw_renamed.withColumnRenamed(old_name, new_name)

# Display the resulting column names to confirm the renames.
df_raw_renamed.columns