# Proyek Analisis Data: Bike Sharing Dataset
- Nama: Rizki Laksana Putra
- Email: riskilaksanaputra007@gmail.com
- ID Dicoding: l4ksana

## Menentukan Pertanyaan Bisnis
- Apa saja bulan dan jam tersibuk untuk penyewaan sepeda?
- Apakah ada perbedaan yang signifikan dalam penyewaan sepeda pada hari kerja dan akhir pekan?

## Menyiapkan semua library yang dibutuhkan

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data Wrangling

#### Gathering Data

In [2]:
df_day = pd.read_csv('data/day.csv')
df_day.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [3]:
df_hour = pd.read_csv('data/hour.csv')
df_hour.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


#### Assessing Data

In [4]:
df_day.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.5+ KB


In [5]:
df_day.isna().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

In [6]:
df_day.duplicated().sum()

0

In [7]:
df_day.describe()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,366.0,2.49658,0.500684,6.519836,0.028728,2.997264,0.683995,1.395349,0.495385,0.474354,0.627894,0.190486,848.176471,3656.172367,4504.348837
std,211.165812,1.110807,0.500342,3.451913,0.167155,2.004787,0.465233,0.544894,0.183051,0.162961,0.142429,0.077498,686.622488,1560.256377,1937.211452
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.05913,0.07907,0.0,0.022392,2.0,20.0,22.0
25%,183.5,2.0,0.0,4.0,0.0,1.0,0.0,1.0,0.337083,0.337842,0.52,0.13495,315.5,2497.0,3152.0
50%,366.0,3.0,1.0,7.0,0.0,3.0,1.0,1.0,0.498333,0.486733,0.626667,0.180975,713.0,3662.0,4548.0
75%,548.5,3.0,1.0,10.0,0.0,5.0,1.0,2.0,0.655417,0.608602,0.730209,0.233214,1096.0,4776.5,5956.0
max,731.0,4.0,1.0,12.0,1.0,6.0,1.0,3.0,0.861667,0.840896,0.9725,0.507463,3410.0,6946.0,8714.0


In [8]:
df_hour.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [9]:
df_hour.isna().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

In [10]:
df_hour.duplicated().sum()

0

In [11]:
df_hour.describe()

Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,8690.0,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,5017.0295,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,4345.5,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,8690.0,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,13034.5,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0
max,17379.0,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,367.0,886.0,977.0


- Column instant bisa dihapus karna record index dapat ditampilkan dengan library pandas
- Column dteday memiliki tipe data object (Harus diganti ke tipe data datetime)
- Record dan tipe data column season, yr, mnth, hr, holiday, weekday, workingday, weathersit lebih baik diganti menjadi tipe data categorical supaya mudah dibaca

#### Cleaning Data

In [12]:
df_hour.drop('instant', axis = 1, inplace = True)
df_day.drop('instant', axis = 1, inplace = True)

In [13]:
df_hour.head()

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [14]:
rename = {'dteday': 'date', 'yr': 'year', 'mnth': 'month', 'hr': 'hour', 'hum': 'humidity', 'cnt': 'count'}

df_hour.rename(columns=rename, inplace=True)
df_day.rename(columns=rename, inplace=True)

In [15]:
df_hour.head()

Unnamed: 0,date,season,year,month,hour,holiday,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [16]:
categorical = ['season', 'month', 'weekday', 'weathersit', 'holiday', 'workingday']

for df in [df_day, df_hour]:
    df['date'] = pd.to_datetime(df['date'])
    df[categorical] = df[categorical].astype('category')
    
df_day.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        731 non-null    datetime64[ns]
 1   season      731 non-null    category      
 2   year        731 non-null    int64         
 3   month       731 non-null    category      
 4   holiday     731 non-null    category      
 5   weekday     731 non-null    category      
 6   workingday  731 non-null    category      
 7   weathersit  731 non-null    category      
 8   temp        731 non-null    float64       
 9   atemp       731 non-null    float64       
 10  humidity    731 non-null    float64       
 11  windspeed   731 non-null    float64       
 12  casual      731 non-null    int64         
 13  registered  731 non-null    int64         
 14  count       731 non-null    int64         
dtypes: category(6), datetime64[ns](1), float64(4), int64(4)
memory usage: 57.1

In [17]:
season_mapping = {1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Fall'}
for df in [df_day, df_hour]:
    df['season'].replace(season_mapping, inplace = True)
    
df_day.head()

Unnamed: 0,date,season,year,month,holiday,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01,Winter,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2011-01-02,Winter,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,2011-01-03,Winter,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,2011-01-04,Winter,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,2011-01-05,Winter,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [18]:
year_mapping = {0: 2011, 1: 2012}
for df in [df_day, df_hour]:
    df['year'].replace(year_mapping, inplace = True)
    
df_day.head()

Unnamed: 0,date,season,year,month,holiday,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01,Winter,2011,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2011-01-02,Winter,2011,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,2011-01-03,Winter,2011,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,2011-01-04,Winter,2011,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,2011-01-05,Winter,2011,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [19]:
month_mapping = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}
for df in [df_day, df_hour]:
    df['month'].replace(month_mapping, inplace = True)
    
df_day.head()

Unnamed: 0,date,season,year,month,holiday,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01,Winter,2011,January,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2011-01-02,Winter,2011,January,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,2011-01-03,Winter,2011,January,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,2011-01-04,Winter,2011,January,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,2011-01-05,Winter,2011,January,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [20]:
holiday_mapping = {0: 'No', 1: 'Yes'}
for df in[df_day, df_hour]:
    df['holiday'].replace(holiday_mapping, inplace = True)
    
df_day.head()

Unnamed: 0,date,season,year,month,holiday,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01,Winter,2011,January,No,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2011-01-02,Winter,2011,January,No,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,2011-01-03,Winter,2011,January,No,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,2011-01-04,Winter,2011,January,No,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,2011-01-05,Winter,2011,January,No,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [21]:
weekday_mapping = {0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday'}
for df in [df_day, df_hour]:
    df['weekday'].replace(weekday_mapping, inplace = True)
    
df_day.head()

Unnamed: 0,date,season,year,month,holiday,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01,Winter,2011,January,No,Saturday,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2011-01-02,Winter,2011,January,No,Sunday,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,2011-01-03,Winter,2011,January,No,Monday,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,2011-01-04,Winter,2011,January,No,Tuesday,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,2011-01-05,Winter,2011,January,No,Wednesday,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [22]:
workingday_mapping = {0: 'No', 1: 'Yes'}
for df in[df_day, df_hour]:
    df['workingday'].replace(workingday_mapping, inplace = True)
    
df_day.head()

Unnamed: 0,date,season,year,month,holiday,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01,Winter,2011,January,No,Saturday,No,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2011-01-02,Winter,2011,January,No,Sunday,No,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,2011-01-03,Winter,2011,January,No,Monday,Yes,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,2011-01-04,Winter,2011,January,No,Tuesday,Yes,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,2011-01-05,Winter,2011,January,No,Wednesday,Yes,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [23]:
weathersit_mapping = {1: 'Clear', 2: 'Mist', 3: 'Light Rain', 4: 'Heavy Rain'}
for df in [df_day, df_hour]:
    df['weathersit'].replace(weathersit_mapping, inplace = True)
    
df_hour.head()

Unnamed: 0,date,season,year,month,hour,holiday,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01,Winter,2011,January,0,No,Saturday,No,Clear,0.24,0.2879,0.81,0.0,3,13,16
1,2011-01-01,Winter,2011,January,1,No,Saturday,No,Clear,0.22,0.2727,0.8,0.0,8,32,40
2,2011-01-01,Winter,2011,January,2,No,Saturday,No,Clear,0.22,0.2727,0.8,0.0,5,27,32
3,2011-01-01,Winter,2011,January,3,No,Saturday,No,Clear,0.24,0.2879,0.75,0.0,3,10,13
4,2011-01-01,Winter,2011,January,4,No,Saturday,No,Clear,0.24,0.2879,0.75,0.0,0,1,1


## Exploratory Data Analysis

In [24]:
df_hour.describe(include = 'all').round()

Unnamed: 0,date,season,year,month,hour,holiday,weekday,workingday,weathersit,temp,atemp,humidity,windspeed,casual,registered,count
count,17379,17379,17379.0,17379,17379.0,17379,17379,17379,17379,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
unique,,4,,12,,2,7,2,4,,,,,,,
top,,Summer,,May,,No,Saturday,Yes,Clear,,,,,,,
freq,,4496,,1488,,16879,2512,11865,11413,,,,,,,
mean,2012-01-02 04:08:34.552045568,,2012.0,,12.0,,,,,0.0,0.0,1.0,0.0,36.0,154.0,189.0
min,2011-01-01 00:00:00,,2011.0,,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,2011-07-04 00:00:00,,2011.0,,6.0,,,,,0.0,0.0,0.0,0.0,4.0,34.0,40.0
50%,2012-01-02 00:00:00,,2012.0,,12.0,,,,,0.0,0.0,1.0,0.0,17.0,115.0,142.0
75%,2012-07-02 00:00:00,,2012.0,,18.0,,,,,1.0,1.0,1.0,0.0,48.0,220.0,281.0
max,2012-12-31 00:00:00,,2012.0,,23.0,,,,,1.0,1.0,1.0,1.0,367.0,886.0,977.0


In [25]:
df_hour.groupby(by = 'hour').agg({
    'casual': 'sum',
    'registered': 'sum',
    'count': 'sum'
})

Unnamed: 0_level_0,casual,registered,count
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,7375,31755,39130
1,4709,19455,24164
2,3412,12940,16352
3,1893,6281,8174
4,874,3554,4428
5,1012,13249,14261
6,3017,52115,55132
7,8037,146134,154171
8,15761,245240,261001
9,22458,136980,159438


In [26]:
df_hour.groupby(by = 'month').agg({
    'casual': 'sum',
    'registered': 'sum',
    'count': 'sum'
})

  df_hour.groupby(by = 'month').agg({


Unnamed: 0_level_0,casual,registered,count
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
January,12042,122891,134933
February,14963,136389,151352
March,44444,184476,228920
April,60802,208292,269094
May,75285,256401,331686
June,73906,272436,346342
July,78157,266791,344948
August,72039,279155,351194
September,70323,275668,345991
October,59760,262592,322352


In [27]:
df_hour.groupby(by = 'workingday').agg({
    'casual': 'sum',
    'registered': 'sum',
    'count': 'sum'
})

  df_hour.groupby(by = 'workingday').agg({


Unnamed: 0_level_0,casual,registered,count
workingday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,316732,683537,1000269
Yes,303285,1989125,2292410


## Data Visualization