In [1]:
import pandas as pd

In [2]:
# Locations of the two source tables, relative to the notebook's working directory
DAY_CSV_PATH = 'Bike Sharing Dataset/day.csv'
HOUR_CSV_PATH = 'Bike Sharing Dataset/hour.csv'

# Read each CSV into its own DataFrame (day first, then hour)
df_day = pd.read_csv(DAY_CSV_PATH)
df_hour = pd.read_csv(HOUR_CSV_PATH)

In [3]:
# Give df_day an 'hr' column so that it has the same columns as df_hour before
# the two frames are concatenated.
# The column is only created when it is genuinely absent: an existing 'hr'
# column is never overwritten, and re-running this cell is a no-op.
# NOTE: every value is pd.NA; after concatenation the combined frame's 'hr'
# column is reported as object dtype by df_combined.info().
if 'hr' not in df_day.columns:
    df_day['hr'] = pd.NA

In [4]:
# Stack the daily rows on top of the hourly rows and renumber the index from 0
df_combined = pd.concat((df_day, df_hour), axis=0, ignore_index=True)

# Work out the target column order: take the current order, pull 'hr' out of
# wherever it sits, and slot it back in as the 6th column (index 5)
cols = list(df_combined.columns)
del cols[cols.index('hr')]
cols[5:5] = ['hr']

# Apply the new column order
df_combined = df_combined.loc[:, cols]

# Persist the combined dataset as a CSV file, without writing the index
df_combined.to_csv('combined_dataset_dayhour.csv', index=False)

# Show the combined dataset
print(df_combined)

       instant      dteday  season  yr  mnth   hr  holiday  weekday  \
0            1  2011-01-01       1   0     1  NaN        0        6   
1            2  2011-01-02       1   0     1  NaN        0        0   
2            3  2011-01-03       1   0     1  NaN        0        1   
3            4  2011-01-04       1   0     1  NaN        0        2   
4            5  2011-01-05       1   0     1  NaN        0        3   
...        ...         ...     ...  ..   ...  ...      ...      ...   
18105    17375  2012-12-31       1   1    12   19        0        1   
18106    17376  2012-12-31       1   1    12   20        0        1   
18107    17377  2012-12-31       1   1    12   21        0        1   
18108    17378  2012-12-31       1   1    12   22        0        1   
18109    17379  2012-12-31       1   1    12   23        0        1   

       workingday  weathersit      temp     atemp       hum  windspeed  \
0               0           2  0.344167  0.363625  0.805833   0.160446   

In [5]:
# Preview the first five rows of the combined frame
df_combined.head(n=5)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [6]:
# Print a structural summary of the combined frame: index range, per-column
# non-null counts and dtypes, and memory usage
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18110 entries, 0 to 18109
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     18110 non-null  int64  
 1   dteday      18110 non-null  object 
 2   season      18110 non-null  int64  
 3   yr          18110 non-null  int64  
 4   mnth        18110 non-null  int64  
 5   hr          17379 non-null  object 
 6   holiday     18110 non-null  int64  
 7   weekday     18110 non-null  int64  
 8   workingday  18110 non-null  int64  
 9   weathersit  18110 non-null  int64  
 10  temp        18110 non-null  float64
 11  atemp       18110 non-null  float64
 12  hum         18110 non-null  float64
 13  windspeed   18110 non-null  float64
 14  casual      18110 non-null  int64  
 15  registered  18110 non-null  int64  
 16  cnt         18110 non-null  int64  
dtypes: float64(4), int64(11), object(2)
memory usage: 2.3+ MB


In [8]:
# Count the missing values in each column of the combined frame
missing_per_column = df_combined.isna().sum(axis=0)
missing_per_column

instant         0
dteday          0
season          0
yr              0
mnth            0
hr            731
holiday         0
weekday         0
workingday      0
weathersit      0
temp            0
atemp           0
hum             0
windspeed       0
casual          0
registered      0
cnt             0
dtype: int64

In [9]:
# Number of rows that are exact repeats of an earlier row
duplicate_count = df_combined.duplicated().sum()
print("Jumlah Duplikat : ", duplicate_count)

Jumlah Duplikat :  0


In [11]:
# Summary statistics for every column, numeric and non-numeric alike
summary_stats = df_combined.describe(include="all")
summary_stats

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,18110.0,18110,18110.0,18110.0,18110.0,17379.0,18110.0,18110.0,18110.0,18110.0,18110.0,18110.0,18110.0,18110.0,18110.0,18110.0,18110.0
unique,,731,,,,24.0,,,,,,,,,,,
top,,2011-01-01,,,,17.0,,,,,,,,,,,
freq,,25,,,,730.0,,,,,,,,,,,
mean,8354.006405,,2.501436,0.502485,6.537051,,0.028769,3.003424,0.682772,1.424075,0.496922,0.475718,0.627256,0.190113,68.472336,295.158697,363.631033
std,5180.77186,,1.107045,0.500008,3.439213,,0.16716,2.005677,0.46541,0.63583,0.192177,0.171496,0.191147,0.120851,216.588054,771.555142,950.827954
min,1.0,,1.0,0.0,1.0,,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,3797.25,,2.0,0.0,4.0,,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,37.0,43.0
50%,8324.5,,3.0,1.0,7.0,,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.188951,18.0,123.0,152.0
75%,12851.75,,3.0,1.0,10.0,,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,55.0,238.0,305.0


In [None]:
# Histogram of each numeric column in the combined frame, one subplot per column.
# An explicit, larger figsize keeps the grid of subplots legible, and the
# trailing semicolon stops the notebook from echoing the returned array of Axes.
# NOTE(review): df_combined stacks the rows of day.csv and hour.csv, so each
# histogram pools both sources — confirm that this is intended.
df_combined.hist(figsize=(16, 12));