# Bike Sharing Demand Basic EDA for Everyone

### This is a basic EDA notebook for everyone (including beginners). If you find it useful, please upvote. 😊

### I also shared a [Top 6.6% Solution Notebook](https://www.kaggle.com/werooring/bike-sharing-demand-top-6-6-solution). It is simple enough that even beginners can follow it.

- [Bike Sharing Demand Competition](https://www.kaggle.com/c/bike-sharing-demand)

- [EDA Reference Notebook](https://www.kaggle.com/viveksrinivasan/eda-ensemble-model-top-10-percentile)

## Look Around Data

In [None]:
import numpy as np
import pandas as pd

# Load the three competition CSV files in one pass; the generator yields
# the DataFrames in the same order as the names on the left-hand side.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/train.csv",
        "/kaggle/input/test.csv",
        "/kaggle/input/sampleSubmission.csv",
    )
)

### Check the size of train and test data

In [None]:
# (rows, columns) of the train and test sets, echoed by the notebook as a tuple
(train.shape, test.shape)

### Print out the first five rows of train, test and submission data 

In [None]:
# Preview the first five rows of the training data
train.head(n=5)

In [None]:
# Preview the first five rows of the test data
test.head(n=5)

In [None]:
# Preview the first five rows of the sample submission file
submission.head(n=5)

### Inspect the train and test information

In [None]:
# Print a summary of the training DataFrame (columns, non-null counts, dtypes)
train.info()

In [None]:
# Print a summary of the test DataFrame (columns, non-null counts, dtypes)
test.info()

## Pre-feature Engineering for Exploratory Data Analysis

### Extract new features from datetime feature

In [None]:
# Take the datetime string at index label 100 and split it on whitespace
sample_stamp = train['datetime'][100]
date_and_time = sample_stamp.split()

print(sample_stamp)      # the raw datetime string
print(date_and_time)     # list of the whitespace-separated pieces
print(date_and_time[0])  # date
print(date_and_time[1])  # time

In [None]:
# Break the date half of the sample datetime into its '-'-separated fields
sample_date = train['datetime'][100].split()[0]
date_fields = sample_date.split('-')

print(sample_date)     # date
print(date_fields)     # list of the '-'-separated pieces
print(date_fields[0])  # year
print(date_fields[1])  # month
print(date_fields[2])  # day

In [None]:
# Break the time half of the sample datetime into its ':'-separated fields
sample_time = train['datetime'][100].split()[1]
time_fields = sample_time.split(':')

print(sample_time)     # time
print(time_fields)     # list of the ':'-separated pieces
print(time_fields[0])  # hour
print(time_fields[1])  # minute
print(time_fields[2])  # second

In [None]:
# Split every datetime string into its date half and its time half.
# NOTE(review): assumes each value looks like "<date> <time>" with '-' and ':'
# separators, as in the sample printed in the cells above.
date_text = train['datetime'].apply(lambda stamp: stamp.split()[0])
clock_text = train['datetime'].apply(lambda stamp: stamp.split()[1])

train['date'] = date_text  # Make date feature

# year / month / day are the 0th / 1st / 2nd '-'-separated pieces of the date.
# `position=position` binds the current index to each lambda.
for position, column in enumerate(['year', 'month', 'day']):
    train[column] = date_text.apply(
        lambda text, position=position: text.split('-')[position])

# hour / minute / second are the 0th / 1st / 2nd ':'-separated pieces of the time.
for position, column in enumerate(['hour', 'minute', 'second']):
    train[column] = clock_text.apply(
        lambda text, position=position: text.split(':')[position])

### Extract the weekday feature from the date string

In [None]:
from datetime import datetime
import calendar

# Walk one sample date through the string -> datetime -> weekday conversion
date_string = train['date'][100]
parsed_date = datetime.strptime(date_string, '%Y-%m-%d')
weekday_index = parsed_date.weekday()

print(date_string)                       # date
print(parsed_date)                       # the same date as a datetime object
print(weekday_index)                     # weekday as an integer (Monday is 0)
print(calendar.day_name[weekday_index])  # weekday as a string

In [None]:
def weekday_name(date_string):
    """Return the calendar.day_name entry for a '%Y-%m-%d' date string."""
    parsed_date = datetime.strptime(date_string, "%Y-%m-%d")
    return calendar.day_name[parsed_date.weekday()]


# Make weekday feature from the date strings
train['weekday'] = train['date'].apply(weekday_name)

### Convert the season and weather features from numbers to strings

In [None]:
# Lookup tables from the numeric codes to readable labels
season_names = {1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'}
weather_names = {
    1: 'Clear',
    2: 'Mist, Few clouds',
    3: 'Light Snow, Rain, Thunder',
    4: 'Heavy Snow, Rain, Thunder',
}

# Replace the codes in place. Values missing from a table become NaN,
# so running this cell a second time would blank both columns.
train['season'] = train['season'].map(season_names)
train['weather'] = train['weather'].map(weather_names)

In [None]:
# Check the first five rows again, now including the new feature columns
train.head(n=5)

## Visualize Data

### Distribution of count feature

In [None]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

mpl.rc('font', size=15) # Set font size to 15
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' draws the same kind of figure (density histogram + KDE curve).
sns.histplot(train['count'], kde=True, stat='density');

### Distribution of log transformation of count feature

In [None]:
# Distribution of the natural log of count. sns.distplot is deprecated in
# seaborn, so use histplot with a KDE overlay on a density scale instead.
# NOTE(review): np.log gives -inf for a count of 0 -- presumably every count is >= 1; confirm.
sns.histplot(np.log(train['count']), kde=True, stat='density');

### Compare before and after `plt.tight_layout()` application

In [None]:
# Global text sizes: 14 for general font, 15 for axes titles
mpl.rcParams.update({'font.size': 14, 'axes.titlesize': 15})
# Empty 3x2 grid WITHOUT tight_layout, for comparison with the next cell
figure, axes = plt.subplots(3, 2)

In [None]:
# The same empty 3x2 grid, this time with tight_layout applied
figure, axes = plt.subplots(3, 2)
figure.tight_layout()

### Barplot of the average rental counts per year, month, day, hour, minute, and second

In [None]:
# Step 1 : Prepare m rows n columns Figure
mpl.rc('font', size=14)
mpl.rc('axes', titlesize=15)
figure, axes = plt.subplots(nrows=3, ncols=2) # Make Figure in 3rows 2columns
# Padding between the graphs. `pad` must be passed by keyword: current
# Matplotlib raises TypeError for the positional form tight_layout(0.1).
plt.tight_layout(pad=0.1)
figure.set_size_inches(10, 9) # Set total Figure size

# Step 2 : Assign subplot and title
# Each axis gets the average rental count barplot for one time feature.
# axes.flat walks the grid row by row: (0,0), (0,1), (1,0), (1,1), (2,0), (2,1).
time_features = ['year', 'month', 'day', 'hour', 'minute', 'second']
for axis, feature in zip(axes.flat, time_features):
    sns.barplot(x=feature, y='count', data=train, ax=axis)
    axis.set(title='Rental amounts by ' + feature)

# Step 3 : Detailed Settings
# Rotate the x-axis labels by 90 degrees on the middle row (day and hour)
for axis in axes[1]:
    axis.tick_params(axis='x', labelrotation=90)

### Boxplot of rental counts per season, weather, holiday, and workingday

In [None]:
# Step 1 : Build a 2x2 grid, tighten the layout, then enlarge the figure
figure, axes = plt.subplots(2, 2)
figure.tight_layout()
figure.set_size_inches(10, 10)

# Step 2 : One boxplot of rental counts per categorical feature, with its title.
# axes.flat walks the grid row by row: (0,0), (0,1), (1,0), (1,1).
panels = [
    ('season', 'Box Plot On Count Across Season'),
    ('weather', 'Box Plot On Count Across Weather'),
    ('holiday', 'Box Plot On Count Across Holiday'),
    ('workingday', 'Box Plot On Count Across Working Day'),
]
for axis, (feature, title) in zip(axes.flat, panels):
    sns.boxplot(x=feature, y='count', data=train, ax=axis)
    axis.set(title=title)

# Step 3 : The weather labels are long, so tilt them 10 degrees to avoid overlap
axes[0, 1].tick_params(axis='x', labelrotation=10)

### Pointplot of the average rental counts per hour by workingday, holiday, weekday, season, weather

In [None]:
# Step 1 : Five stacked axes, one per grouping feature
mpl.rcParams['font.size'] = 11
figure, axes = plt.subplots(nrows=5)
figure.set_size_inches(12, 18)

# Step 2 : Average rental count per hour, one line per category of the hue feature
hue_features = ['workingday', 'holiday', 'weekday', 'season', 'weather']
for axis, hue_feature in zip(axes, hue_features):
    sns.pointplot(x='hour', y='count', data=train, hue=hue_feature, ax=axis)

### Regplot of rental counts per temp, atemp, windspeed, humidity

In [None]:
# Step 1 : Build a 2x2 grid, tighten the layout, then set the figure size
mpl.rcParams['font.size'] = 15
figure, axes = plt.subplots(2, 2)
figure.tight_layout()
figure.set_size_inches(7, 6)

# Step 2 : Scatter plot with a fitted regression line of count against each
# numeric feature. Points are semi-transparent so dense regions stay readable.
# axes.flat walks the grid row by row: (0,0), (0,1), (1,0), (1,1).
numeric_features = ['temp', 'atemp', 'windspeed', 'humidity']
for axis, feature in zip(axes.flat, numeric_features):
    sns.regplot(
        x=feature,
        y='count',
        data=train,
        ax=axis,
        scatter_kws={'alpha': 0.2},
        line_kws={'color': 'blue'},
    )

### Heatmap of correlations between numerical features

In [None]:
# Pairwise correlation table of the numeric columns and the target
numeric_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'count']
train[numeric_columns].corr()

In [None]:
# Inter-feature correlation matrix of the numeric columns and the target
heatmap_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'count']
corrMatt = train[heatmap_columns].corr()

# Draw the annotated heatmap on an explicit 10x10 inch axes
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corrMatt, annot=True, ax=ax)
ax.set_title('Heatmap of Numerical Data');