In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.weightstats import ztest as ztest
from scipy.stats import ttest_ind_from_stats as ttest
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway
import scipy.stats
from scipy.stats import levene
from scipy.stats import shapiro

Checking the shape of data

In [None]:
# NOTE(review): `df` is not created in any cell visible here; presumably a
# data-loading cell (e.g. a pd.read_csv call) exists elsewhere — confirm,
# otherwise this raises NameError on a fresh kernel.
df.shape  # (number of rows, number of columns)

Checking data types of features

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

As we can see, the data type of `datetime` is `object`, so we convert it to a proper datetime type.

In [None]:
# Work on an independent (deep) copy so derived columns never touch the raw frame.
df_copy = df.copy(deep=True)

In [None]:
# Replace the string timestamps on the working copy with parsed datetime values.
df_copy = df_copy.assign(datetime=pd.to_datetime(df['datetime']))

In [None]:
# Re-inspect the dtypes to confirm the `datetime` column was converted.
df_copy.info()

Extracting the date from datetime.

In [None]:
# Add a calendar-date column (time of day dropped) derived from `datetime`.
df_copy = df_copy.assign(date=df_copy['datetime'].dt.date)

Checking the distribution of data

In [None]:
# Summary statistics of the raw frame as produced by DataFrame.describe().
df.describe()

Checking the null values

In [None]:
# Number of missing values in each column.
df.isna().sum()

Checking unique values

In [None]:
# Print the distinct codes that occur in each categorical feature.
list_col = ['season', 'holiday', 'workingday', 'weather']
for feature in list_col:
    print(f'{feature.upper()} : {df[feature].unique()} ')

Checking the distribution of count of bicycles

In [None]:
# Box plot of the rental count on the current axes, to show spread and outliers.
sns.boxplot(data=df, y='count', ax=plt.gca())
plt.show()

Distribution of count of bicycles across different seasons

In [None]:
# Rental count per season code, one box per season, drawn on the current axes.
sns.boxplot(data=df, x='season', y='count', ax=plt.gca())
plt.show()

In [None]:
# Number of rows recorded for each season code.
df['season'].value_counts()

In [None]:
# Per-season summary statistics of `count`, transposed so each season is a column.
df.groupby('season')['count'].describe().transpose()

The mean and the median differ noticeably because of the outliers.

The median for season 1 is lower than for the other three seasons, whereas the medians for seasons 2, 3 and 4 look similar.

In [None]:
# Rental count per weather code, one box per weather category, on the current axes.
sns.boxplot(data=df, x='weather', y='count', ax=plt.gca())
plt.show()

In [None]:
# Rental count for each value of the workingday flag, on the current axes.
sns.boxplot(data=df, x='workingday', y='count', ax=plt.gca())
plt.show()

Visually, the median rental counts on working and non-working days look similar; this is tested formally below.

In [None]:
# Pairwise correlations between the numeric columns, shown as an annotated heatmap.
correlations = df.corr(numeric_only=True)

plt.figure(figsize=(15, 6))
sns.heatmap(correlations, annot=True, cmap='coolwarm')
plt.show()

# Effect of working day on the number of electric cycles rented

Null hypothesis, H0: the mean number of cycles rented on working days is equal to the mean on non-working days.

Alternative hypothesis, Ha: the mean number of cycles rented on working days is not equal to the mean on non-working days.

Significance level: 0.05

Mean of count of cycles on working and non-working days:

In [None]:
# Mean of `count` over the rows of `df` for each value of the workingday flag.
df.groupby('workingday')['count'].mean()

Standard deviation of count of bicycles on working and non-working days:

In [None]:
# Standard deviation of `count` over the rows of `df` for each workingday value.
df.groupby('workingday')['count'].std()

Number of working and non-working days:

In [None]:
# One row per (date, workingday) pair. The value is the number of distinct
# workingday flags inside that pair, so it serves as a per-day marker of 1.
x = pd.DataFrame(df_copy.groupby(['date', 'workingday'])['workingday'].nunique())

In [None]:
# Rename the value column so it no longer shares its name with the
# `workingday` index level.
x.rename(columns={'workingday': 'count'}, inplace=True)

In [None]:
# Displays a flattened view only: the result is not assigned, so `x` keeps its
# (date, workingday) MultiIndex for the cells that follow.
x.reset_index()

In [None]:
# Sum the per-day markers by the `workingday` index level to get the number of
# distinct dates for each workingday value.
x.groupby(level='workingday')['count'].sum()

### Conducting a two-sample t-test

In [None]:
# Two-sample t-test computed from summary statistics. `ttest` is SciPy's
# ttest_ind_from_stats, whose arguments are (mean, std, n) for group 1
# followed by (mean, std, n) for group 2.
ALPHA = 0.05  # significance level stated for this test

# Summary statistics hard-coded by the original analysis.
# NOTE(review): presumably group 1 is non-working days and group 2 is working
# days, with mean/std copied from the groupby cells above and n from the day
# counts — confirm, and confirm that combining row-level mean/std with
# day-level n is intended.
mean_1, std_1, n_1 = 188.5, 173.72, 145
mean_2, std_2, n_2 = 193.012, 184.5, 311

t_stat, p_value = ttest(mean_1, std_1, n_1, mean_2, std_2, n_2)

# Ha is two-sided ("not equal"), so the rejection region puts alpha/2 in each
# tail and uses the pooled-variance degrees of freedom n1 + n2 - 2.
critical_value = scipy.stats.t.ppf(1 - ALPHA / 2, df=n_1 + n_2 - 2)

print('Test statistic: ',t_stat)
print('P-value: ',p_value)
print('Critical value: ',critical_value)
print("\n")
# Compare the magnitude of the statistic: a large negative t is as much
# evidence against H0 as a large positive one.
if abs(t_stat) < critical_value:
    print("Fail to reject Null hypotheses")
else:
    print('Reject Null hypotheses')

#### Hence, we fail to reject the null hypothesis: there is no statistically significant evidence that working day affects the number of cycles rented

# Effect of seasons on the number of electric cycles rented

In [None]:
# Per-season summary statistics of `count`, one column per season code.
df.groupby('season')['count'].describe().transpose()

### Checking the assumptions of ANOVA

#### Checking the normality using KDE plot

In [None]:
# Per-season samples of the rental count (season codes 1-4). They are built
# here, ahead of their first use, so the assumption checks in this section run
# on a fresh kernel (Restart & Run All) instead of raising NameError.
s1, s2, s3, s4 = (
    df.loc[df['season'] == season, 'count'].values
    for season in (1, 2, 3, 4)
)

# KDE of the season-1 counts as a visual check of normality.
pd.Series(s1).plot(kind='kde')
plt.show()

In [None]:
# KDE of the season-2 counts as a visual check of normality.
pd.Series(s2).plot.kde()
plt.show()

In [None]:
# KDE of the season-3 counts as a visual check of normality.
pd.Series(s3).plot.kde()
plt.show()

In [None]:
# KDE of the season-4 counts as a visual check of normality.
pd.Series(s4).plot.kde()
plt.show()

#### Checking for normality using shapiro test

In [None]:
# Shapiro-Wilk normality test on each season's rental counts. Element [1] of
# the result is the p-value (the statistic itself is not used here).
p_val1, p_val2, p_val3, p_val4 = (
    shapiro(sample)[1] for sample in (s1, s2, s3, s4)
)
season_p_values = {1: p_val1, 2: p_val2, 3: p_val3, 4: p_val4}

# Label every line with its own season and its own p-value.
for season, season_p_val in season_p_values.items():
    print('P-value for season {}: '.format(season), season_p_val)

# Normality is rejected if ANY of the four seasons fails at the 0.05 level,
# including season 4.
if any(season_p_val < 0.05 for season_p_val in season_p_values.values()):
    print("Not normal")
else:
    print('Normal')

#### Checking for variances of samples using levene test

In [None]:
# Levene's test for equality of variances across the four season samples;
# a p-value below 0.05 is reported as unequal variances.
levene_result = levene(s1, s2, s3, s4)
test, p_val = levene_result

print('Test statistic: ', test)
print('P-value: ', p_val)

print('Variances are unequal' if p_val < 0.05 else 'Variances are equal')

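### Both ANOVA assumptions (normality and equal variances) are violated, so the ANOVA result below should be read with caution; a non-parametric test such as Kruskal–Wallis would be a more appropriate choice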

#### ANOVA Test

Null hypothesis, H0: the mean number of cycles rented is the same in all seasons.

Alternative hypothesis, Ha: the mean number of cycles rented differs for at least one season.

Significance level: 0.05

In [None]:
# Rental counts split by season code (1-4), as NumPy arrays for the SciPy tests.
s1, s2, s3, s4 = (
    df.loc[df['season'] == season, 'count'].values
    for season in (1, 2, 3, 4)
)

In [None]:
# One-way ANOVA across the four season samples. H0 is rejected when the
# p-value is below the 0.05 significance level.
anova_result = f_oneway(s1, s2, s3, s4)
t_stat, p_value = anova_result  # t_stat holds the F statistic

print('Test statistic: ', t_stat)
print('P-value: ', p_value)

if not p_value < 0.05:
    print('Fail to reject Null hypotheses')
else:
    print('Reject Null hypotheses')

#### Hence, we reject the null hypothesis: the mean number of cycles rented differs between at least two seasons

# Effect of weather on the number of electric cycles rented

In [None]:
# Per-weather summary statistics of `count`, one column per weather code.
df.groupby('weather')['count'].describe().transpose()

#### Verifying the assumptions for ANOVA

##### Checking the normality using KDE plot

In [None]:
# Per-weather samples of the rental count (weather codes 1-4). They are built
# here, ahead of their first use, so the assumption checks in this section run
# on a fresh kernel (Restart & Run All) instead of raising NameError.
w1, w2, w3, w4 = (
    df.loc[df['weather'] == code, 'count'].values
    for code in (1, 2, 3, 4)
)

# KDE of the weather-1 counts as a visual check of normality.
pd.Series(w1).plot(kind='kde')
plt.show()

In [None]:
# KDE of the weather-2 counts as a visual check of normality.
pd.Series(w2).plot.kde()
plt.show()

In [None]:
# KDE of the weather-3 counts as a visual check of normality.
pd.Series(w3).plot.kde()
plt.show()

#### Checking for normality using shapiro test

In [None]:
# Shapiro-Wilk normality test on the rental counts for weather codes 1-3.
# Element [1] of the result is the p-value (the statistic is not used here).
# NOTE(review): weather 4 is left out, as in the original; the hard-coded
# contingency table later in the notebook suggests it has a single
# observation, which is too few for this test — confirm.
# NOTE(review): SciPy warns that the p-value may be inaccurate for samples
# larger than 5000 observations — check the size of w1.
p_val1, p_val2, p_val3 = (shapiro(sample)[1] for sample in (w1, w2, w3))
weather_p_values = {1: p_val1, 2: p_val2, 3: p_val3}

for weather_code, weather_p_val in weather_p_values.items():
    print('P-value for weather {}: '.format(weather_code), weather_p_val)

# Normality is rejected if any tested weather group fails at the 0.05 level.
if any(weather_p_val < 0.05 for weather_p_val in weather_p_values.values()):
    print("Not normal")
else:
    print('Normal')

#### Checking for variances of samples using levene test

In [None]:
# Levene's test for equality of variances across the four weather samples;
# a p-value below 0.05 is reported as unequal variances.
# NOTE(review): w4 is included here although the normality check above only
# covers w1-w3 — confirm this is intended.
levene_result = levene(w1, w2, w3, w4)
test, p_val = levene_result

print('Test statistic: ', test)
print('P-value: ', p_val)

print('Variances are unequal' if p_val < 0.05 else 'Variances are equal')

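### Both ANOVA assumptions (normality and equal variances) are violated, so the ANOVA result below should be read with caution; a non-parametric test such as Kruskal–Wallis would be a more appropriate choice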

#### ANOVA Test

Null hypothesis, H0: the mean number of cycles rented is the same in all weather conditions.

Alternative hypothesis, Ha: the mean number of cycles rented differs for at least one weather condition.

Significance level: 0.05

In [None]:
# Rental counts split by weather code (1-4), as NumPy arrays for the SciPy tests.
w1, w2, w3, w4 = (
    df.loc[df['weather'] == code, 'count'].values
    for code in (1, 2, 3, 4)
)

In [None]:
# One-way ANOVA across the four weather samples. H0 is rejected when the
# p-value is below the 0.05 significance level.
anova_result = f_oneway(w1, w2, w3, w4)
t_stat, p_value = anova_result  # t_stat holds the F statistic

print('Test statistic: ', t_stat)
print('P-value: ', p_value)

if not p_value < 0.05:
    print('Fail to reject Null hypotheses')
else:
    print('Reject Null hypotheses')

#### Hence, we reject the null hypothesis: the mean number of cycles rented differs between at least two weather conditions

# Check the dependency of weather on season

Null hypothesis, H0: weather and season are independent.

Alternative hypothesis, Ha: weather and season are not independent (weather depends on season).

Significance level: 0.05

In [None]:
# Contingency table of row counts: weather codes as rows, season codes as columns.
pd.crosstab(index=df['weather'], columns=df['season'])

In [None]:
# Season x weather contingency table (one row per season, one column per
# weather code), computed from the data instead of being copied in by hand so
# it cannot drift from `df`. Kept as a list of lists, as before.
data = pd.crosstab(df['season'], df['weather']).values.tolist()

### Chi square test

In [None]:
# Chi-square test of independence on the season x weather contingency table.
# NOTE(review): the weather-4 column of `data` appears to hold a single
# observation, so some expected frequencies are well below 5 and the
# chi-square approximation may be unreliable — consider merging or dropping
# that category.
stat,p,dof,expected = chi2_contingency(data)

# Critical value at the 0.05 level, using the degrees of freedom reported by
# the test itself rather than a hard-coded number.
critical_value = scipy.stats.chi2.isf(q=0.05,df=dof)

print('Test statistic: ',stat)
print('P-value: ',p)
print('Degree of freedom: ',dof)
print('Critical value: ',critical_value)
print("\n")

# Compare the chi-square statistic from THIS test; `t_stat` still holds the
# statistic of an earlier cell and must not be used here.
if stat > critical_value:
    print('Reject Null hypotheses')
else:
    print('Fail to reject Null hypotheses')

#### Hence, we reject the null hypothesis: weather and season are not independent