In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df=pd.read_csv('/content/drive/MyDrive/week2_python_task/london_merged.csv')

In [4]:
df.head()

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0


"timestamp" - timestamp field for grouping the data

"cnt" - the count of a new bike shares

"t1" - real temperature in C

"t2" - temperature in C "feels like"

"hum" - humidity in percentage

"wind_speed" - wind speed in km/h

"weather_code" - category of the weather

"is_holiday" - boolean field - 1 holiday / 0 non holiday

"is_weekend" - boolean field - 1 if the day is weekend

"season" - category field meteorological seasons: 0-spring ; 1-summer; 2-fall; 3-winter.

"weathe_code" category description:

1 = Clear ; mostly clear but have some values with haze/fog/patches of fog/ fog in vicinity 2 = scattered clouds / few clouds 3 = Broken clouds 4 = Cloudy 7 = Rain/ light Rain shower/ Light rain 10 = rain with thunderstorm 26 = snowfall 94 = Freezing Fog




In [5]:
df.shape

(17414, 10)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17414 entries, 0 to 17413
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     17414 non-null  object 
 1   cnt           17414 non-null  int64  
 2   t1            17414 non-null  float64
 3   t2            17414 non-null  float64
 4   hum           17414 non-null  float64
 5   wind_speed    17414 non-null  float64
 6   weather_code  17414 non-null  float64
 7   is_holiday    17414 non-null  float64
 8   is_weekend    17414 non-null  float64
 9   season        17414 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 1.3+ MB


In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cnt,17414.0,1143.101642,1085.108068,0.0,257.0,844.0,1671.75,7860.0
t1,17414.0,12.468091,5.571818,-1.5,8.0,12.5,16.0,34.0
t2,17414.0,11.520836,6.615145,-6.0,6.0,12.5,16.0,34.0
hum,17414.0,72.324954,14.313186,20.5,63.0,74.5,83.0,100.0
wind_speed,17414.0,15.913063,7.89457,0.0,10.0,15.0,20.5,56.5
weather_code,17414.0,2.722752,2.341163,1.0,1.0,2.0,3.0,26.0
is_holiday,17414.0,0.022051,0.146854,0.0,0.0,0.0,0.0,1.0
is_weekend,17414.0,0.285403,0.451619,0.0,0.0,0.0,1.0,1.0
season,17414.0,1.492075,1.118911,0.0,0.0,1.0,2.0,3.0


In [8]:
df.isnull().sum()

timestamp       0
cnt             0
t1              0
t2              0
hum             0
wind_speed      0
weather_code    0
is_holiday      0
is_weekend      0
season          0
dtype: int64

In [10]:
df.duplicated().sum()

0

In [None]:
# df["is_holiday"] = df["is_holiday"].fillna(df["is_holiday"].mean())  #using mean replace null values
# df["is_holiday"] = df["is_holiday"].fillna(df["is_holiday"].median())  #using median replace null values
# df["is_holiday"] = df["is_holiday"].fillna(df["is_holiday"].std())  #using std replace null values
# df["is_holiday"] = df["is_holiday"].fillna(df["is_holiday"].mean())  #droping coulumns with null values

In [None]:
plt.figure(figsize=(18, 6))
sns.heatmap(df.isnull(), cmap='magma')

plt.title('Heatmap of Missing Values')
plt.xlabel('Features')
plt.ylabel('Data Index')

In [None]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df= df.set_index('timestamp')
df['year_month']= df.index.strftime('%Y-%m')
df['year'] = df.index.year
df['month']= df.index.month
df['day_of_week']=df.index.dayofweek
df['hour']=df.index.hour
df.head()

In [None]:
import plotly.express as px

In [None]:
fig = px.bar(x= df['season'].value_counts().index, y=df['season'].value_counts().values, title='Seasons', labels={'y':'Count', 'x':'Seasons'})
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [None]:
fig = px.pie(df, values=df['weather_code'].value_counts().values, names= ['Clear', 'scattered clouds', 'Broken clouds', 'Cloudy' 'Rain', 'rain with thunderstorm', 'snowfall', 'Freezing Fog'])
fig.show()

In [None]:
fig = px.scatter(df, x="year", y="cnt")
fig.show()

In [None]:
fig = px.scatter(df, x="year_month", y="cnt")
fig.show()

In [None]:
holiday = df.groupby('is_holiday')['cnt'].mean().reset_index().rename(columns={'is_holiday': 'Holiday', 'cnt':'Number of Bike Shared'}, )
holiday['Holiday']= holiday['Holiday'].replace({0: 'Normal Day', 1:'Holiday'})

fig = px.bar(holiday, x='Holiday', y= 'Number of Bike Shared', color='Holiday', )
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [None]:
weekend = df.groupby('is_weekend')['cnt'].mean().reset_index().rename(columns={'is_weekend': 'Weekend', 'cnt':'Number of Bike Shared'}, )
weekend['Weekend']= weekend['Weekend'].replace({0: 'Weekday', 1:'Weekend'})

fig = px.bar(weekend, x='Weekend', y= 'Number of Bike Shared', color='Weekend', )
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [None]:
fig = px.scatter(df, x="hour", y="cnt", color='is_holiday')
fig.show()

In [None]:
fig = px.scatter(df, x="day_of_week", y="cnt", color='is_weekend', hover_data = df[['hour']])
fig.show()

cnt is maximum on the 4th day of the week




In [None]:
df.columns

In [None]:
sns.pairplot(df)
plt.show()

when t1 increases t2 also increases

t1,t2,hum,wind_speed mostly the data is balenced

In [None]:
plt.figure(figsize=(30,12))
sns.heatmap(df.corr(), annot=True)

The target variable cnt depends on,

1) hour

2) month

3) t1, t2

In [None]:
X = df.drop(columns = ["cnt","year_month"], axis=1)
Y = df["cnt"]

In [None]:
X.info()

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model
rf_model.fit(X, Y)

# Get importances features
feature_importances = rf_model.feature_importances_

# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the sorted DataFrame
print(feature_importance_df)

In [None]:
X.drop(columns=["month","season","year"])

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.20,random_state = 42)

In [None]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

y_train = sc.fit_transform(Y_train.values.reshape(-1, 1))
y_test = sc.fit_transform(Y_test.values.reshape(-1, 1))

In [None]:
import matplotlib.pyplot as plt

In [None]:
import numpy as np

class KNNRegressor:
    def __init__(self, K=3):
        self.K = K
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return np.array(predictions)

    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [np.sqrt(np.sum((x_train - x) ** 2)) for x_train in self.X_train]
        # Sort by distance and return indices of the first K neighbors
        k_indices = np.argsort(distances)[:self.K]
        # Extract the labels of the K nearest neighbor training samples
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        # Return the mean of K nearest labels
        return np.mean(k_nearest_labels)

In [None]:
model = KNNRegressor(K=3)

In [None]:
model.fit(X_train, y_train)

In [None]:
predictions = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import math

mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
mape = mean_absolute_percentage_error(y_test,predictions)
rmse = math.sqrt(mse)
r2 = r2_score(y_test, predictions)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean absolute percentage Error (MSPE): {mape}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared: {r2}")

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
knn_regressor = KNeighborsRegressor(n_neighbors=5)

# Train the model
knn_regressor.fit(X_train, y_train)
y_pred = knn_regressor.predict(X_test)

In [None]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test,y_pred)
rmse = math.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean absolute percentage Error (MSPE): {mape}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared: {r2}")

---------------------------------------Summary---------------------------------

Data set Name - London bike sharing dataset

Data set size - 17414

Data set shape - (17414, 10)

1) Data Cleaning

-> Droping Duplicates  

-> Checking Null Values

-> Filling null values using mode, mean, median, std

2) Data Visualization

  -> cnt is maximum on the 4th day of the week
  -> The target variable cnt depends on,

  1) hour

  2) month

  3) t1, t2
  -> cnt is maximum on the 4th day of the week

 3) Model Building
  
  Model - KNN Regressor

 4) Model BenchMerking
  
  Our model:

Mean Absolute Error (MAE): 0.3922950506911678
s
Mean Squared Error (MSE): 0.4248061861316331

Mean absolute percentage Error (MSPE): 1.4419354770252029

Root Mean Squared Error (RMSE): 0.651771575117873

R-squared: 0.575193813868367

  For Sklearn model:
  
Mean Absolute Error (MAE): 0.40987694252447576

Mean Squared Error (MSE): 0.4264285673125478

Mean absolute percentage Error (MSPE): 1.406702229296142

Root Mean Squared Error (RMSE): 0.6530149824564118

R-squared: 0.5735714326874524
