# Assignment 2 Solutions
This notebook contains solutions to the lab questions using the Bike.csv dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler

# Read the bike CSV and give the `cnt` column a more descriptive name in one chained step
df_bike = pd.read_csv("Bike.csv").rename(columns={'cnt': 'cnt_rental_bike'})

# Preview the first rows of the renamed frame
df_bike.head()

## Q1: First rows, shape, dtypes

In [None]:
# Return, as one tuple: the first five rows, the (rows, columns) shape, and each column's dtype
(
    df_bike.head(n=5),
    df_bike.shape,
    df_bike.dtypes,
)

## Q2: Missing values

In [None]:
# Number of missing entries in each column
df_bike.isnull().sum()

## Q3: Duplicate rows

In [None]:
# Count the rows that exactly repeat an earlier row (the first occurrence is not counted)
df_bike.duplicated(keep='first').sum()

## Q4: Create datetime column

In [None]:
# Build a timestamp per row: the parsed date in `dteday` plus `hr` interpreted as an offset in hours
df_bike['datetime'] = pd.to_datetime(df_bike['dteday']).add(
    pd.to_timedelta(df_bike['hr'], unit='h')
)

# Show the two source columns next to the combined result
df_bike.loc[:, ['dteday', 'hr', 'datetime']].head()

## Q5: Basic statistics

In [None]:
# Summary statistics table, with the default quartile percentiles spelled out
df_bike.describe(percentiles=[0.25, 0.5, 0.75])

## Q6: Rental stats

In [None]:
# Total, average, smallest, and largest value of the rental-count column
df_bike['cnt_rental_bike'].aggregate(['sum', 'mean', 'min', 'max'])

## Q7: Rentals by season

In [None]:
# Mean of cnt_rental_bike for each season value, drawn as a bar chart.
# An explicit figure/axes pair is used so the labels and title are attached to
# this chart rather than to whatever figure pyplot currently considers active.
fig, ax = plt.subplots()
df_bike.groupby('season')['cnt_rental_bike'].mean().plot(kind='bar', ax=ax)
ax.set_xlabel('Season')
ax.set_ylabel('Average rentals')
ax.set_title('Average rentals by season')  # lets the figure stand alone when skimmed
plt.show()

## Q8: Casual vs Registered by hour

In [None]:
# Mean casual and registered counts for each value of `hr`, one line per user type.
# An explicit figure/axes pair is used so the labels and title are attached to
# this chart rather than to whatever figure pyplot currently considers active.
fig, ax = plt.subplots()
df_bike.groupby('hr')[['casual', 'registered']].mean().plot(ax=ax)
ax.set_xlabel('Hour of day')
ax.set_ylabel('Average count')
ax.set_title('Average casual vs registered rentals by hour')  # lets the figure stand alone
plt.show()

## Q9: Unique weathersit values

In [None]:
# Show the distinct weathersit values. print() is required here because a notebook
# only auto-displays a cell's last expression, and this line is not the last one.
print(df_bike['weathersit'].unique())

# Human-readable label for each numeric weather code
weather = {1:'clear',2:'cloudy',3:'light rain',4:'heavy rain'}

# Relabel only while the column still holds numeric codes. Re-running an unguarded
# .map() would look up the string labels in this integer-keyed dict and replace
# every value with NaN, so the guard keeps the cell safe to execute more than once.
if pd.api.types.is_numeric_dtype(df_bike['weathersit']):
    df_bike['weathersit'] = df_bike['weathersit'].map(weather)

df_bike['weathersit'].head()

## Q10: Datatype of weathersit

In [None]:
# Look up the dtype of the weathersit column in the frame's per-column dtype table
df_bike.dtypes['weathersit']

## Q11: Rentals vs weather

In [1]:
# Mean of cnt_rental_bike for each weathersit value, drawn as a bar chart.
avg_by_weather = df_bike.groupby('weathersit')['cnt_rental_bike'].mean()

# groupby sorts string labels alphabetically ('heavy rain' before 'light rain').
# Reorder from mildest to harshest using whichever of the known labels are present;
# if none are present (e.g. the column still holds numeric codes) keep groupby's order.
severity_order = [
    label
    for label in ('clear', 'cloudy', 'light rain', 'heavy rain')
    if label in avg_by_weather.index
]
if severity_order:
    avg_by_weather = avg_by_weather.loc[severity_order]

fig, ax = plt.subplots()
avg_by_weather.plot(kind='bar', ax=ax)
ax.set_xlabel('Weather situation')
ax.set_ylabel('Average rentals')
ax.set_title('Average rentals by weather situation')  # lets the figure stand alone
plt.show()

NameError: name 'df_bike' is not defined

## Q12: OneHotEncoder

In [None]:
# Fit a dense-output one-hot encoder on the weathersit column ...
ohe = OneHotEncoder(sparse_output=False).fit(df_bike[['weathersit']])

# ... then encode the same column and preview the first five encoded rows
ohe.transform(df_bike[['weathersit']])[0:5]

## Q13: One-hot pros and cons
- Works well for nominal (unordered) categories
- Adds one column per category, so the number of dimensions grows
- Discards the order information of ordinal categories

## Q14: Drop one column to avoid dependence

In [None]:
# Same dense one-hot encoding, but with drop='first' so one category column is left out
ohe = OneHotEncoder(drop='first', sparse_output=False).fit(df_bike[['weathersit']])

# Encode the column and preview the first five rows of the reduced encoding
ohe.transform(df_bike[['weathersit']])[0:5]

## Q15: Ordinal encoding example

In [None]:
# Ordinal-encode season: name each numeric code, then encode with an explicit category order.
# One dict feeds both the relabelling and the category order, so the two cannot drift apart.
# 'spring' replaces the misspelled 'springer' label; the encoded numbers are unchanged.
season_names = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
enc = OrdinalEncoder(categories=[list(season_names.values())])
enc.fit_transform(df_bike[['season']].replace(season_names))[:5]

## Q16: Normalize feature

In [None]:
# Fit a min-max scaler (default settings) on the temp column
mm = MinMaxScaler().fit(df_bike[['temp']])

# Store the rescaled values in a new column and compare them with the originals
df_bike['temp_norm'] = mm.transform(df_bike[['temp']])
df_bike.loc[:, ['temp', 'temp_norm']].head()

## Q17: Standardize feature

In [None]:
# Fit a standard scaler (default settings) on the temp column
ss = StandardScaler().fit(df_bike[['temp']])

# Store the standardized values in a new column and compare them with the originals
df_bike['temp_std'] = ss.transform(df_bike[['temp']])
df_bike.loc[:, ['temp', 'temp_std']].head()

## Q18: Normalize vs Standardize
- Normalize (min-max scale) features with known bounds, such as humidity or temperature
- Standardize features that are roughly Gaussian, or when the model assumes zero mean and unit variance

# Conceptual Questions

**Q1:** Nominal  
**Q2:** Nominal no order (blood type), Ordinal ordered (education)  
**Q3:** True  
**Q4:** Each row sums to 1, so summing over all rows gives the number of rows  
**Q5:** To avoid multicollinearity (dummy variable trap)  
**Q6:** Ordinal encoding for ranks  
**Q7:** High cardinality categories (zip codes)  
**Q8:** Standardization  
**Q9:** False  
**Q10:** Normalize {10,22,27,53} with (x-10)/(53-10): [0, 0.279, 0.395, 1]  
**Q11:** Z-score (75-50)/10=2.5  
**Q12:** To make features comparable. Scale-sensitive models include kNN, SVM, PCA, and regression  
