## Step 1 : Import libraries & the dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler

In [None]:
# Load the Bikeshare CSV, then give the target column 'cnt' the more
# descriptive name 'cnt_rental_bike'.
df_bike = pd.read_csv("C:\\Users\\Darya\\Desktop\\№7 Lecture support materials, lectures, seminars and labs\\Seminars\\Machine learning\\Seminar 2\\Bike.csv")
df_bike = df_bike.rename(columns={'cnt': 'cnt_rental_bike'})

# Preview 7 randomly chosen rows of the dataframe.
df_bike.sample(n=7)

## Step 2 : for the column 'weathersit', replace the values such as :  
1: 'clear'\
2: 'cloudy'\
3: 'light_rain'\
4: 'heavy_rain'

In [None]:
# Text label for each numeric weather code.
mapping = {
    1: 'clear',
    2: 'cloudy',
    3: 'light_rain',
    4: 'heavy_rain',
}
# Series.map turns every value that is not a key of `mapping` into NaN, so
# running this cell a second time (on the already-mapped labels) wipes the column.
df_bike['weathersit'] = df_bike['weathersit'].map(mapping)
df_bike

## Step 3 : Using pandas only, explore your dataset

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df_bike.info()
print()
# Summary statistics of the numeric columns.
print(df_bike.describe())
print()
# Number of rows per weather label.
print(df_bike['weathersit'].value_counts())

In [None]:
# Number of rows that exactly repeat an earlier row (all columns compared).
df_bike.duplicated().sum()

In [None]:
def draw_box_plots(df, n_cols=3):
    """Draw one box plot per numeric column of ``df`` on a grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. Only the columns returned by
        ``df.select_dtypes(include=['number'])`` are drawn.
    n_cols : int, default 3
        Number of subplots per grid row.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. When ``df`` has no
        numeric column nothing is drawn and the function returns immediately.

    Raises
    ------
    ValueError
        If ``n_cols`` is smaller than 1.
    """
    if n_cols < 1:
        raise ValueError("n_cols must be at least 1")

    numerical_cols = df.select_dtypes(include=['number']).columns
    n = len(numerical_cols)
    if n == 0:
        # Nothing to plot: a grid with zero rows cannot be created.
        return

    n_rows = int(np.ceil(n / n_cols))

    # squeeze=False always returns a 2-D array of axes, whatever the grid
    # shape, so flatten() is safe even for a single subplot.
    _, axes = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False
    )
    axes = axes.flatten()

    for ax, column in zip(axes, numerical_cols):
        sns.boxplot(data=df[column], ax=ax)
        ax.set_title(f"Boxplot for {column}")

    # Hide the grid cells that did not receive a plot.
    for ax in axes[n:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()


# Box plots of every numeric column of df_bike.
draw_box_plots(df_bike)

# As we can see: columns casual, registered, cnt_rental_bike have outliers.

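## To bring these columns onto a common scale, we will use min-max scaling (note: rescaling changes the range of the values but does not remove the outliers)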

In [None]:
# Columns whose box plots showed outliers.
columns_to_scale = ['casual', 'registered', 'cnt_rental_bike']

# Scale a copy so that df_bike keeps its original values.
scaled_df = df_bike.copy()

# Learn each column's min and max, then map the values into [0, 1].
scaler = MinMaxScaler()
scaler.fit(scaled_df[columns_to_scale])
scaled_df[columns_to_scale] = scaler.transform(scaled_df[columns_to_scale])

draw_box_plots(scaled_df)

# or we may use a standard scaler

In [None]:
# Columns whose box plots showed outliers.
columns_to_scale = ['casual', 'registered', 'cnt_rental_bike']

# Scale a copy so that df_bike keeps its original values.
scaled_df = df_bike.copy()

# Learn each column's mean and standard deviation, then standardize the values.
scaler = StandardScaler()
scaler.fit(scaled_df[columns_to_scale])
scaled_df[columns_to_scale] = scaler.transform(scaled_df[columns_to_scale])

draw_box_plots(scaled_df)

# Differences between scaling methods:

Min max:
1) Min-max scaling maps the values into a specified range, usually [0, 1].
2) This technique is useful when you want to transform the data into a bounded range, typically [0, 1]. It is often used when the model expects data in a specific range.
3) Sensitive to outliers: a single extreme value defines the minimum or maximum and squeezes all other values into a narrow band.
4) Makes no assumptions about the distribution of the data.

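Standard:
1) Standardization rescales the data to zero mean and unit variance; it does not bound the values to a fixed range.
2) Typically used when the data roughly follows a Gaussian (normal) distribution.
3) Less sensitive to outliers than min-max scaling, although the mean and standard deviation are still affected by them.
4) Works best when the distribution is approximately Gaussian.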

## Or we may replace the outliers with the median

In [None]:
def insert_median_values(df, column_name):
    """Return the median of the column ``column_name`` of ``df``.

    Despite its name the function inserts nothing: it only computes the
    value that the caller uses as a replacement.
    """
    return df[column_name].median()

def define_big_value_not_normal(df, column, whisker=1.0):
    """Return the IQR-based outlier thresholds of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    column : str
        Name of the numeric column to analyse.
    whisker : float, default 1.0
        Multiple of the interquartile range (IQR = Q3 - Q1) added above Q3
        and subtracted below Q1. The default keeps the 1 x IQR rule this
        notebook has always used; pass ``1.5`` for the conventional Tukey
        fences.

    Returns
    -------
    tuple
        ``(smallest_threshold, biggest_threshold)``: values below the first
        or above the second are treated as outliers by the caller.
    """
    # Each quartile is computed once and reused for both thresholds.
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    spread = whisker * (q3 - q1)
    return q1 - spread, q3 + spread

# Columns whose box plots showed outliers.
problem_columns = ['casual', 'registered', 'cnt_rental_bike']

for column in problem_columns:
    small_val, big_val = define_big_value_not_normal(df_bike, column)
    # The replacement value is computed once per column, before any value of
    # that column is overwritten.
    median_value = insert_median_values(df_bike, column)

    is_low = df_bike[column] < small_val
    is_high = df_bike[column] > big_val
    print(f'In columns {column} detected {is_low.sum()} low-outliers и {is_high.sum()} high-outliers')

    # Replace every value outside [small_val, big_val] with the column median;
    # all other values (including NaN) are left untouched.
    df_bike[column] = df_bike[column].mask(is_low | is_high, median_value)

In [None]:
# Box plots of df_bike again, to inspect the effect of the cell above.
draw_box_plots(df_bike)

## Step 4 : The dataset has several columns related to date/time:

Create a new column `datetime` which will store information about both date and time.

In [None]:
# Parse the calendar date, then shift it by the value of 'hr' (interpreted as
# hours) to obtain one timestamp per row.
df_bike['dteday'] = pd.to_datetime(df_bike['dteday'])
hour_offset = pd.to_timedelta(df_bike['hr'], unit='h')
df_bike['datetime'] = df_bike['dteday'] + hour_offset
df_bike['datetime']

## Step 5 : Visualize the seasonality of bike rentals

In [None]:
# Parse 'dteday' here as well so that the cell also works when run on its own.
df_bike['dteday'] = pd.to_datetime(df_bike['dteday'])

# Rental count against the date.
sns.lineplot(data=df_bike, x='dteday', y='cnt_rental_bike')
plt.ylabel('Bike Rental Count')
plt.xlabel('Date')
plt.title('Bike Rentals Over Time')

## Step 6 : What's the datatype of 'weathersit'?

In [None]:
# Print the pandas dtype of the 'weathersit' column.
print(df_bike['weathersit'].dtype)

## Step 7 : Count the values of weathersit and plot its distribution

In [None]:
# Number of rows for each distinct value of 'weathersit'.
counter = df_bike['weathersit'].value_counts()
print(counter)
# Plot the distribution of the same column.
sns.histplot(df_bike['weathersit'])

## Step 8 : What is the number of categories $K$ in this column?

In [None]:
# Distinct values of the column; their number is K.
counter = df_bike['weathersit'].unique()
len(counter)

## Step 9 : Implement your own One-Hot-Encoding algorithm. Encode weathersit column

In [None]:
# Categories in order of first appearance.
values = df_bike['weathersit'].unique()

# Copy of df_bike with one 0/1 indicator column per category, named after it.
encoded_df = df_bike.assign(
    **{val: (df_bike['weathersit'] == val).astype(int) for val in values}
)

# Keep only the indicator columns, as floats, with a 'weathersit_' prefix.
only_encoded = encoded_df[list(values)].astype(float).add_prefix('weathersit_')
only_encoded

## Step 10 : Use scikit-learn OHE encoder for the same column

In [None]:
# Dense output (sparse_output=False) so the result fits directly in a DataFrame.
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(df_bike[['weathersit']])

# Name the columns with the feature names generated by the encoder.
encoded_df = pd.DataFrame(
    data=encoded,
    columns=encoder.get_feature_names_out(['weathersit']),
)
encoded_df

# copy_df = df_bike.copy()
# copy_df = pd.concat([copy_df, encoded_df], axis=1)
# 
# copy_df

## Step 11 : Compare your encoded columns with the SKLearn ones

In [None]:
# Number of rows where the sklearn and the hand-made 'clear' indicators differ.
(encoded_df['weathersit_clear'] != only_encoded['weathersit_clear']).sum()

In [None]:
# Number of rows where the sklearn and the hand-made 'cloudy' indicators differ.
(encoded_df['weathersit_cloudy'] != only_encoded['weathersit_cloudy']).sum()

In [None]:
# Number of rows where the sklearn and the hand-made 'heavy_rain' indicators differ.
(encoded_df['weathersit_heavy_rain'] != only_encoded['weathersit_heavy_rain']).sum()

In [None]:
# Number of rows where the sklearn and the hand-made 'light_rain' indicators differ.
(encoded_df['weathersit_light_rain'] != only_encoded['weathersit_light_rain']).sum()

## Step 12 : Modify your algorithm to drop one column while encoding

In [None]:
# Categories in order of first appearance.
values = df_bike['weathersit'].unique()

# Copy of df_bike with one 0/1 indicator column per category, named after it.
encoded_df = df_bike.assign(
    **{val: (df_bike['weathersit'] == val).astype(int) for val in values}
)

# Drop the indicator of the last category: it is implied by the other columns
# all being 0.
only_encoded = encoded_df[list(values[:-1])].astype(float).add_prefix('weathersit_')
only_encoded

## Step 13 : What are the advantages and disadvantages of such an encoding of a categorical variable? Does the answer depend on whether it is nominal or ordinal?

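Advantages:
1) No dependency: no artificial order or distance is imposed between the categories
2) No bias: every category is treated equally
3) Handling Non-Numeric Data
4) Simplicity and Interpretability

Disadvantages:
1) Loss of Information for Ordinal Data: the order (low < medium < high) is lost, every level becomes just 0 or 1
2) Increased Training Time (more features, one per category)

So the answer does depend on the type: for nominal variables one-hot encoding is a natural choice, while for ordinal variables it discards the ordering.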

## Step 14 : What can be said about linear dependence of the columns produced by one-hot-encoding? Consider two cases: with and without dropping.

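Without Dropping a Category Column: 
1) The sum of the one-hot encoded columns will always be 1 for every row: A+B+C=1
2) The columns are therefore linearly dependent: any one of them equals 1 minus the sum of the others (together with an intercept term this is the "dummy variable trap").

With Dropping a Category Column:
1) The remaining columns are linearly independent. Their row sum is no longer constant: it is 1 for the kept categories and 0 for the dropped one, so no column can be expressed through the others.
2) We eliminate the redundancy; the dropped category is represented by all zeros.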


## Step 15 : Repeat the steps 7 to 9, for label encoding

In [None]:
category_counts = only_encoded.apply(pd.Series.value_counts)
print(category_counts)
sns.histplot(only_encoded)

In [None]:
category_counts.index.size