# Data Preprocessing - Data Encoding & Feature Scaling
v2.0 - This notebook preprocesses the wholesale/retail price dataset: it one-hot encodes the categorical `location` column and standardizes the features (X) and targets (y).

### Import the required libraries

In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Load the dataset

In [9]:
# Path to the wholesale/retail price CSV, relative to the notebook's working directory.
data_file = "./Datasets/final_wholesale_retail_dataset_v0.0.csv"

# Read the CSV into a DataFrame using pandas' default parsing options.
dataset = pd.read_csv(filepath_or_buffer=data_file)

### Display the first few rows of the dataset to get an overview

In [10]:
# Show the first five rows as plain text for a quick look at the columns.
print(dataset.head(n=5))

   year  month     location  wholesale_price  retail_price
0  2017      1  Keppetipola            45.42          98.0
1  2017      1   Kurunegala            61.00         103.0
2  2017      1       Ampara            73.91         124.8
3  2017      1       Matale            68.94         116.4
4  2017      1     Vavuniya            46.19          78.0


### Separate independent (X) and dependent (y) variables

In [11]:
# Split by column position: everything except the last two columns is a
# feature, the last two columns are the targets.
# .copy() makes X and y independent frames rather than slices of `dataset`,
# so later in-place edits to X or y never act on a slice of the raw frame
# (which would raise SettingWithCopyWarning).
X = dataset.iloc[:, :-2].copy()  # Select all columns except the last two
y = dataset.iloc[:, -2:].copy()  # Select the last two columns

### Display the first few rows of X and y

In [12]:
# Preview the first rows of the feature frame and of the target frame,
# each printed under its own caption line.
print("Independent variables (X):", X.head(), sep="\n")
print("\nDependent variables (y):", y.head(), sep="\n")

Independent variables (X):
   year  month     location
0  2017      1  Keppetipola
1  2017      1   Kurunegala
2  2017      1       Ampara
3  2017      1       Matale
4  2017      1     Vavuniya

Dependent variables (y):
   wholesale_price  retail_price
0            45.42          98.0
1            61.00         103.0
2            73.91         124.8
3            68.94         116.4
4            46.19          78.0


## One-Hot Encoding for the 'location' column

In [13]:
# One-Hot Encoding for the 'location' column
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X[['location']])

# Name each indicator column after its city. The names are taken from the
# fitted categories instead of splitting 'location_<city>' on '_', so a city
# name that itself contains an underscore is kept whole.
# Reusing X's index keeps the rows aligned when the two frames are
# concatenated column-wise below.
X_encoded_df = pd.DataFrame(
    X_encoded,
    columns=[str(city) for city in encoder.categories_[0]],
    index=X.index,
)

# Replace the original 'location' column with its indicator columns.
# drop() without inplace=True returns a new frame, so nothing is mutated
# through a slice of the raw dataset.
X = pd.concat([X.drop(columns=['location']), X_encoded_df], axis=1)

## Standardization

In [14]:
# Fit one scaler per frame. A single shared scaler would be refitted on y and
# lose the mean/scale learned from X, making it impossible to transform new
# feature rows (or invert X) later.
# NOTE(review): this also standardizes the one-hot location columns along
# with year and month -- confirm that is intended for the downstream model.
x_scaler = StandardScaler()
y_scaler = StandardScaler()

# Rebuild DataFrames from the scaled arrays, keeping column names and index.
X = pd.DataFrame(x_scaler.fit_transform(X), columns=X.columns, index=X.index)
y = pd.DataFrame(y_scaler.fit_transform(y), columns=y.columns, index=y.index)

# Backward-compatible alias: code that refers to `scaler` after this cell
# expects the scaler that was fitted on y.
scaler = y_scaler

## Display the first few rows of the updated X and y

In [18]:
# Preview the feature frame after one-hot encoding and standardization.
print(
    "Updated Independent variables (X) after encoding and standardization:",
    X.head(),
    sep="\n",
)

Updated Independent variables (X) after encoding and standardization:
       year     month    Ampara  Anuradapuraya   Badulla  Bandarawela  \
0 -1.451288 -1.597845 -0.166699      -0.166699 -0.166699    -0.166699   
1 -1.451288 -1.597845 -0.166699      -0.166699 -0.166699    -0.166699   
2 -1.451288 -1.597845  5.998826      -0.166699 -0.166699    -0.166699   
3 -1.451288 -1.597845 -0.166699      -0.166699 -0.166699    -0.166699   
4 -1.451288 -1.597845 -0.166699      -0.166699 -0.166699    -0.166699   

   Batticaloa   Colombo   Dabulla  Dehiattakandiya  ...  Nikaweratiya  \
0   -0.166699 -0.166699 -0.166699        -0.166699  ...     -0.166699   
1   -0.166699 -0.166699 -0.166699        -0.166699  ...     -0.166699   
2   -0.166699 -0.166699 -0.166699        -0.166699  ...     -0.166699   
3   -0.166699 -0.166699 -0.166699        -0.166699  ...     -0.166699   
4   -0.166699 -0.166699 -0.166699        -0.166699  ...     -0.166699   

   Nuwara Eliya  Polonnaruwa  Puttalam  Rathnapura  

In [20]:
# Preview the target frame after standardization.
print("Updated Dependent variables (y) after standardization:", y.head(), sep="\n")

Updated Dependent variables (y) after standardization:
   wholesale_price  retail_price
0        -0.404442     -0.121642
1         0.193791      0.018326
2         0.689502      0.628584
3         0.498667      0.393439
4        -0.374876     -0.681512


## Testing inverse transformation for standardization

In [23]:
# Inverse transform to get back to the original scale.
# This is only valid while `scaler` holds the parameters fitted on y: the
# mean/scale used here must be the ones learned from the target columns,
# not the ones learned from X.
y_original = scaler.inverse_transform(y)

----------------