# Create a Scaler for the Independent Variables (X)

In [1]:
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Location of the wholesale/retail price CSV, relative to the notebook's working directory.
data_file = "./RFModelDev/Datasets/final_wholesale_retail_dataset_v0.0.csv"

# Read the whole CSV into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer=data_file)

# Preview the first five rows as a quick sanity check of the columns and values.
print(dataset.head(n=5))

   year  month     location  wholesale_price  retail_price
0  2017      1  Keppetipola            45.42          98.0
1  2017      1   Kurunegala            61.00         103.0
2  2017      1       Ampara            73.91         124.8
3  2017      1       Matale            68.94         116.4
4  2017      1     Vavuniya            46.19          78.0


In [3]:
# Collect the distinct values of the 'location' column, in order of first
# appearance, so they can be compared with the encoder's categories later on.
unique_locations_before_encoding = dataset.loc[:, 'location'].unique()

# Print a heading followed by the array of distinct locations.
print("Unique Locations Before Encoding:", unique_locations_before_encoding, sep="\n")

Unique Locations Before Encoding:
['Keppetipola' 'Kurunegala' 'Ampara' 'Matale' 'Vavuniya' 'Mannar' 'Trinco'
 'Matara' 'Bandarawela' 'Dehiattakandiya' 'Nikaweratiya' 'Embilipitiya'
 'Dabulla' 'Hanguranketha' 'Galle' 'Thabuththegama' 'Thissamaharama'
 'Kandy' 'Polonnaruwa' 'Galenbidunuwewa' 'Kilinochchi' 'Kegalle'
 'Nuwara Eliya' 'Rathnapura' 'Mullathivu' 'Jaffna' 'Puttalam' 'Gampaha'
 'Kaluthara' 'Batticaloa' 'Meegoda' 'Badulla' 'Hambanthota' 'Monaragala'
 'Colombo' 'Veyangoda' 'Anuradapuraya']


In [4]:
# Split the dataset by column position: everything except the last two columns
# forms the independent variables (X); the last two columns are the dependent
# variables (y).
#
# Explicit copies are taken so that X and y own their data. A bare iloc slice
# still refers back to `dataset`, and modifying such a slice in place makes
# pandas emit SettingWithCopyWarning.
X = dataset.iloc[:, :-2].copy()  # all columns except the last two
y = dataset.iloc[:, -2:].copy()  # the last two columns

# Display the first few rows of X and y
print("\nIndependent variables (X):")
print(X.head())

print("\nDependent variables (y):")
print(y.head())


Independent variables (X):
   year  month     location
0  2017      1  Keppetipola
1  2017      1   Kurunegala
2  2017      1       Ampara
3  2017      1       Matale
4  2017      1     Vavuniya

Dependent variables (y):
   wholesale_price  retail_price
0            45.42          98.0
1            61.00         103.0
2            73.91         124.8
3            68.94         116.4
4            46.19          78.0


In [5]:
# One-hot encode the categorical 'location' column. sparse_output=False makes the
# encoder return a dense NumPy array that can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X[['location']])

# Name each indicator column after its category (the plain city name), taken
# straight from the fitted encoder. Unlike splitting the generated
# 'location_<city>' names on '_', this cannot truncate a city name that itself
# contains an underscore. Reusing X's index keeps the rows aligned in the concat
# below even when X does not carry a default 0..n-1 index.
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.categories_[0], index=X.index)

# Swap the original 'location' column for its indicator columns. drop() returns a
# new frame rather than mutating X in place; X originates from a slice of
# `dataset`, and in-place changes to such a slice trigger SettingWithCopyWarning.
X = pd.concat([X.drop(columns=['location']), X_encoded_df], axis=1)

In [6]:
# Show the indicator column labels produced by the encoding step, under a heading.
print("\nUnique Locations After Encoding:", X_encoded_df.columns, sep="\n")


Unique Locations After Encoding:
Index(['Ampara', 'Anuradapuraya', 'Badulla', 'Bandarawela', 'Batticaloa',
       'Colombo', 'Dabulla', 'Dehiattakandiya', 'Embilipitiya',
       'Galenbidunuwewa', 'Galle', 'Gampaha', 'Hambanthota', 'Hanguranketha',
       'Jaffna', 'Kaluthara', 'Kandy', 'Kegalle', 'Keppetipola', 'Kilinochchi',
       'Kurunegala', 'Mannar', 'Matale', 'Matara', 'Meegoda', 'Monaragala',
       'Mullathivu', 'Nikaweratiya', 'Nuwara Eliya', 'Polonnaruwa', 'Puttalam',
       'Rathnapura', 'Thabuththegama', 'Thissamaharama', 'Trinco', 'Vavuniya',
       'Veyangoda'],
      dtype='object')


In [7]:
# Fit a StandardScaler on every column of X (year, month and the location
# indicator columns), then replace X with its standardized version while
# keeping the original column labels.
# NOTE(review): this cell overwrites X, so running it a second time without
# re-running the earlier cells would refit the scaler on already-scaled data.
scaler = StandardScaler().fit(X)
X = pd.DataFrame(scaler.transform(X), columns=X.columns)

# Show the first few rows of the encoded and standardized X
print("\nUpdated Independent variables (X) after encoding and standardization:", X.head(), sep="\n")



Updated Independent variables (X) after encoding and standardization:
       year     month    Ampara  Anuradapuraya   Badulla  Bandarawela  \
0 -1.451288 -1.597845 -0.166699      -0.166699 -0.166699    -0.166699   
1 -1.451288 -1.597845 -0.166699      -0.166699 -0.166699    -0.166699   
2 -1.451288 -1.597845  5.998826      -0.166699 -0.166699    -0.166699   
3 -1.451288 -1.597845 -0.166699      -0.166699 -0.166699    -0.166699   
4 -1.451288 -1.597845 -0.166699      -0.166699 -0.166699    -0.166699   

   Batticaloa   Colombo   Dabulla  Dehiattakandiya  ...  Nikaweratiya  \
0   -0.166699 -0.166699 -0.166699        -0.166699  ...     -0.166699   
1   -0.166699 -0.166699 -0.166699        -0.166699  ...     -0.166699   
2   -0.166699 -0.166699 -0.166699        -0.166699  ...     -0.166699   
3   -0.166699 -0.166699 -0.166699        -0.166699  ...     -0.166699   
4   -0.166699 -0.166699 -0.166699        -0.166699  ...     -0.166699   

   Nuwara Eliya  Polonnaruwa  Puttalam  Rathnapura 

In [8]:
# Persist the fitted scaler to disk so the same standardization can be reloaded later.
# NOTE(review): only the scaler is saved in this cell; confirm the fitted
# OneHotEncoder is persisted elsewhere if it is needed to rebuild the same
# feature columns before applying the scaler.
scaler_filename = './fun1/scaler_X.pkl'
joblib.dump(value=scaler, filename=scaler_filename)

['./fun1/scaler_X.pkl']