In [1]:



import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsRegressor
import optuna




#Reading the Dataset
df = pd.read_csv('Historical Product Demand.csv')

df

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
0,Product_0993,Whse_J,Category_028,2012/7/27,100
1,Product_0979,Whse_J,Category_028,2012/1/19,500
2,Product_0979,Whse_J,Category_028,2012/2/3,500
3,Product_0979,Whse_J,Category_028,2012/2/9,500
4,Product_0979,Whse_J,Category_028,2012/3/2,500
...,...,...,...,...,...
1048570,Product_1791,Whse_J,Category_006,2016/4/27,1000
1048571,Product_1974,Whse_J,Category_006,2016/4/27,1
1048572,Product_1787,Whse_J,Category_006,2016/4/28,2500
1048573,Product_0901,Whse_J,Category_023,2016/10/7,50


In [2]:
# Description of the data
df.describe()



Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
count,1048575,1048575,1048575,1037336,1048575
unique,2160,4,33,1729,3828
top,Product_1359,Whse_J,Category_019,2013/9/27,1000
freq,16936,764447,481099,2075,112682


In [3]:
unique_items = df["Order_Demand"].unique()
print(unique_items)

['100 ' '500 ' '50000 ' ... '(3750)' '(191)' '28250 ']


In [5]:
#Does the dataset include any missing values? If so, drop them!

df = df.dropna()
print("missing values")
print(df.isna().sum())
#no missing values
print(df.dtypes)


# REMOVE '()' from Column Order_Demand
df["Order_Demand"] = df["Order_Demand"].str.replace("(", "")
df["Order_Demand"] = df["Order_Demand"].str.replace(")", "")



missing values
Product_Code        0
Warehouse           0
Product_Category    0
Date                0
Order_Demand        0
dtype: int64
Product_Code        object
Warehouse           object
Product_Category    object
Date                object
Order_Demand        object
dtype: object


In [6]:

#Changing the datatype to datetime
df["Date"] = pd.to_datetime(df['Date'])

# Extract year, month, and day into separate columns
df.loc[:, ('Year')] = df['Date'].dt.year
df.loc[:, ('Month')] = df['Date'].dt.month
df.loc[:, ('Day')] = df['Date'].dt.day
df.loc[:, ('Weekday')] = df['Date'].dt.weekday


#Changing the datatype to float
df["Order_Demand"] = df["Order_Demand"].astype(float)


df = df.sort_values(by=['Date', 'Product_Code'])
df = df.set_index('Date')
df



Unnamed: 0_level_0,Product_Code,Warehouse,Product_Category,Order_Demand,Year,Month,Day,Weekday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-08,Product_0965,Whse_A,Category_006,2.0,2011,1,8,5
2011-05-31,Product_1724,Whse_A,Category_003,108.0,2011,5,31,1
2011-06-24,Product_1521,Whse_S,Category_019,85000.0,2011,6,24,4
2011-06-24,Product_1521,Whse_S,Category_019,7000.0,2011,6,24,4
2011-09-02,Product_1507,Whse_C,Category_019,1250.0,2011,9,2,4
...,...,...,...,...,...,...,...,...
2017-01-06,Product_1970,Whse_J,Category_005,2000.0,2017,1,6,4
2017-01-06,Product_1970,Whse_J,Category_005,2000.0,2017,1,6,4
2017-01-09,Product_0250,Whse_C,Category_007,148.0,2017,1,9,0
2017-01-09,Product_0471,Whse_C,Category_015,30.0,2017,1,9,0


In [7]:

#encoding
df[['Product_Code', 'Warehouse', 'Product_Category','Order_Demand']] = df[['Product_Code', 'Warehouse', 'Product_Category','Order_Demand']].apply(lambda col: LabelEncoder().fit_transform(col))

#Scaling
df[['Product_Code','Product_Category','Order_Demand']] =StandardScaler().fit_transform(df[['Product_Code','Product_Category', 'Order_Demand']])





df

Unnamed: 0_level_0,Product_Code,Warehouse,Product_Category,Order_Demand,Year,Month,Day,Weekday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-08,-0.267594,0,-1.118545,-0.864351,2011,1,8,5
2011-05-31,1.060360,0,-1.496241,-0.744615,2011,5,31,1
2011-06-24,0.705189,3,0.518137,2.350467,2011,6,24,4
2011-06-24,0.705189,3,0.518137,1.551845,2011,6,24,4
2011-09-02,0.680695,1,0.518137,0.366904,2011,9,2,4
...,...,...,...,...,...,...,...,...
2017-01-06,1.489015,2,-1.244444,0.793889,2017,1,6,4
2017-01-06,1.489015,2,-1.244444,0.793889,2017,1,6,4
2017-01-09,-1.501070,1,-0.992647,-0.699431,2017,1,9,0
2017-01-09,-1.114406,1,0.014542,-0.832723,2017,1,9,0


In [8]:
#Feature Selection

numeric_df = df.select_dtypes(include=['float64', 'int64'])

# Calculate correlations
correlation_matrix = numeric_df.corr()

# Display the correlation matrix
correlation_matrix

Unnamed: 0,Product_Code,Product_Category,Order_Demand
Product_Code,1.0,0.145153,0.187536
Product_Category,0.145153,1.0,0.207794
Order_Demand,0.187536,0.207794,1.0


In [9]:


#Splitting the Data
#Split your data as follows:
#80% training set.
#10% Validation set.
#10% test set.

X = df[df.columns.difference(['Order_Demand','Day','Month','Year','Weekday'])]
y = df["Order_Demand"]


# try on recent data
final_X=X.iloc[-100000:,:]
fibal_Y= y.iloc[-100000:]

X_train, X_remaining, y_train, y_remaining = train_test_split(final_X,fibal_Y, test_size=0.20, random_state=0)

X_val, X_test, y_val, y_test = train_test_split(X_remaining, y_remaining, test_size=0.5, random_state=1)


final_X




Unnamed: 0_level_0,Product_Category,Product_Code,Warehouse
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-06-22,0.518137,-1.224631,2
2016-06-22,0.518137,-1.221132,2
2016-06-22,0.518137,-1.221132,2
2016-06-22,-0.992647,-1.217633,0
2016-06-22,-0.992647,-1.217633,2
...,...,...,...
2017-01-06,-1.244444,1.489015,2
2017-01-06,-1.244444,1.489015,2
2017-01-09,-0.992647,-1.501070,1
2017-01-09,0.014542,-1.114406,1


# SVR

In [None]:
#Use an SVR model and train and test for the same data.
from sklearn.svm import SVR


def objective_svr (trial):
    params = {
        'kernel' : trial.suggest_categorical('kernel',['linear', 'poly', 'rbf', 'sigmoid']),
        'C' : trial.suggest_int('C',1, 5)    
    }
    
    model_temp_svr = SVR(**params).fit(X_train, y_train)
    score = model_temp_svr.score(X_test,y_test)
    return score


study_svr = optuna.create_study(direction = "maximize")

study_svr.optimize(objective_svr, n_trials=10)

print(study_svr.best_params)
print(study_svr.best_value)
    

# Random Forest Regressor

In [None]:
#Training Tree-based and SVRs
#Use a decision tree Regression model to train your data.
#Choose the best criterion for the decision tree algorithm by trying different values and validating performance on the validation set.

from sklearn.ensemble import RandomForestRegressor

def objective_rf (trial):
    params = {
        'n_estimators' : trial.suggest_int('n_estimators',100,500,step = 100),
        #'criterion' : trial.suggest_categorical('criterion',['squared_error', 'absolute_error', 'friedman_mse']),
        #'max_depth' : trial.suggest_int('max_depth',1, 10)    
    }
    
    model_temp_rf = RandomForestRegressor(**params).fit(X_train, y_train)
    return model_temp_rf.score(X_test,y_test)


study_rf = optuna.create_study(direction = "maximize")

study_rf.optimize(objective_rf, n_trials=10)

print("best_params:",study_rf.best_params)
print("final accuracy score :",study_rf.best_value)


[I 2024-01-05 14:24:30,196] A new study created in memory with name: no-name-43bbb06c-012e-4fad-ba77-9383fe81f50d
[I 2024-01-05 14:25:50,035] Trial 0 finished with value: 0.6959736054172039 and parameters: {'n_estimators': 500}. Best is trial 0 with value: 0.6959736054172039.
[I 2024-01-05 14:26:38,719] Trial 1 finished with value: 0.6960450217976646 and parameters: {'n_estimators': 300}. Best is trial 1 with value: 0.6960450217976646.
[I 2024-01-05 14:27:09,655] Trial 2 finished with value: 0.6960416315946751 and parameters: {'n_estimators': 200}. Best is trial 1 with value: 0.6960450217976646.
[I 2024-01-05 14:27:43,720] Trial 3 finished with value: 0.6958654819606985 and parameters: {'n_estimators': 200}. Best is trial 1 with value: 0.6960450217976646.
[I 2024-01-05 14:28:30,372] Trial 4 finished with value: 0.6958432149061888 and parameters: {'n_estimators': 300}. Best is trial 1 with value: 0.6960450217976646.


# KNeighbourRegressor

In [10]:
from sklearn.metrics import r2_score
def objective (trial):
    params = {
        'n_neighbors' : trial.suggest_int('n_neighbors',1,50),
        'weights' : trial.suggest_categorical('weights',['uniform', 'distance']),
        'p' : trial.suggest_int('p',1, 2)    
    }
    
    model_temp1 = KNeighborsRegressor(**params).fit(X_train, y_train)
    y_pred = model_temp1.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2


study = optuna.create_study(direction = "maximize")

study.optimize(objective, n_trials=100)

print(study.best_params)
print(study.best_value)


[I 2024-01-05 14:22:07,885] A new study created in memory with name: no-name-89494788-bea3-47ea-bf5b-a1c273fe42e8
[I 2024-01-05 14:22:09,093] Trial 0 finished with value: 0.6704325316031035 and parameters: {'n_neighbors': 38, 'weights': 'uniform', 'p': 2}. Best is trial 0 with value: 0.6704325316031035.
[I 2024-01-05 14:22:10,038] Trial 1 finished with value: 0.6769410270487835 and parameters: {'n_neighbors': 14, 'weights': 'distance', 'p': 1}. Best is trial 1 with value: 0.6769410270487835.
[I 2024-01-05 14:22:10,969] Trial 2 finished with value: 0.6732041557621244 and parameters: {'n_neighbors': 11, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 0.6769410270487835.
[I 2024-01-05 14:22:12,151] Trial 3 finished with value: 0.6671949929636343 and parameters: {'n_neighbors': 43, 'weights': 'uniform', 'p': 2}. Best is trial 1 with value: 0.6769410270487835.
[I 2024-01-05 14:22:13,056] Trial 4 finished with value: 0.6555538244494807 and parameters: {'n_neighbors': 6, 'weights'

[I 2024-01-05 14:23:01,316] Trial 43 finished with value: 0.6639589436860852 and parameters: {'n_neighbors': 8, 'weights': 'distance', 'p': 2}. Best is trial 11 with value: 0.694068995796251.
[I 2024-01-05 14:23:02,609] Trial 44 finished with value: 0.694068995796251 and parameters: {'n_neighbors': 50, 'weights': 'distance', 'p': 2}. Best is trial 11 with value: 0.694068995796251.
[I 2024-01-05 14:23:03,915] Trial 45 finished with value: 0.6930520939745692 and parameters: {'n_neighbors': 45, 'weights': 'distance', 'p': 2}. Best is trial 11 with value: 0.694068995796251.
[I 2024-01-05 14:23:05,140] Trial 46 finished with value: 0.6927867371588546 and parameters: {'n_neighbors': 42, 'weights': 'distance', 'p': 2}. Best is trial 11 with value: 0.694068995796251.
[I 2024-01-05 14:23:06,423] Trial 47 finished with value: 0.6936744487853242 and parameters: {'n_neighbors': 48, 'weights': 'distance', 'p': 2}. Best is trial 11 with value: 0.694068995796251.
[I 2024-01-05 14:23:07,583] Trial 48 

[I 2024-01-05 14:23:55,620] Trial 86 finished with value: 0.6939787333442519 and parameters: {'n_neighbors': 49, 'weights': 'distance', 'p': 2}. Best is trial 11 with value: 0.694068995796251.
[I 2024-01-05 14:23:56,903] Trial 87 finished with value: 0.6929082497149794 and parameters: {'n_neighbors': 44, 'weights': 'distance', 'p': 2}. Best is trial 11 with value: 0.694068995796251.
[I 2024-01-05 14:23:58,232] Trial 88 finished with value: 0.6936744487853242 and parameters: {'n_neighbors': 48, 'weights': 'distance', 'p': 2}. Best is trial 11 with value: 0.694068995796251.
[I 2024-01-05 14:24:00,284] Trial 89 finished with value: 0.693293114760893 and parameters: {'n_neighbors': 46, 'weights': 'distance', 'p': 2}. Best is trial 11 with value: 0.694068995796251.
[I 2024-01-05 14:24:01,724] Trial 90 finished with value: 0.6940726273690405 and parameters: {'n_neighbors': 50, 'weights': 'distance', 'p': 1}. Best is trial 90 with value: 0.6940726273690405.
[I 2024-01-05 14:24:03,124] Trial 9

{'n_neighbors': 50, 'weights': 'distance', 'p': 1}
0.6940726273690405


# LINEAR REGRESSION

In [11]:
from sklearn.linear_model import LinearRegression


model_lr = LinearRegression().fit(X_train, y_train)


# Evaluate the model
score = model_lr.score(X_test, y_test)
print("R-squared score:", score)

R-squared score: 0.10195108627391125
