## Sales Forecasting


### Reg No: IT21127946
### Name: Christy H.M

<hr/>

<ul>
    <li><b>Target Variable:</b> Sales</li>
    <li><b>Predictors:</b> Segment, Category, Discount, Shipping cost, Order priority, Ship mode, Market, State, Subcategory and Country</li>
    <li><b>Objective:</b> Predict the total sales amount based on various factors such as the type of products and purchase details.</li>
</ul>




In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import LabelEncoder

In [2]:
dataset = pd.read_csv('../dataset/SuperStoreOrders.csv')

In [3]:
dataset

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_name,segment,state,country,market,region,...,category,sub_category,product_name,sales,quantity,discount,profit,shipping_cost,order_priority,year
0,AG-2011-2040,1/1/2011,6/1/2011,Standard Class,Toby Braunhardt,Consumer,Constantine,Algeria,Africa,Africa,...,Office Supplies,Storage,"Tenex Lockers, Blue",408,2,0.0,106.1400,35.46,Medium,2011
1,IN-2011-47883,1/1/2011,8/1/2011,Standard Class,Joseph Holt,Consumer,New South Wales,Australia,APAC,Oceania,...,Office Supplies,Supplies,"Acme Trimmer, High Speed",120,3,0.1,36.0360,9.72,Medium,2011
2,HU-2011-1220,1/1/2011,5/1/2011,Second Class,Annie Thurman,Consumer,Budapest,Hungary,EMEA,EMEA,...,Office Supplies,Storage,"Tenex Box, Single Width",66,4,0.0,29.6400,8.17,High,2011
3,IT-2011-3647632,1/1/2011,5/1/2011,Second Class,Eugene Moren,Home Office,Stockholm,Sweden,EU,North,...,Office Supplies,Paper,"Enermax Note Cards, Premium",45,3,0.5,-26.0550,4.82,High,2011
4,IN-2011-47883,1/1/2011,8/1/2011,Standard Class,Joseph Holt,Consumer,New South Wales,Australia,APAC,Oceania,...,Furniture,Furnishings,"Eldon Light Bulb, Duo Pack",114,5,0.1,37.7700,4.70,Medium,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,CA-2014-115427,31-12-2014,4/1/2015,Standard Class,Erica Bern,Corporate,California,United States,US,West,...,Office Supplies,Binders,"Cardinal Slant-D Ring Binder, Heavy Gauge Vinyl",14,2,0.2,4.5188,0.89,Medium,2014
51286,MO-2014-2560,31-12-2014,5/1/2015,Standard Class,Liz Preis,Consumer,Souss-Massa-Draâ,Morocco,Africa,Africa,...,Office Supplies,Binders,"Wilson Jones Hole Reinforcements, Clear",4,1,0.0,0.4200,0.49,Medium,2014
51287,MX-2014-110527,31-12-2014,2/1/2015,Second Class,Charlotte Melton,Consumer,Managua,Nicaragua,LATAM,Central,...,Office Supplies,Labels,"Hon Color Coded Labels, 5000 Label Set",26,3,0.0,12.3600,0.35,Medium,2014
51288,MX-2014-114783,31-12-2014,6/1/2015,Standard Class,Tamara Dahlen,Consumer,Chihuahua,Mexico,LATAM,North,...,Office Supplies,Labels,"Hon Legal Exhibit Labels, Alphabetical",7,1,0.0,0.5600,0.20,Medium,2014


In [4]:
# Check for missing values
print("\nMissing values:")
print(dataset.isnull().sum())


Missing values:
order_id          0
order_date        0
ship_date         0
ship_mode         0
customer_name     0
segment           0
state             0
country           0
market            0
region            0
product_id        0
category          0
sub_category      0
product_name      0
sales             0
quantity          0
discount          0
profit            0
shipping_cost     0
order_priority    0
year              0
dtype: int64


In [5]:
# Identify categorical variables
categorical_columns = dataset.select_dtypes(include=['object']).columns

In [6]:
categorical_columns

Index(['order_id', 'order_date', 'ship_date', 'ship_mode', 'customer_name',
       'segment', 'state', 'country', 'market', 'region', 'product_id',
       'category', 'sub_category', 'product_name', 'sales', 'order_priority'],
      dtype='object')

In [7]:
# Selecting relevant columns
selected_columns = ['segment', 'category', 'discount','shipping_cost', 'order_priority', 'ship_mode','market', 'state', 'sub_category', 'country', 'sales']

In [8]:
# Creating a new DataFrame with selected columns
preprocessed_dataset = dataset[selected_columns].copy()

In [9]:
#Checking preprocessed dataframe
preprocessed_dataset

Unnamed: 0,segment,category,discount,shipping_cost,order_priority,ship_mode,market,state,sub_category,country,sales
0,Consumer,Office Supplies,0.0,35.46,Medium,Standard Class,Africa,Constantine,Storage,Algeria,408
1,Consumer,Office Supplies,0.1,9.72,Medium,Standard Class,APAC,New South Wales,Supplies,Australia,120
2,Consumer,Office Supplies,0.0,8.17,High,Second Class,EMEA,Budapest,Storage,Hungary,66
3,Home Office,Office Supplies,0.5,4.82,High,Second Class,EU,Stockholm,Paper,Sweden,45
4,Consumer,Furniture,0.1,4.70,Medium,Standard Class,APAC,New South Wales,Furnishings,Australia,114
...,...,...,...,...,...,...,...,...,...,...,...
51285,Corporate,Office Supplies,0.2,0.89,Medium,Standard Class,US,California,Binders,United States,14
51286,Consumer,Office Supplies,0.0,0.49,Medium,Standard Class,Africa,Souss-Massa-Draâ,Binders,Morocco,4
51287,Consumer,Office Supplies,0.0,0.35,Medium,Second Class,LATAM,Managua,Labels,Nicaragua,26
51288,Consumer,Office Supplies,0.0,0.20,Medium,Standard Class,LATAM,Chihuahua,Labels,Mexico,7


In [10]:
# Handling missing values (if any)
preprocessed_dataset.dropna(inplace=True) 

In [11]:
#Preprocessed data
preprocessed_dataset

Unnamed: 0,segment,category,discount,shipping_cost,order_priority,ship_mode,market,state,sub_category,country,sales
0,Consumer,Office Supplies,0.0,35.46,Medium,Standard Class,Africa,Constantine,Storage,Algeria,408
1,Consumer,Office Supplies,0.1,9.72,Medium,Standard Class,APAC,New South Wales,Supplies,Australia,120
2,Consumer,Office Supplies,0.0,8.17,High,Second Class,EMEA,Budapest,Storage,Hungary,66
3,Home Office,Office Supplies,0.5,4.82,High,Second Class,EU,Stockholm,Paper,Sweden,45
4,Consumer,Furniture,0.1,4.70,Medium,Standard Class,APAC,New South Wales,Furnishings,Australia,114
...,...,...,...,...,...,...,...,...,...,...,...
51285,Corporate,Office Supplies,0.2,0.89,Medium,Standard Class,US,California,Binders,United States,14
51286,Consumer,Office Supplies,0.0,0.49,Medium,Standard Class,Africa,Souss-Massa-Draâ,Binders,Morocco,4
51287,Consumer,Office Supplies,0.0,0.35,Medium,Second Class,LATAM,Managua,Labels,Nicaragua,26
51288,Consumer,Office Supplies,0.0,0.20,Medium,Standard Class,LATAM,Chihuahua,Labels,Mexico,7


In [12]:
# Encoding categorical variables using one-hot encoding
# preprocessed_dataset = pd.get_dummies(preprocessed_dataset, columns=['segment','category','order_priority','ship_mode','market'])

labelEncoder = LabelEncoder()
preprocessed_dataset['segment'] = labelEncoder.fit_transform(preprocessed_dataset['segment'])
preprocessed_dataset['category'] = labelEncoder.fit_transform(preprocessed_dataset['category'])
preprocessed_dataset['order_priority'] = labelEncoder.fit_transform(preprocessed_dataset['order_priority'])
preprocessed_dataset['ship_mode'] = labelEncoder.fit_transform(preprocessed_dataset['ship_mode'])
preprocessed_dataset['market'] = labelEncoder.fit_transform(preprocessed_dataset['market'])
preprocessed_dataset['state'] = labelEncoder.fit_transform(preprocessed_dataset['state'])
preprocessed_dataset['sub_category'] = labelEncoder.fit_transform(preprocessed_dataset['sub_category'])
preprocessed_dataset['country'] = labelEncoder.fit_transform(preprocessed_dataset['country'])

In [13]:
# Remove non-numeric characters (e.g., commas) and convert 'sales' column to float
preprocessed_dataset['sales'] = preprocessed_dataset['sales'].str.replace(',', '').astype(float)

In [14]:
preprocessed_dataset

Unnamed: 0,segment,category,discount,shipping_cost,order_priority,ship_mode,market,state,sub_category,country,sales
0,0,1,0.0,35.46,3,3,1,255,14,2,408.0
1,0,1,0.1,9.72,3,3,0,702,15,6,120.0
2,0,1,0.0,8.17,1,2,3,175,14,56,66.0
3,2,1,0.5,4.82,1,2,4,939,12,123,45.0
4,0,0,0.1,4.70,3,3,0,702,9,6,114.0
...,...,...,...,...,...,...,...,...,...,...,...
51285,1,1,0.2,0.89,3,3,6,192,3,139,14.0
51286,0,1,0.0,0.49,3,3,1,923,3,85,4.0
51287,0,1,0.0,0.35,3,2,5,599,10,92,26.0
51288,0,1,0.0,0.20,3,3,5,230,10,81,7.0


In [15]:
from sklearn.preprocessing import MinMaxScaler

# Define the numerical columns for normalization
numerical_columns = ['sales', 'discount', 'shipping_cost']

# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Apply MinMaxScaler to the numerical columns
preprocessed_dataset[numerical_columns] = scaler.fit_transform(preprocessed_dataset[numerical_columns])

# Display the first few rows of the preprocessed dataset after normalization
print("Preprocessed dataset after min-max normalization:")
print(preprocessed_dataset.head())


Preprocessed dataset after min-max normalization:
   segment  category  discount  shipping_cost  order_priority  ship_mode  \
0        0         1  0.000000       0.037983               3          3   
1        0         1  0.117647       0.010412               3          3   
2        0         1  0.000000       0.008751               1          2   
3        2         1  0.588235       0.005163               1          2   
4        0         0  0.117647       0.005034               3          3   

   market  state  sub_category  country     sales  
0       1    255            14        2  0.018023  
1       0    702            15        6  0.005301  
2       3    175            14       56  0.002915  
3       4    939            12      123  0.001988  
4       0    702             9        6  0.005036  


In [16]:
# Split features (X) and target variable (y)
X = preprocessed_dataset.drop(columns=['sales'])
y = preprocessed_dataset['sales']

In [17]:
print(preprocessed_dataset.dtypes)


segment             int32
category            int32
discount          float64
shipping_cost     float64
order_priority      int32
ship_mode           int32
market              int32
state               int32
sub_category        int32
country             int32
sales             float64
dtype: object


In [18]:
# Use the `to_numeric()` function to convert the column to floats
# preprocessed_dataset['sales'] = pd.to_numeric(preprocessed_dataset['sales'], errors='coerce')


In [19]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
from sklearn.linear_model import LinearRegression

# 4. Model Selection
model = LinearRegression()  # Initialize the linear regression model

# 5. Model Training
model.fit(X_train, y_train)  # Train the model using the training data


In [21]:
# Make predictions on the testing set
y_pred = model.predict(X_test)

# Make predictions on the training set
y_train_pred = model.predict(X_train)

# Evaluate the Linear Regression model on the training set
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print("LINEAR REGRESSION FOR TRAINING")
print("Mean Squared Error (MSE):", mse_train)
print("Mean Absolute Error (MAE):", mae_train)
print("R-squared (R2) for training set:", r2_train)
print("\n")

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("LINEAR REGRESSION FOR TESTING")
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)

LINEAR REGRESSION FOR TRAINING
Mean Squared Error (MSE): 0.00017688748246665235
Mean Absolute Error (MAE): 0.005486804270298816
R-squared (R2) for training set: 0.6158606637999269


LINEAR REGRESSION FOR TESTING
Mean Squared Error (MSE): 0.0001828345997385304
Mean Absolute Error (MAE): 0.00550451020133118
R-squared (R2): 0.6169930422016447


In [22]:
from sklearn.tree import DecisionTreeRegressor

# Initialize the Decision Tree Regressor model
decision_tree_model = DecisionTreeRegressor(random_state=42)

# Train the Decision Tree Regressor model
decision_tree_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_decision_tree = decision_tree_model.predict(X_test)

# Make predictions on the training set
y_train_pred_decision_tree = decision_tree_model.predict(X_train)

# Evaluate the Decision Tree Regressor model on the training set
mse_decision_tree_train = mean_squared_error(y_train, y_train_pred_decision_tree)
mae_decision_tree_train = mean_absolute_error(y_train, y_train_pred_decision_tree)
r2_train_decision_tree = r2_score(y_train, y_train_pred_decision_tree)
print("DECISION TREE REGRESSOR FOR TRAINING")
print("Mean Squared Error (MSE):", mse_decision_tree_train)
print("Mean Absolute Error (MAE):", mae_decision_tree_train)
print("R-squared (R2) for training set:", r2_train_decision_tree)
print("\n")

# Evaluate the Decision Tree Regressor model
mse_decision_tree = mean_squared_error(y_test, y_pred_decision_tree)
mae_decision_tree = mean_absolute_error(y_test, y_pred_decision_tree)
r2_decision_tree = r2_score(y_test, y_pred_decision_tree)

print("DECISION TREE REGRESSOR FOR TESTING")
print("Mean Squared Error (MSE):", mse_decision_tree)
print("Mean Absolute Error (MAE):", mae_decision_tree)
print("R-squared (R2):", r2_decision_tree)


DECISION TREE REGRESSOR FOR TRAINING
Mean Squared Error (MSE): 1.302189829688238e-09
Mean Absolute Error (MAE): 9.247672097175327e-07
R-squared (R2) for training set: 0.9999971720874208


DECISION TREE REGRESSOR FOR TESTING
Mean Squared Error (MSE): 0.0003097928125812564
Mean Absolute Error (MAE): 0.004678323031146711
R-squared (R2): 0.351037479425519


In [23]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest Regressor model
random_forest_model = RandomForestRegressor(random_state=42)

# Train the Random Forest Regressor model
random_forest_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_random_forest = random_forest_model.predict(X_test)

# Make predictions on the training set
y_train_pred_random_forest = random_forest_model.predict(X_train)

# Evaluate the Random Forest Regressor model on the training set
mse_random_forest_train = mean_squared_error(y_train, y_train_pred_random_forest)
mae_random_forest_train = mean_absolute_error(y_train, y_train_pred_random_forest)
r2_train_random_forest = r2_score(y_train, y_train_pred_random_forest)
print("RANDOM FOREST REGRESSOR FOR TRAINING")
print("Mean Squared Error (MSE):", mse_random_forest_train)
print("Mean Absolute Error (MAE):", mae_random_forest_train)
print("R-squared (R2) for training set:", r2_train_random_forest)
print("\n")

# Evaluate the Random Forest Regressor model
mse_random_forest = mean_squared_error(y_test, y_pred_random_forest)
mae_random_forest = mean_absolute_error(y_test, y_pred_random_forest)
r2_random_forest = r2_score(y_test, y_pred_random_forest)

print("RANDOM FOREST REGRESSOR FOR TESTING")
print("Mean Squared Error (MSE):", mse_random_forest)
print("Mean Absolute Error (MAE):", mae_random_forest)
print("R-squared (R2):", r2_random_forest)


RANDOM FOREST REGRESSOR FOR TRAINING
Mean Squared Error (MSE): 1.8270326728441953e-05
Mean Absolute Error (MAE): 0.0013276711748108974
R-squared (R2) for training set: 0.9603230760947413


RANDOM FOREST REGRESSOR FOR TESTING
Mean Squared Error (MSE): 0.00014269065367364907
Mean Absolute Error (MAE): 0.0035712138684628482
R-squared (R2): 0.7010876866415903


In [24]:
from sklearn.svm import SVR

# Initialize the SVR model
svr_model = SVR()

# Train the SVR model
svr_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_svr = svr_model.predict(X_test)

# Make predictions on the training set
y_train_pred_svr = svr_model.predict(X_train)

# Evaluate the SVR model on the training set
mse_svr_train = mean_squared_error(y_train, y_train_pred_svr)
mae_svr_train = mean_absolute_error(y_train, y_train_pred_svr)
r2_train_svr = r2_score(y_train, y_train_pred_svr)
print("SUPPORT VECTOR REGRESSION FOR TRAINING")
print("Mean Squared Error (MSE):", mse_svr_train)
print("Mean Absolute Error (MAE):", mae_svr_train)
print("R-squared (R2) for training set:", r2_train_svr)
print("\n")

# Evaluate the SVR model
mse_svr = mean_squared_error(y_test, y_pred_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

print("SUPPORT VECTOR REGRESSION FOR TESTING")
print("Mean Squared Error (MSE):", mse_svr)
print("Mean Absolute Error (MAE):", mae_svr)
print("R-squared (R2):", r2_svr)


SUPPORT VECTOR REGRESSION FOR TRAINING
Mean Squared Error (MSE): 0.008426985057087553
Mean Absolute Error (MAE): 0.09023754156658415
R-squared (R2) for training set: -17.30053998652973


SUPPORT VECTOR REGRESSION FOR TESTING
Mean Squared Error (MSE): 0.008423827761393339
Mean Absolute Error (MAE): 0.09027506287714382
R-squared (R2): -16.646466525059296


In [26]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize the Gradient Boosting Regressor model
gradient_boosting_model = GradientBoostingRegressor(random_state=42)

# Train the Gradient Boosting Regressor model
gradient_boosting_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_gradient_boosting = gradient_boosting_model.predict(X_test)

# Make predictions on the training set
y_train_pred_gradient_boosting = gradient_boosting_model.predict(X_train)

# Evaluate the Gradient Boosting Regressor model on the training set
r2_train_gradient_boosting = r2_score(y_train, y_train_pred_gradient_boosting)
mse_gradient_boosting_train = mean_squared_error(y_train, y_train_pred_gradient_boosting)
mae_gradient_boosting_train = mean_absolute_error(y_train, y_train_pred_gradient_boosting)
print("GRADIENT BOOSTING REGRESSOR METRICS FOR TRAINING")
print("Mean Squared Error (MSE) for training set:", mse_gradient_boosting_train)
print("Mean Absolute Error (MAE) for training set:", mae_gradient_boosting_train)
print("R-squared (R2) for training set:", r2_train_gradient_boosting)

# Evaluate the Gradient Boosting Regressor model
mse_gradient_boosting = mean_squared_error(y_test, y_pred_gradient_boosting)
mae_gradient_boosting = mean_absolute_error(y_test, y_pred_gradient_boosting)
r2_gradient_boosting = r2_score(y_test, y_pred_gradient_boosting)
print("\n")
print("GRADIENT BOOSTING REGRESSOR METRICS FOR TESTING")
print("Mean Squared Error (MSE) for testing set:", mse_gradient_boosting)
print("Mean Absolute Error (MAE) for training set:", mae_gradient_boosting)
print("R-squared (R2) for training set:", r2_gradient_boosting)

GRADIENT BOOSTING REGRESSOR METRICS FOR TRAINING
Mean Squared Error (MSE) for training set: 9.28192362321097e-05
Mean Absolute Error (MAE) for training set: 0.0032066506463950023
R-squared (R2) for training set: 0.7984282477448774


GRADIENT BOOSTING REGRESSOR METRICS FOR TESTING
Mean Squared Error (MSE) for testing set: 0.00012578788203731494
Mean Absolute Error (MAE) for training set: 0.003330050072791817
R-squared (R2) for training set: 0.7364960784451704


In [27]:
from sklearn.linear_model import Ridge

# Initialize the Ridge Regression model
ridge_model = Ridge(alpha=1.0, random_state=42)  # You can adjust the regularization strength (alpha) as needed

# Train the Ridge Regression model
ridge_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_ridge = ridge_model.predict(X_test)

# Evaluate the Ridge Regression model
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Make predictions on the training set
y_train_pred_ridge = ridge_model.predict(X_train)

# Evaluate the Ridge Regression model on the training set
r2_train_ridge = r2_score(y_train, y_train_pred_ridge)
mse_ridge_train = mean_squared_error(y_train, y_train_pred_ridge)
mae_ridge_train = mean_absolute_error(y_train, y_train_pred_ridge)
print("RIDGE REGRESSION METRICS FOR TRAINING")
print("Mean Squared Error (MSE) for training set:", mse_ridge_train)
print("Mean Absolute Error (MAE) for training set:", mae_ridge_train)
print("R-squared (R2) for training set:", r2_train_ridge)
print("\n")
print("RIDGE REGRESSION METRICS FOR TESTING")
print("Mean Squared Error (MSE) for testing set:", mse_ridge)
print("Mean Absolute Error (MAE) for testing set:", mae_ridge)
print("R-squared (R2) for testing set:", r2_ridge)


RIDGE REGRESSION METRICS FOR TRAINING
Mean Squared Error (MSE) for training set: 0.0001769004509677667
Mean Absolute Error (MAE) for training set: 0.005493054602004255
R-squared (R2) for training set: 0.6158325006346191


RIDGE REGRESSION METRICS FOR TESTING
Mean Squared Error (MSE) for testing set: 0.00018284066243994096
Mean Absolute Error (MAE) for testing set: 0.005511873178410185
R-squared (R2) for testing set: 0.6169803418876636
