## Sales Forecasting


### Reg No: IT21127946
### Name: Christy H.M

<hr/>

<ul>
    <li><b>Target Variable:</b> Total sales (considering the Total attribute)</li>
    <li><b>Predictors:</b> Branch, Customer type, Gender, Product line, Quantity, Date, Time, Payment, COGS, and Gross income</li>
    <li><b>Objective:</b> Predict the total sales amount based on various factors such as the type of products, customer demographics, and purchase details.</li>
</ul>




In [29]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error

In [30]:
dataset = pd.read_csv(
    '../dataset/SuperStoreOrders.csv')

In [31]:
dataset

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_name,segment,state,country,market,region,...,category,sub_category,product_name,sales,quantity,discount,profit,shipping_cost,order_priority,year
0,AG-2011-2040,1/1/2011,6/1/2011,Standard Class,Toby Braunhardt,Consumer,Constantine,Algeria,Africa,Africa,...,Office Supplies,Storage,"Tenex Lockers, Blue",408,2,0.0,106.1400,35.46,Medium,2011
1,IN-2011-47883,1/1/2011,8/1/2011,Standard Class,Joseph Holt,Consumer,New South Wales,Australia,APAC,Oceania,...,Office Supplies,Supplies,"Acme Trimmer, High Speed",120,3,0.1,36.0360,9.72,Medium,2011
2,HU-2011-1220,1/1/2011,5/1/2011,Second Class,Annie Thurman,Consumer,Budapest,Hungary,EMEA,EMEA,...,Office Supplies,Storage,"Tenex Box, Single Width",66,4,0.0,29.6400,8.17,High,2011
3,IT-2011-3647632,1/1/2011,5/1/2011,Second Class,Eugene Moren,Home Office,Stockholm,Sweden,EU,North,...,Office Supplies,Paper,"Enermax Note Cards, Premium",45,3,0.5,-26.0550,4.82,High,2011
4,IN-2011-47883,1/1/2011,8/1/2011,Standard Class,Joseph Holt,Consumer,New South Wales,Australia,APAC,Oceania,...,Furniture,Furnishings,"Eldon Light Bulb, Duo Pack",114,5,0.1,37.7700,4.70,Medium,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,CA-2014-115427,31-12-2014,4/1/2015,Standard Class,Erica Bern,Corporate,California,United States,US,West,...,Office Supplies,Binders,"Cardinal Slant-D Ring Binder, Heavy Gauge Vinyl",14,2,0.2,4.5188,0.89,Medium,2014
51286,MO-2014-2560,31-12-2014,5/1/2015,Standard Class,Liz Preis,Consumer,Souss-Massa-Draâ,Morocco,Africa,Africa,...,Office Supplies,Binders,"Wilson Jones Hole Reinforcements, Clear",4,1,0.0,0.4200,0.49,Medium,2014
51287,MX-2014-110527,31-12-2014,2/1/2015,Second Class,Charlotte Melton,Consumer,Managua,Nicaragua,LATAM,Central,...,Office Supplies,Labels,"Hon Color Coded Labels, 5000 Label Set",26,3,0.0,12.3600,0.35,Medium,2014
51288,MX-2014-114783,31-12-2014,6/1/2015,Standard Class,Tamara Dahlen,Consumer,Chihuahua,Mexico,LATAM,North,...,Office Supplies,Labels,"Hon Legal Exhibit Labels, Alphabetical",7,1,0.0,0.5600,0.20,Medium,2014


In [32]:
# Check for missing values
print("\nMissing values:")
print(dataset.isnull().sum())


Missing values:
order_id          0
order_date        0
ship_date         0
ship_mode         0
customer_name     0
segment           0
state             0
country           0
market            0
region            0
product_id        0
category          0
sub_category      0
product_name      0
sales             0
quantity          0
discount          0
profit            0
shipping_cost     0
order_priority    0
year              0
dtype: int64


In [33]:
# Identify categorical variables
categorical_columns = dataset.select_dtypes(include=['object']).columns

In [34]:
categorical_columns

Index(['order_id', 'order_date', 'ship_date', 'ship_mode', 'customer_name',
       'segment', 'state', 'country', 'market', 'region', 'product_id',
       'category', 'sub_category', 'product_name', 'sales', 'order_priority'],
      dtype='object')

In [35]:
# Selecting relevant columns
selected_columns = ['segment', 'category', 'sales', 'discount','shipping_cost', 'order_priority', 'ship_mode','market']

In [36]:
# Creating a new DataFrame with selected columns
preprocessed_dataset = dataset[selected_columns].copy()

In [37]:
#Checking preprocessed dataframe
preprocessed_dataset

Unnamed: 0,segment,category,sales,discount,shipping_cost,order_priority,ship_mode,market
0,Consumer,Office Supplies,408,0.0,35.46,Medium,Standard Class,Africa
1,Consumer,Office Supplies,120,0.1,9.72,Medium,Standard Class,APAC
2,Consumer,Office Supplies,66,0.0,8.17,High,Second Class,EMEA
3,Home Office,Office Supplies,45,0.5,4.82,High,Second Class,EU
4,Consumer,Furniture,114,0.1,4.70,Medium,Standard Class,APAC
...,...,...,...,...,...,...,...,...
51285,Corporate,Office Supplies,14,0.2,0.89,Medium,Standard Class,US
51286,Consumer,Office Supplies,4,0.0,0.49,Medium,Standard Class,Africa
51287,Consumer,Office Supplies,26,0.0,0.35,Medium,Second Class,LATAM
51288,Consumer,Office Supplies,7,0.0,0.20,Medium,Standard Class,LATAM


In [38]:
# Handling missing values (if any)
preprocessed_dataset.dropna(inplace=True) 

In [39]:
#Preprocessed data
preprocessed_dataset

Unnamed: 0,segment,category,sales,discount,shipping_cost,order_priority,ship_mode,market
0,Consumer,Office Supplies,408,0.0,35.46,Medium,Standard Class,Africa
1,Consumer,Office Supplies,120,0.1,9.72,Medium,Standard Class,APAC
2,Consumer,Office Supplies,66,0.0,8.17,High,Second Class,EMEA
3,Home Office,Office Supplies,45,0.5,4.82,High,Second Class,EU
4,Consumer,Furniture,114,0.1,4.70,Medium,Standard Class,APAC
...,...,...,...,...,...,...,...,...
51285,Corporate,Office Supplies,14,0.2,0.89,Medium,Standard Class,US
51286,Consumer,Office Supplies,4,0.0,0.49,Medium,Standard Class,Africa
51287,Consumer,Office Supplies,26,0.0,0.35,Medium,Second Class,LATAM
51288,Consumer,Office Supplies,7,0.0,0.20,Medium,Standard Class,LATAM


In [40]:
# Encoding categorical variables using one-hot encoding
preprocessed_dataset = pd.get_dummies(preprocessed_dataset, columns=['segment','category','order_priority','ship_mode','market'])

In [41]:
# Remove non-numeric characters (e.g., commas) and convert 'sales' column to float
preprocessed_dataset['sales'] = preprocessed_dataset['sales'].str.replace(',', '').astype(float)

In [42]:
preprocessed_dataset

Unnamed: 0,sales,discount,shipping_cost,segment_Consumer,segment_Corporate,segment_Home Office,category_Furniture,category_Office Supplies,category_Technology,order_priority_Critical,...,ship_mode_Same Day,ship_mode_Second Class,ship_mode_Standard Class,market_APAC,market_Africa,market_Canada,market_EMEA,market_EU,market_LATAM,market_US
0,408.0,0.0,35.46,True,False,False,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False
1,120.0,0.1,9.72,True,False,False,False,True,False,False,...,False,False,True,True,False,False,False,False,False,False
2,66.0,0.0,8.17,True,False,False,False,True,False,False,...,False,True,False,False,False,False,True,False,False,False
3,45.0,0.5,4.82,False,False,True,False,True,False,False,...,False,True,False,False,False,False,False,True,False,False
4,114.0,0.1,4.70,True,False,False,True,False,False,False,...,False,False,True,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,14.0,0.2,0.89,False,True,False,False,True,False,False,...,False,False,True,False,False,False,False,False,False,True
51286,4.0,0.0,0.49,True,False,False,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False
51287,26.0,0.0,0.35,True,False,False,False,True,False,False,...,False,True,False,False,False,False,False,False,True,False
51288,7.0,0.0,0.20,True,False,False,False,True,False,False,...,False,False,True,False,False,False,False,False,True,False


In [43]:
# Split features (X) and target variable (y)
X = preprocessed_dataset.drop(columns=['sales'])
y = preprocessed_dataset['sales']

In [44]:
print(preprocessed_dataset.dtypes)


sales                       float64
discount                    float64
shipping_cost               float64
segment_Consumer               bool
segment_Corporate              bool
segment_Home Office            bool
category_Furniture             bool
category_Office Supplies       bool
category_Technology            bool
order_priority_Critical        bool
order_priority_High            bool
order_priority_Low             bool
order_priority_Medium          bool
ship_mode_First Class          bool
ship_mode_Same Day             bool
ship_mode_Second Class         bool
ship_mode_Standard Class       bool
market_APAC                    bool
market_Africa                  bool
market_Canada                  bool
market_EMEA                    bool
market_EU                      bool
market_LATAM                   bool
market_US                      bool
dtype: object


In [47]:
# Use the `to_numeric()` function to convert the column to floats
preprocessed_dataset['sales'] = pd.to_numeric(preprocessed_dataset['sales'], errors='coerce')


In [48]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
from sklearn.linear_model import LinearRegression

# 4. Model Selection
model = LinearRegression()  # Initialize the linear regression model

# 5. Model Training
model.fit(X_train, y_train)  # Train the model using the training data


In [50]:
# Make predictions on the testing set
y_pred = model.predict(X_test)

In [51]:

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [52]:
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)

Mean Squared Error (MSE): 91050.3242033511
Mean Absolute Error (MAE): 120.75709026009572
R-squared (R2): 0.6278194583861129


In [53]:
from sklearn.tree import DecisionTreeRegressor

# Initialize the Decision Tree Regressor model
decision_tree_model = DecisionTreeRegressor(random_state=42)

# Train the Decision Tree Regressor model
decision_tree_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_decision_tree = decision_tree_model.predict(X_test)

# Evaluate the Decision Tree Regressor model
mse_decision_tree = mean_squared_error(y_test, y_pred_decision_tree)
mae_decision_tree = mean_absolute_error(y_test, y_pred_decision_tree)
r2_decision_tree = r2_score(y_test, y_pred_decision_tree)

print("Decision Tree Regressor Metrics:")
print("Mean Squared Error (MSE):", mse_decision_tree)
print("Mean Absolute Error (MAE):", mae_decision_tree)
print("R-squared (R2):", r2_decision_tree)


Decision Tree Regressor Metrics:
Mean Squared Error (MSE): 155268.35632416973
Mean Absolute Error (MAE): 105.82359784233445
R-squared (R2): 0.36531954764746866


In [54]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest Regressor model
random_forest_model = RandomForestRegressor(random_state=42)

# Train the Random Forest Regressor model
random_forest_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_random_forest = random_forest_model.predict(X_test)

# Evaluate the Random Forest Regressor model
mse_random_forest = mean_squared_error(y_test, y_pred_random_forest)
mae_random_forest = mean_absolute_error(y_test, y_pred_random_forest)
r2_random_forest = r2_score(y_test, y_pred_random_forest)

print("Random Forest Regressor Metrics:")
print("Mean Squared Error (MSE):", mse_random_forest)
print("Mean Absolute Error (MAE):", mae_random_forest)
print("R-squared (R2):", r2_random_forest)


Random Forest Regressor Metrics:
Mean Squared Error (MSE): 77051.12762255473
Mean Absolute Error (MAE): 82.8716884216334
R-squared (R2): 0.6850430719337548


In [56]:
from sklearn.svm import SVR

# Initialize the SVR model
svr_model = SVR()

# Train the SVR model
svr_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_svr = svr_model.predict(X_test)

# Evaluate the SVR model
mse_svr = mean_squared_error(y_test, y_pred_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

print("Support Vector Regression (SVR) Metrics:")
print("Mean Squared Error (MSE):", mse_svr)
print("Mean Absolute Error (MAE):", mae_svr)
print("R-squared (R2):", r2_svr)


Support Vector Regression (SVR) Metrics:
Mean Squared Error (MSE): 136404.51780830792
Mean Absolute Error (MAE): 113.47357137255152
R-squared (R2): 0.44242804448346296


In [57]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize the Gradient Boosting Regressor model
gradient_boosting_model = GradientBoostingRegressor(random_state=42)

# Train the Gradient Boosting Regressor model
gradient_boosting_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_gradient_boosting = gradient_boosting_model.predict(X_test)

# Evaluate the Gradient Boosting Regressor model
mse_gradient_boosting = mean_squared_error(y_test, y_pred_gradient_boosting)
mae_gradient_boosting = mean_absolute_error(y_test, y_pred_gradient_boosting)
r2_gradient_boosting = r2_score(y_test, y_pred_gradient_boosting)

print("Gradient Boosting Regressor Metrics:")
print("Mean Squared Error (MSE):", mse_gradient_boosting)
print("Mean Absolute Error (MAE):", mae_gradient_boosting)
print("R-squared (R2):", r2_gradient_boosting)


Gradient Boosting Regressor Metrics:
Mean Squared Error (MSE): 66007.2703638256
Mean Absolute Error (MAE): 76.45448392534126
R-squared (R2): 0.730186335420443


In [None]:
# Get the best hyperparameters
# best_params = grid_search.best_params_


# best_params

In [None]:
# y_pred_rf = grid_search.predict(X_test)

In [None]:
# mse_rf = mean_squared_error(y_test, y_pred_rf)
# r2_rf = r2_score(y_test, y_pred_rf)
# mae_rf = mean_absolute_error(y_test, y_pred_rf)

# print("Random Forest Regression Model Performance:")
# print("Mean Squared Error:", mse_rf)
# print("R-squared:", r2_rf)
# print("Mean Absolute Error:", mae_rf)

In [None]:
# # Splitting the data into training and testing sets
# X = preprocessed_dataset.drop(columns=['Total'])
# y = preprocessed_dataset['Total']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# # Creating and training the Decision Tree Regressor model with hyperparameters
# decision_tree_model = DecisionTreeRegressor(max_depth=5, min_samples_split=2, min_samples_leaf=1, random_state=42)
# decision_tree_model.fit(X_train, y_train)

In [None]:
# # Making predictions
# y_pred = decision_tree_model.predict(X_test)

In [None]:
# # Evaluating the model
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print("Decision Tree Regression Model Performance:")
# print("Mean Squared Error:", mse)
# print("R-squared:", r2)

In [None]:
# # Define the parameter grid for hyperparameter tuning
# param_grid = {
#     'alpha': [0.01, 0.1, 1, 10, 100]  # Regularization strength
# }


In [None]:
# # Initialize Ridge Regression model
# ridge = Ridge()

In [None]:
# # Initialize Lasso Regression model
# lasso = Lasso()

In [None]:
# # Perform GridSearchCV for Ridge Regression
# ridge_grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
# ridge_grid_search.fit(X_train, y_train)

In [None]:
# # Perform GridSearchCV for Lasso Regression
# lasso_grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error')
# lasso_grid_search.fit(X_train, y_train)

In [None]:
# # Get the best parameters and best estimator for Ridge Regression
# best_params_ridge = ridge_grid_search.best_params_
# best_estimator_ridge = ridge_grid_search.best_estimator_

In [None]:
# # Get the best parameters and best estimator for Lasso Regression
# best_params_lasso = lasso_grid_search.best_params_
# best_estimator_lasso = lasso_grid_search.best_estimator_

In [None]:
# # Print the best parameters for Ridge Regression
# print("Best Parameters for Ridge Regression:", best_params_ridge)

In [None]:
# # Print the best parameters for Lasso Regression
# print("Best Parameters for Lasso Regression:", best_params_lasso)

In [None]:
# # Predict using the best Ridge Regression model
# y_pred_ridge = best_estimator_ridge.predict(X_test)

In [None]:
# # Calculate R^2 score for Ridge Regression
# r2_ridge = r2_score(y_test, y_pred_ridge)
# print("R^2 score for Ridge Regression:", r2_ridge)

In [None]:
# # Predict using the best Lasso Regression model
# y_pred_lasso = best_estimator_lasso.predict(X_test)

In [None]:
# # Calculate R^2 score for Lasso Regression
# r2_lasso = r2_score(y_test, y_pred_lasso)
# print("R^2 score for Lasso Regression:", r2_lasso)

In [None]:
# For Ridge Regression
# ridge_mae = mean_absolute_error(y_test, y_pred_ridge)
# ridge_mse = mean_squared_error(y_test, y_pred_ridge)
# ridge_rmse = np.sqrt(ridge_mse)
# print("Mean Squared Error for Ridge Regression:", ridge_rmse)
# print("Mean Absolute Error for Ridge Regression:", ridge_rmse)

# # For Lasso Regression
# lasso_mae = mean_absolute_error(y_test, y_pred_lasso)
# lasso_mse = mean_squared_error(y_test, y_pred_lasso)
# lasso_rmse = np.sqrt(lasso_mse)
# print("Mean Squared Error for Lasso Regression:", lasso_rmse)
# print("Mean Absolute Error for Lasso Regression:", lasso_rmse)
