## Sales Forecasting


### Reg No: IT21127946
### Name: Christy H.M

<hr/>

<ul>
    <li><b>Target Variable:</b> Sales (the <code>sales</code> column)</li>
    <li><b>Predictors:</b> Segment, Category, Discount, Shipping cost, Order priority, Ship mode, and Market</li>
    <li><b>Objective:</b> Predict the total sales amount based on various factors such as the type of products, customer demographics, and purchase details.</li>
</ul>




In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error

In [2]:
# Read the SuperStore orders CSV (path is relative to the notebook) into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='../dataset/SuperStoreOrders.csv')

In [3]:
# Preview the loaded orders table; the rendered output is a truncated view
# (first and last rows, with middle rows/columns elided).
dataset

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_name,segment,state,country,market,region,...,category,sub_category,product_name,sales,quantity,discount,profit,shipping_cost,order_priority,year
0,AG-2011-2040,1/1/2011,6/1/2011,Standard Class,Toby Braunhardt,Consumer,Constantine,Algeria,Africa,Africa,...,Office Supplies,Storage,"Tenex Lockers, Blue",408,2,0.0,106.1400,35.46,Medium,2011
1,IN-2011-47883,1/1/2011,8/1/2011,Standard Class,Joseph Holt,Consumer,New South Wales,Australia,APAC,Oceania,...,Office Supplies,Supplies,"Acme Trimmer, High Speed",120,3,0.1,36.0360,9.72,Medium,2011
2,HU-2011-1220,1/1/2011,5/1/2011,Second Class,Annie Thurman,Consumer,Budapest,Hungary,EMEA,EMEA,...,Office Supplies,Storage,"Tenex Box, Single Width",66,4,0.0,29.6400,8.17,High,2011
3,IT-2011-3647632,1/1/2011,5/1/2011,Second Class,Eugene Moren,Home Office,Stockholm,Sweden,EU,North,...,Office Supplies,Paper,"Enermax Note Cards, Premium",45,3,0.5,-26.0550,4.82,High,2011
4,IN-2011-47883,1/1/2011,8/1/2011,Standard Class,Joseph Holt,Consumer,New South Wales,Australia,APAC,Oceania,...,Furniture,Furnishings,"Eldon Light Bulb, Duo Pack",114,5,0.1,37.7700,4.70,Medium,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,CA-2014-115427,31-12-2014,4/1/2015,Standard Class,Erica Bern,Corporate,California,United States,US,West,...,Office Supplies,Binders,"Cardinal Slant-D Ring Binder, Heavy Gauge Vinyl",14,2,0.2,4.5188,0.89,Medium,2014
51286,MO-2014-2560,31-12-2014,5/1/2015,Standard Class,Liz Preis,Consumer,Souss-Massa-Draâ,Morocco,Africa,Africa,...,Office Supplies,Binders,"Wilson Jones Hole Reinforcements, Clear",4,1,0.0,0.4200,0.49,Medium,2014
51287,MX-2014-110527,31-12-2014,2/1/2015,Second Class,Charlotte Melton,Consumer,Managua,Nicaragua,LATAM,Central,...,Office Supplies,Labels,"Hon Color Coded Labels, 5000 Label Set",26,3,0.0,12.3600,0.35,Medium,2014
51288,MX-2014-114783,31-12-2014,6/1/2015,Standard Class,Tamara Dahlen,Consumer,Chihuahua,Mexico,LATAM,North,...,Office Supplies,Labels,"Hon Legal Exhibit Labels, Alphabetical",7,1,0.0,0.5600,0.20,Medium,2014


In [17]:
# Columns carried forward from the raw orders table into the modelling frame.
selected_columns = [
    'segment',
    'category',
    'sales',
    'discount',
    'shipping_cost',
    'order_priority',
    'ship_mode',
    'market',
]

In [18]:
# Take an independent copy of just the selected columns, so that later
# cleaning steps operate on the copy rather than on the raw `dataset` frame.
preprocessed_dataset = dataset.loc[:, selected_columns].copy()

In [19]:
# Inspect the selected-column subset before any cleaning is applied.
preprocessed_dataset

Unnamed: 0,segment,category,sales,discount,shipping_cost,order_priority,ship_mode,market
0,Consumer,Office Supplies,408,0.0,35.46,Medium,Standard Class,Africa
1,Consumer,Office Supplies,120,0.1,9.72,Medium,Standard Class,APAC
2,Consumer,Office Supplies,66,0.0,8.17,High,Second Class,EMEA
3,Home Office,Office Supplies,45,0.5,4.82,High,Second Class,EU
4,Consumer,Furniture,114,0.1,4.70,Medium,Standard Class,APAC
...,...,...,...,...,...,...,...,...
51285,Corporate,Office Supplies,14,0.2,0.89,Medium,Standard Class,US
51286,Consumer,Office Supplies,4,0.0,0.49,Medium,Standard Class,Africa
51287,Consumer,Office Supplies,26,0.0,0.35,Medium,Second Class,LATAM
51288,Consumer,Office Supplies,7,0.0,0.20,Medium,Standard Class,LATAM


In [20]:
# Remove every row that has a missing value in any of the selected columns.
preprocessed_dataset = preprocessed_dataset.dropna()

In [21]:
# Inspect the frame again after rows with missing values have been dropped.
preprocessed_dataset

Unnamed: 0,segment,category,sales,discount,shipping_cost,order_priority,ship_mode,market
0,Consumer,Office Supplies,408,0.0,35.46,Medium,Standard Class,Africa
1,Consumer,Office Supplies,120,0.1,9.72,Medium,Standard Class,APAC
2,Consumer,Office Supplies,66,0.0,8.17,High,Second Class,EMEA
3,Home Office,Office Supplies,45,0.5,4.82,High,Second Class,EU
4,Consumer,Furniture,114,0.1,4.70,Medium,Standard Class,APAC
...,...,...,...,...,...,...,...,...
51285,Corporate,Office Supplies,14,0.2,0.89,Medium,Standard Class,US
51286,Consumer,Office Supplies,4,0.0,0.49,Medium,Standard Class,Africa
51287,Consumer,Office Supplies,26,0.0,0.35,Medium,Second Class,LATAM
51288,Consumer,Office Supplies,7,0.0,0.20,Medium,Standard Class,LATAM


In [22]:
# One-hot encode only the categorical columns.
#
# 'sales', 'discount' and 'shipping_cost' are deliberately NOT encoded: they
# are displayed as numbers in the previews above, and passing them to
# get_dummies would replace each of them with one indicator column per
# distinct value. That would also remove the 'sales' column itself, which the
# notebook title suggests is the forecasting target.
# NOTE(review): confirm these three columns are parsed with numeric dtypes.
categorical_columns = ['segment', 'category', 'order_priority', 'ship_mode', 'market']
preprocessed_dataset = pd.get_dummies(preprocessed_dataset, columns=categorical_columns)

In [34]:
# Scaling numerical features (optional, depending on the model you choose)

# scaler = StandardScaler()
# preprocessed_dataset[['Quantity', 'cogs', 'gross income']] = scaler.fit_transform(preprocessed_dataset[['Quantity', 'cogs', 'gross income']])

In [35]:
# # Convert boolean columns to 0 and 1
# boolean_columns = ['Branch_A', 'Branch_B', 'Branch_C',
#                    'Customer type_Member', 'Customer type_Normal',
#                    'Gender_Female', 'Gender_Male',
#                    'Product line_Electronic accessories', 'Product line_Fashion accessories',
#                    'Product line_Food and beverages', 'Product line_Health and beauty',
#                    'Product line_Home and lifestyle', 'Product line_Sports and travel',
#                    'Payment_Cash', 'Payment_Credit card', 'Payment_Ewallet']

# preprocessed_dataset[boolean_columns] = preprocessed_dataset[boolean_columns].astype(int)

In [36]:
# # Displaying the preprocessed dataset
# print(preprocessed_dataset.head())

   Quantity      cogs  gross income     Total  Branch_A  Branch_B  Branch_C  \
0  0.509930  0.919607      0.919607  548.9715         1         0         0   
1 -0.174540 -0.987730     -0.987730   80.2200         0         0         1   
2  0.509930  0.071446      0.071446  340.5255         1         0         0   
3  0.852165  0.675780      0.675780  489.0480         1         0         0   
4  0.509930  1.267125      1.267125  634.3785         1         0         0   

   Customer type_Member  Customer type_Normal  Gender_Female  Gender_Male  \
0                     1                     0              1            0   
1                     0                     1              1            0   
2                     0                     1              0            1   
3                     1                     0              0            1   
4                     0                     1              0            1   

   Product line_Electronic accessories  Product line_Fashion a

In [37]:
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, r2_score

# # Splitting the data into training and testing sets
# X = preprocessed_dataset.drop(columns=['Total'])  # Make sure 'total' is replaced with the actual column name of your target variable
# y = preprocessed_dataset['Total']  # Make sure 'total' is replaced with the actual column name of your target variable

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [38]:
# # Creating and training the Linear Regression model
# model = LinearRegression()
# model.fit(X_train, y_train)

In [39]:
# # Making predictions on the test set
# y_pred = model.predict(X_test)

In [40]:
# # Evaluating the model
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

In [41]:
# print("Mean Squared Error:", mse)
# print("R-squared:", r2)


Mean Squared Error: 2.3815458293132676e-26
R-squared: 1.0


In [42]:
# print("Coefficients:", model.coef_)
# print("Intercept:", model.intercept_)

Coefficients: [-9.54402386e-14  1.22881181e+02  1.22881181e+02 -2.32179980e-14
  6.90621075e-15  1.16101677e-14  2.70996719e-14 -2.69886496e-14
 -7.24983453e-15  5.47347769e-15  3.22637670e-15 -4.04975799e-15
 -2.03891801e-15  6.46559647e-16  3.30905186e-15  7.17808289e-16
  1.92234682e-14  6.39109172e-18 -2.21888311e-14]
Intercept: 322.96674899999994


In [43]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor
# # Create and train the RandomForestRegressor model
# # rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# # rf_model.fit(X_train, y_train)

# # Define the hyperparameter grid
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }




In [44]:
# Create the grid search object
# grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
#


In [45]:
# Perform the grid search
# grid_search.fit(X_train, y_train)



In [46]:
# Get the best hyperparameters
# best_params = grid_search.best_params_


# best_params

{'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200}

In [47]:
# y_pred_rf = grid_search.predict(X_test)

In [50]:
# mse_rf = mean_squared_error(y_test, y_pred_rf)
# r2_rf = r2_score(y_test, y_pred_rf)
# mae_rf = mean_absolute_error(y_test, y_pred_rf)

# print("Random Forest Regression Model Performance:")
# print("Mean Squared Error:", mse_rf)
# print("R-squared:", r2_rf)
# print("Mean Absolute Error:", mae_rf)

Random Forest Regression Model Performance:
Mean Squared Error: 1.4852512487137546
R-squared: 0.999977170704991
Mean Absolute Error: 0.7206559500000256


In [52]:
# # Splitting the data into training and testing sets
# X = preprocessed_dataset.drop(columns=['Total'])
# y = preprocessed_dataset['Total']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
# # Creating and training the Decision Tree Regressor model with hyperparameters
# decision_tree_model = DecisionTreeRegressor(max_depth=5, min_samples_split=2, min_samples_leaf=1, random_state=42)
# decision_tree_model.fit(X_train, y_train)

In [54]:
# # Making predictions
# y_pred = decision_tree_model.predict(X_test)

In [55]:
# # Evaluating the model
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print("Decision Tree Regression Model Performance:")
# print("Mean Squared Error:", mse)
# print("R-squared:", r2)

Decision Tree Regression Model Performance:
Mean Squared Error: 74.53971433403893
R-squared: 0.9988542752413828


In [56]:
# # Define the parameter grid for hyperparameter tuning
# param_grid = {
#     'alpha': [0.01, 0.1, 1, 10, 100]  # Regularization strength
# }


In [57]:
# # Initialize Ridge Regression model
# ridge = Ridge()

In [58]:
# # Initialize Lasso Regression model
# lasso = Lasso()

In [59]:
# # Perform GridSearchCV for Ridge Regression
# ridge_grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
# ridge_grid_search.fit(X_train, y_train)

In [60]:
# # Perform GridSearchCV for Lasso Regression
# lasso_grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error')
# lasso_grid_search.fit(X_train, y_train)

In [61]:
# # Get the best parameters and best estimator for Ridge Regression
# best_params_ridge = ridge_grid_search.best_params_
# best_estimator_ridge = ridge_grid_search.best_estimator_

In [62]:
# # Get the best parameters and best estimator for Lasso Regression
# best_params_lasso = lasso_grid_search.best_params_
# best_estimator_lasso = lasso_grid_search.best_estimator_

In [63]:
# # Print the best parameters for Ridge Regression
# print("Best Parameters for Ridge Regression:", best_params_ridge)

Best Parameters for Ridge Regression: {'alpha': 0.01}


In [64]:
# # Print the best parameters for Lasso Regression
# print("Best Parameters for Lasso Regression:", best_params_lasso)

Best Parameters for Lasso Regression: {'alpha': 0.01}


In [65]:
# # Predict using the best Ridge Regression model
# y_pred_ridge = best_estimator_ridge.predict(X_test)

In [66]:
# # Calculate R^2 score for Ridge Regression
# r2_ridge = r2_score(y_test, y_pred_ridge)
# print("R^2 score for Ridge Regression:", r2_ridge)

R^2 score for Ridge Regression: 0.9999999999190439


In [67]:
# # Predict using the best Lasso Regression model
# y_pred_lasso = best_estimator_lasso.predict(X_test)

In [68]:
# # Calculate R^2 score for Lasso Regression
# r2_lasso = r2_score(y_test, y_pred_lasso)
# print("R^2 score for Lasso Regression:", r2_lasso)

R^2 score for Lasso Regression: 0.999999996871929


In [69]:
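# NOTE: this cell previously printed the RMSE under both the "Mean Squared
# Error" and "Mean Absolute Error" labels, so any output recorded before this
# fix shows the same RMSE value on both lines for each model.

# # For Ridge Regression
# ridge_mae = mean_absolute_error(y_test, y_pred_ridge)
# ridge_mse = mean_squared_error(y_test, y_pred_ridge)
# ridge_rmse = np.sqrt(ridge_mse)
# print("Mean Squared Error for Ridge Regression:", ridge_mse)
# print("Root Mean Squared Error for Ridge Regression:", ridge_rmse)
# print("Mean Absolute Error for Ridge Regression:", ridge_mae)

# # For Lasso Regression
# lasso_mae = mean_absolute_error(y_test, y_pred_lasso)
# lasso_mse = mean_squared_error(y_test, y_pred_lasso)
# lasso_rmse = np.sqrt(lasso_mse)
# print("Mean Squared Error for Lasso Regression:", lasso_mse)
# print("Root Mean Squared Error for Lasso Regression:", lasso_rmse)
# print("Mean Absolute Error for Lasso Regression:", lasso_mae)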


Mean Squared Error for Ridge Regression: 0.0022949775380543343
Mean Absolute Error for Ridge Regression: 0.0022949775380543343
Mean Squared Error for Lasso Regression: 0.014265664785514968
Mean Absolute Error for Lasso Regression: 0.014265664785514968
