## Sales Forecasting


### Reg No: IT21127946
### Name: Christy H.M

<hr/>

<ul>
    <li><b>Target Variable:</b> Total sales (considering the Total attribute)</li>
    <li><b>Predictors:</b> Branch, Customer type, Gender, Product line, Quantity, Date, Time, Payment, COGS, and Gross income</li>
    <li><b>Objective:</b> Predict the total sales amount based on various factors such as the type of products, customer demographics, and purchase details.</li>
</ul>




In [121]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso

In [122]:
dataset = pd.read_csv(
    '../dataset/supermarket_sales.csv')

In [123]:
dataset

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.8200,80.2200,3/8/2019,10:29,Cash,76.40,4.761905,3.8200,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.2880,489.0480,1/27/2019,20:33,Ewallet,465.76,4.761905,23.2880,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,233-67-5758,C,Naypyitaw,Normal,Male,Health and beauty,40.35,1,2.0175,42.3675,1/29/2019,13:46,Ewallet,40.35,4.761905,2.0175,6.2
996,303-96-2227,B,Mandalay,Normal,Female,Home and lifestyle,97.38,10,48.6900,1022.4900,3/2/2019,17:16,Ewallet,973.80,4.761905,48.6900,4.4
997,727-02-1313,A,Yangon,Member,Male,Food and beverages,31.84,1,1.5920,33.4320,2/9/2019,13:22,Cash,31.84,4.761905,1.5920,7.7
998,347-56-2442,A,Yangon,Normal,Male,Home and lifestyle,65.82,1,3.2910,69.1110,2/22/2019,15:33,Cash,65.82,4.761905,3.2910,4.1


In [124]:
# Selecting relevant columns
selected_columns = ['Branch', 'Customer type', 'Gender', 'Product line', 'Quantity', 
                    'Payment', 'cogs', 'gross income','Total']

In [125]:
# Creating a new DataFrame with selected columns
preprocessed_dataset = dataset[selected_columns].copy()

In [126]:
#Checking preprocessed dataframe
preprocessed_dataset

Unnamed: 0,Branch,Customer type,Gender,Product line,Quantity,Payment,cogs,gross income,Total
0,A,Member,Female,Health and beauty,7,Ewallet,522.83,26.1415,548.9715
1,C,Normal,Female,Electronic accessories,5,Cash,76.40,3.8200,80.2200
2,A,Normal,Male,Home and lifestyle,7,Credit card,324.31,16.2155,340.5255
3,A,Member,Male,Health and beauty,8,Ewallet,465.76,23.2880,489.0480
4,A,Normal,Male,Sports and travel,7,Ewallet,604.17,30.2085,634.3785
...,...,...,...,...,...,...,...,...,...
995,C,Normal,Male,Health and beauty,1,Ewallet,40.35,2.0175,42.3675
996,B,Normal,Female,Home and lifestyle,10,Ewallet,973.80,48.6900,1022.4900
997,A,Member,Male,Food and beverages,1,Cash,31.84,1.5920,33.4320
998,A,Normal,Male,Home and lifestyle,1,Cash,65.82,3.2910,69.1110


In [127]:
# Handling missing values (if any)
preprocessed_dataset.dropna(inplace=True) 

In [128]:
#Preprocessed data
preprocessed_dataset

Unnamed: 0,Branch,Customer type,Gender,Product line,Quantity,Payment,cogs,gross income,Total
0,A,Member,Female,Health and beauty,7,Ewallet,522.83,26.1415,548.9715
1,C,Normal,Female,Electronic accessories,5,Cash,76.40,3.8200,80.2200
2,A,Normal,Male,Home and lifestyle,7,Credit card,324.31,16.2155,340.5255
3,A,Member,Male,Health and beauty,8,Ewallet,465.76,23.2880,489.0480
4,A,Normal,Male,Sports and travel,7,Ewallet,604.17,30.2085,634.3785
...,...,...,...,...,...,...,...,...,...
995,C,Normal,Male,Health and beauty,1,Ewallet,40.35,2.0175,42.3675
996,B,Normal,Female,Home and lifestyle,10,Ewallet,973.80,48.6900,1022.4900
997,A,Member,Male,Food and beverages,1,Cash,31.84,1.5920,33.4320
998,A,Normal,Male,Home and lifestyle,1,Cash,65.82,3.2910,69.1110


In [129]:
# Encoding categorical variables using one-hot encoding
preprocessed_dataset = pd.get_dummies(preprocessed_dataset, columns=['Branch', 'Customer type', 'Gender', 'Product line', 'Payment'])

In [130]:
# Scaling numerical features (optional, depending on the model you choose)

scaler = StandardScaler()
preprocessed_dataset[['Quantity', 'cogs', 'gross income']] = scaler.fit_transform(preprocessed_dataset[['Quantity', 'cogs', 'gross income']])

In [131]:
# Convert boolean columns to 0 and 1
boolean_columns = ['Branch_A', 'Branch_B', 'Branch_C',
                   'Customer type_Member', 'Customer type_Normal',
                   'Gender_Female', 'Gender_Male',
                   'Product line_Electronic accessories', 'Product line_Fashion accessories',
                   'Product line_Food and beverages', 'Product line_Health and beauty',
                   'Product line_Home and lifestyle', 'Product line_Sports and travel',
                   'Payment_Cash', 'Payment_Credit card', 'Payment_Ewallet']

preprocessed_dataset[boolean_columns] = preprocessed_dataset[boolean_columns].astype(int)

In [132]:
# Displaying the preprocessed dataset
print(preprocessed_dataset.head())

   Quantity      cogs  gross income     Total  Branch_A  Branch_B  Branch_C  \
0  0.509930  0.919607      0.919607  548.9715         1         0         0   
1 -0.174540 -0.987730     -0.987730   80.2200         0         0         1   
2  0.509930  0.071446      0.071446  340.5255         1         0         0   
3  0.852165  0.675780      0.675780  489.0480         1         0         0   
4  0.509930  1.267125      1.267125  634.3785         1         0         0   

   Customer type_Member  Customer type_Normal  Gender_Female  Gender_Male  \
0                     1                     0              1            0   
1                     0                     1              1            0   
2                     0                     1              0            1   
3                     1                     0              0            1   
4                     0                     1              0            1   

   Product line_Electronic accessories  Product line_Fashion a

In [133]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Splitting the data into training and testing sets
X = preprocessed_dataset.drop(columns=['Total'])  # Make sure 'total' is replaced with the actual column name of your target variable
y = preprocessed_dataset['Total']  # Make sure 'total' is replaced with the actual column name of your target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [134]:
# Creating and training the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [136]:
# Making predictions on the test set
y_pred = model.predict(X_test)

In [140]:
# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [141]:
print("Mean Squared Error:", mse)
print("R-squared:", r2)


In [142]:
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

In [145]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
# Create and train the RandomForestRegressor mode
# rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# rf_model.fit(X_train, y_train)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


# Create the grid search object
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_


best_params

{'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200}

In [146]:
y_pred_rf = grid_search.predict(X_test)

In [147]:
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print("Random Forest Regression Model Performance:")
print("Mean Squared Error:", mse_rf)
print("R-squared:", r2_rf)
print("Mean Absolute Error:", mae_rf)

Random Forest Regression Model Performance:
Mean Squared Error: 1.4852512487137546
R-squared: 0.999977170704991
Mean Absolute Error: 0.7206559500000256


In [148]:
# Splitting the data into training and testing sets
X = preprocessed_dataset.drop(columns=['Total'])
y = preprocessed_dataset['Total']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [149]:
# Creating and training the Decision Tree Regressor model with hyperparameters
decision_tree_model = DecisionTreeRegressor(max_depth=5, min_samples_split=2, min_samples_leaf=1, random_state=42)
decision_tree_model.fit(X_train, y_train)

In [150]:
# Making predictions
y_pred = decision_tree_model.predict(X_test)

In [151]:
# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Decision Tree Regression Model Performance:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Decision Tree Regression Model Performance:
Mean Squared Error: 74.53971433403893
R-squared: 0.9988542752413828


In [152]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100]  # Regularization strength
}

# Initialize Ridge Regression model
ridge = Ridge()

# Initialize Lasso Regression model
lasso = Lasso()

# Perform GridSearchCV for Ridge Regression
ridge_grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
ridge_grid_search.fit(X_train, y_train)

# Perform GridSearchCV for Lasso Regression
lasso_grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error')
lasso_grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator for Ridge Regression
best_params_ridge = ridge_grid_search.best_params_
best_estimator_ridge = ridge_grid_search.best_estimator_

# Get the best parameters and best estimator for Lasso Regression
best_params_lasso = lasso_grid_search.best_params_
best_estimator_lasso = lasso_grid_search.best_estimator_

# Print the best parameters for Ridge Regression
print("Best Parameters for Ridge Regression:", best_params_ridge)

# Print the best parameters for Lasso Regression
print("Best Parameters for Lasso Regression:", best_params_lasso)


Best Parameters for Ridge Regression: {'alpha': 0.01}
Best Parameters for Lasso Regression: {'alpha': 0.01}


In [153]:
# Predict using the best Ridge Regression model
y_pred_ridge = best_estimator_ridge.predict(X_test)

# Calculate R^2 score for Ridge Regression
r2_ridge = r2_score(y_test, y_pred_ridge)
print("R^2 score for Ridge Regression:", r2_ridge)

# Predict using the best Lasso Regression model
y_pred_lasso = best_estimator_lasso.predict(X_test)

# Calculate R^2 score for Lasso Regression
r2_lasso = r2_score(y_test, y_pred_lasso)
print("R^2 score for Lasso Regression:", r2_lasso)

R^2 score for Ridge Regression: 0.9999999999190439
R^2 score for Lasso Regression: 0.999999996871929


In [160]:
# For Ridge Regression
ridge_mae = mean_absolute_error(y_test, y_pred_ridge)
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_rmse = np.sqrt(ridge_mse)
print("Mean Squared Error for Ridge Regression:", ridge_rmse)
print("Mean Absolute Error for Ridge Regression:", ridge_rmse)

# For Lasso Regression
lasso_mae = mean_absolute_error(y_test, y_pred_lasso)
lasso_mse = mean_squared_error(y_test, y_pred_lasso)
lasso_rmse = np.sqrt(lasso_mse)
print("Mean Squared Error for Lasso Regression:", lasso_rmse)
print("Mean Absolute Error for Lasso Regression:", lasso_rmse)


Mean Squared Error for Ridge Regression: 0.0022949775380543343
Mean Absolute Error for Ridge Regression: 0.0022949775380543343
Mean Squared Error for Lasso Regression: 0.014265664785514968
Mean Absolute Error for Lasso Regression: 0.014265664785514968
