## Sales Forecasting


### Reg No: IT21127946
### Name: Christy H.M

<hr/>

<ul>
    <li><b>Target Variable:</b> Total sales (considering the Total attribute)</li>
    <li><b>Predictors:</b> Branch, Customer type, Gender, Product line, Quantity, Payment, COGS, and Gross income (Date and Time are available in the dataset but are not selected as features in this notebook)</li>
    <li><b>Objective:</b> Predict the total sales amount based on various factors such as the type of products, customer demographics, and purchase details.</li>
</ul>




In [161]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor

In [162]:
# Load the supermarket sales CSV (the path is relative to the working directory).
dataset = pd.read_csv('../dataset/supermarket_sales.csv')

In [None]:
# Display the raw DataFrame exactly as it was read from the CSV.
dataset

In [164]:
# Columns carried into modelling: the predictors followed by the 'Total' target.
selected_columns = [
    'Branch',
    'Customer type',
    'Gender',
    'Product line',
    'Quantity',
    'Payment',
    'cogs',
    'gross income',
    'Total',
]

In [165]:
# Take an independent copy limited to the selected columns so `dataset` itself is never modified.
preprocessed_dataset = dataset.loc[:, selected_columns].copy()

In [None]:
# Inspect the column-reduced frame before any cleaning is applied.
preprocessed_dataset

In [167]:
# Discard every row that contains at least one missing value.
preprocessed_dataset = preprocessed_dataset.dropna()

In [None]:
# Inspect the frame again after rows with missing values have been dropped.
preprocessed_dataset

In [169]:
# One-hot encode the categorical predictors: each listed column is replaced by
# one indicator column per category value.
categorical_columns = ['Branch', 'Customer type', 'Gender', 'Product line', 'Payment']
preprocessed_dataset = pd.get_dummies(preprocessed_dataset, columns=categorical_columns)

In [170]:
# Standardise the numeric predictors (optional, depending on the model chosen).
# NOTE(review): the scaler is fitted on the whole dataset before the train/test
# split, so test rows contribute to the scaling statistics; consider fitting it
# on the training split only.
numeric_columns = ['Quantity', 'cogs', 'gross income']

scaler = StandardScaler()
scaled_values = scaler.fit_transform(preprocessed_dataset[numeric_columns])
preprocessed_dataset[numeric_columns] = scaled_values

In [171]:
# Convert the one-hot indicator columns to 0 and 1.
# The indicator columns are discovered by dtype instead of being spelled out by
# name, so the cell keeps working when a category is absent from the data or a
# new one appears. pd.get_dummies yields bool columns in recent pandas and
# uint8 columns in older releases, hence both dtypes are selected.
boolean_columns = preprocessed_dataset.select_dtypes(include=['bool', 'uint8']).columns.tolist()

# Nothing to convert if no indicator columns were found.
if boolean_columns:
    preprocessed_dataset[boolean_columns] = preprocessed_dataset[boolean_columns].astype(int)

In [None]:
# Show the first rows of the fully preprocessed dataset.
# Left as the cell's last expression so the notebook renders it as a rich table
# rather than the plain-text dump produced by print().
preprocessed_dataset.head()

In [173]:
# Separate the predictors (every column except 'Total') from the target ('Total').
# NOTE(review): 'cogs' and 'gross income' are kept as predictors; if they are
# arithmetically derived from 'Total' in this dataset, they leak the target and
# the scores below will be unrealistically good - confirm against the data.
X = preprocessed_dataset.drop(columns=['Total'])
y = preprocessed_dataset['Total']

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [174]:
# Fit an ordinary least-squares linear regression on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [175]:
# Predict 'Total' for the held-out test rows with the fitted linear model.
y_pred = model.predict(X=X_test)

In [176]:
# Score the linear model's test predictions: mean squared error and R^2.
mse, r2 = (
    mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

In [None]:
# Report the linear model's test metrics, one per line.
for label, value in (("Mean Squared Error:", mse), ("R-squared:", r2)):
    print(label, value)


In [None]:
# Show the fitted linear model's learned parameters.
for label, value in (("Coefficients:", model.coef_), ("Intercept:", model.intercept_)):
    print(label, value)

In [None]:
# Hyperparameter grid explored when tuning the random forest.
# Four parameters with three candidate values each give 3 * 3 * 3 * 3 = 81 combinations.
# (RandomForestRegressor and GridSearchCV come from the notebook's import cell.)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}




In [None]:
# Build the grid search over `param_grid`: 5-fold cross-validation, scored by
# negative mean squared error, with n_jobs=-1 for parallel execution.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)



In [None]:
# Run the grid search on the training split only; the test split stays unseen.
# This is the notebook's most expensive cell: every combination in the grid is
# cross-validated with cv=5.
grid_search.fit(X_train, y_train)



In [None]:
# Keep the best-scoring hyperparameter combination and display it.
best_params = grid_search.best_params_
best_params

In [180]:
# Predict 'Total' for the test rows through the fitted grid-search object.
y_pred_rf = grid_search.predict(X_test)

In [None]:
# Score the tuned random forest on the test split (MSE, R^2 and MAE) and report the values.
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print("Random Forest Regression Model Performance:")
rf_report = (
    ("Mean Squared Error:", mse_rf),
    ("R-squared:", r2_rf),
    ("Mean Absolute Error:", mae_rf),
)
for label, value in rf_report:
    print(label, value)

In [182]:
# Rebuild the predictors/target and repeat the 80/20 split.
# The arguments and seed match the earlier split, so the partition is the same.
y = preprocessed_dataset['Total']
X = preprocessed_dataset.drop(columns=['Total'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [183]:
# Fit a decision tree regressor with fixed hyperparameters (depth capped at 5) on the training split.
decision_tree_model = DecisionTreeRegressor(
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
decision_tree_model.fit(X=X_train, y=y_train)

In [184]:
# Predict 'Total' for the test rows with the fitted decision tree.
# NOTE: this reuses the name `y_pred`, replacing the linear-regression
# predictions stored under it earlier in the notebook.
y_pred = decision_tree_model.predict(X_test)

In [None]:
# Score the decision tree's test predictions; `mse` and `r2` now hold the tree's
# values, replacing the linear model's.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Decision Tree Regression Model Performance:")
for label, value in (("Mean Squared Error:", mse), ("R-squared:", r2)):
    print(label, value)

In [None]:
# Candidate regularization strengths searched for both Ridge and Lasso.
# This rebinds `param_grid`, which previously held the random-forest grid.
param_grid = dict(alpha=[0.01, 0.1, 1, 10, 100])


In [None]:
# Base Ridge estimator with default settings; it is passed to the grid search
# below together with the `alpha` candidates.
ridge = Ridge()

In [None]:
# Base Lasso estimator with default settings; it is passed to the grid search
# below together with the `alpha` candidates.
lasso = Lasso()

In [None]:
# Tune Ridge: 5-fold cross-validated search over `param_grid`, scored by negative MSE.
ridge_grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
ridge_grid_search.fit(X=X_train, y=y_train)

In [None]:
# Tune Lasso: 5-fold cross-validated search over `param_grid`, scored by negative MSE.
lasso_grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
lasso_grid_search.fit(X=X_train, y=y_train)

In [None]:
# Pull the best parameters and best estimator out of the Ridge search.
best_params_ridge, best_estimator_ridge = (
    ridge_grid_search.best_params_,
    ridge_grid_search.best_estimator_,
)

In [None]:
# Pull the best parameters and best estimator out of the Lasso search.
best_params_lasso, best_estimator_lasso = (
    lasso_grid_search.best_params_,
    lasso_grid_search.best_estimator_,
)

In [None]:
# Report the parameter setting the Ridge grid search selected as best.
print("Best Parameters for Ridge Regression:", best_params_ridge)

In [None]:
# Report the parameter setting the Lasso grid search selected as best.
print("Best Parameters for Lasso Regression:", best_params_lasso)

In [None]:
# Predict 'Total' for the test rows with the best Ridge estimator from the search.
y_pred_ridge = best_estimator_ridge.predict(X_test)

In [None]:
# Compute and report R^2 of the Ridge predictions against the true test targets.
r2_ridge = r2_score(y_test, y_pred_ridge)
print("R^2 score for Ridge Regression:", r2_ridge)

In [None]:
# Predict 'Total' for the test rows with the best Lasso estimator from the search.
y_pred_lasso = best_estimator_lasso.predict(X_test)

In [None]:
# Compute and report R^2 of the Lasso predictions against the true test targets.
r2_lasso = r2_score(y_test, y_pred_lasso)
print("R^2 score for Lasso Regression:", r2_lasso)

In [None]:
# Test-set error metrics for the tuned Ridge and Lasso models.
# Each metric is printed under its own label: MSE, RMSE (square root of MSE)
# and MAE. The RMSE value used to be printed under both the "Mean Squared Error"
# and "Mean Absolute Error" labels, so the real MSE and MAE were never shown.

# For Ridge Regression
ridge_mae = mean_absolute_error(y_test, y_pred_ridge)
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_rmse = np.sqrt(ridge_mse)
print("Mean Squared Error for Ridge Regression:", ridge_mse)
print("Root Mean Squared Error for Ridge Regression:", ridge_rmse)
print("Mean Absolute Error for Ridge Regression:", ridge_mae)

# For Lasso Regression
lasso_mae = mean_absolute_error(y_test, y_pred_lasso)
lasso_mse = mean_squared_error(y_test, y_pred_lasso)
lasso_rmse = np.sqrt(lasso_mse)
print("Mean Squared Error for Lasso Regression:", lasso_mse)
print("Root Mean Squared Error for Lasso Regression:", lasso_rmse)
print("Mean Absolute Error for Lasso Regression:", lasso_mae)
