## Importing the libraries

In [46]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Importing the dataset

In [47]:

# Read the sales export; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="supermarket_sales.csv")
df.head(n=5)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,01-05-2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,03-08-2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,03-03-2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,1/27/2019,20:33,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,02-08-2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3


In [48]:
# Discard the columns that are not needed for the regression below.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so a
# second execution would otherwise raise KeyError on the already-removed labels.
unused_columns = [
    'Invoice ID', 'Branch', 'City', 'Customer type', 'Gender', 'Product line',
    'Tax 5%', 'Time', 'Payment', 'cogs', 'gross margin percentage',
    'gross income', 'Rating',
]
df = df.drop(columns=unused_columns, errors='ignore')
df.head()



Unnamed: 0,Unit price,Quantity,Total,Date
0,74.69,7,548.9715,01-05-2019
1,15.28,5,80.22,03-08-2019
2,46.33,7,340.5255,03-03-2019
3,58.22,8,489.048,1/27/2019
4,86.31,7,634.3785,02-08-2019


In [49]:
# Parse the date strings; format='mixed' lets each value use its own layout,
# and dayfirst=False reads ambiguous values such as 01-05-2019 as month-first.
df['Date'] = pd.to_datetime(df['Date'], format='mixed', dayfirst=False)

# Split the parsed date into numeric calendar parts (added in Year, Month, Day order).
date_parts = df['Date'].dt
df['Year'], df['Month'], df['Day'] = date_parts.year, date_parts.month, date_parts.day
df.head()

Unnamed: 0,Unit price,Quantity,Total,Date,Year,Month,Day
0,74.69,7,548.9715,2019-01-05,2019,1,5
1,15.28,5,80.22,2019-03-08,2019,3,8
2,46.33,7,340.5255,2019-03-03,2019,3,3
3,58.22,8,489.048,2019-01-27,2019,1,27
4,86.31,7,634.3785,2019-02-08,2019,2,8


In [50]:
# Predictors: the three calendar parts plus unit price and quantity.
feature_columns = ['Year', 'Month', 'Day', 'Unit price', 'Quantity']
X = df[feature_columns]
# Target: the 'Total' column.
y = df['Total']

In [51]:
# Count nulls per feature column and in the target before modelling.
missing_in_X = X.isna().sum()
missing_in_y = y.isna().sum()
print("Missing values in X:\n", missing_in_X)
print("Missing values in y:\n", missing_in_y)

Missing values in X:
 Year          0
Month         0
Day           0
Unit price    0
Quantity      0
dtype: int64
Missing values in y:
 0


## Splitting the dataset into the Training set and Test set

In [52]:
from sklearn.model_selection import GridSearchCV, train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Feature Scaling

In [53]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits so the test rows never influence the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Train the regression models

In [54]:
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Standalone estimator instances: plain least squares plus the two
# regularised variants, both created with alpha=0.1.
lr_model = LinearRegression()
lasso_model, ridge_model = Lasso(alpha=0.1), Ridge(alpha=0.1)

In [55]:

def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training split and report RMSE and R^2 for both splits.

    ``model.fit`` is called on the object passed in, so the caller's estimator
    is the one that gets trained.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Features used for fitting and for the training scores.
        X_test: Features used for the testing scores.
        y_train: Target values matching ``X_train``.
        y_test: Target values matching ``X_test``.
        model_name: Label printed in the results header.

    Returns:
        dict with the keys ``'train_rmse'``, ``'test_rmse'``, ``'train_r2'``
        and ``'test_r2'`` (in that order).
    """
    model.fit(X_train, y_train)

    # Score each split the same way: RMSE is the square root of the MSE.
    rmse, r2 = {}, {}
    for split, features, target in (
        ('train', X_train, y_train),
        ('test', X_test, y_test),
    ):
        predicted = model.predict(features)
        rmse[split] = np.sqrt(mean_squared_error(target, predicted))
        r2[split] = r2_score(target, predicted)

    report_lines = [
        f"\n{model_name} Results:",
        f"Training RMSE: {rmse['train']:.4f}",
        f"Testing RMSE: {rmse['test']:.4f}",
        f"Training R^2: {r2['train']:.4f}",
        f"Testing R^2: {r2['test']:.4f}",
    ]
    print("\n".join(report_lines))

    return {
        'train_rmse': rmse['train'],
        'test_rmse': rmse['test'],
        'train_r2': r2['train'],
        'test_r2': r2['test'],
    }


In [56]:
# Basic Model Training and Evaluation
print("\n=== Basic Models ===")

# One fresh estimator per regression variant; each key doubles as the label
# printed by evaluate_model.
models = dict(
    [
        ('Linear Regression', LinearRegression()),
        ('Ridge Regression', Ridge(alpha=0.1)),
        ('Lasso Regression', Lasso(alpha=0.1)),
        ('ElasticNet Regression', ElasticNet(alpha=0.1, l1_ratio=0.5)),
    ]
)

# Fit and score every model on the scaled splits, keeping the metrics by name.
results = {}
for name, model in models.items():
    results[name] = evaluate_model(
        model, X_train_scaled, X_test_scaled, y_train, y_test, name
    )

# Put the fitted Lasso and Ridge coefficients side by side, one row per feature.
feature_importance = pd.DataFrame({'Feature': X.columns})
feature_importance['Lasso Coefficients'] = models['Lasso Regression'].coef_
feature_importance['Ridge Coefficients'] = models['Ridge Regression'].coef_
print("\nFeature Importance:")
print(feature_importance)

# Show training and testing R² next to each other for every model.
print("\nOverfitting Analysis:")
for name, result in results.items():
    print(
        "{} - Train R²: {:.4f}, Test R²: {:.4f}".format(
            name, result['train_r2'], result['test_r2']
        )
    )


=== Basic Models ===

Linear Regression Results:
Training RMSE: 82.0908
Testing RMSE: 78.9438
Training R^2: 0.8861
Testing R^2: 0.9042

Ridge Regression Results:
Training RMSE: 82.0908
Testing RMSE: 78.9466
Training R^2: 0.8861
Testing R^2: 0.9042

Lasso Regression Results:
Training RMSE: 82.0910
Testing RMSE: 78.9584
Training R^2: 0.8861
Testing R^2: 0.9042

ElasticNet Regression Results:
Training RMSE: 82.8305
Testing RMSE: 80.7949
Training R^2: 0.8840
Testing R^2: 0.8997

Feature Importance:
      Feature  Lasso Coefficients  Ridge Coefficients
0        Year            0.000000            0.000000
1       Month            0.151369            0.246061
2         Day           -2.330910           -2.420486
3  Unit price          152.256191          152.343261
4    Quantity          171.824516          171.902668

Overfitting Analysis:
Linear Regression - Train R²: 0.8861, Test R²: 0.9042
Ridge Regression - Train R²: 0.8861, Test R²: 0.9042
Lasso Regression - Train R²: 0.8861, Test R²: