In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("100_sales.csv")

In [3]:
df

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Ship_Date,Unit_Cost,Total_Revenue,Total_Profit,Unnamed: 9,Unnamed: 10
0,Australia and Oceania,Tuvalu,Baby Food,Offline,H,27/06/2010,159.42,2533654.00,951410.50,,
1,Central America and the Caribbean,Grenada,Cereal,Online,C,15/09/2012,117.11,576782.80,248406.36,,
2,Europe,Russia,Office Supplies,Offline,L,05/08/2014,524.96,1158502.59,224598.75,,
3,Sub_Saharan Africa,Sao Tome and Principe,Fruits,Online,C,07/05/2014,6.92,75591.66,19525.82,,
4,Sub_Saharan Africa,Rwanda,Office Supplies,Offline,L,02/06/2013,524.96,3296425.02,639077.50,,
...,...,...,...,...,...,...,...,...,...,...,...
95,Sub_Saharan Africa,Mali,Clothes,Online,M,09/03/2011,35.84,97040.64,65214.72,,
96,Asia,Malaysia,Fruits,Offline,L,28/12/2011,6.92,58471.11,15103.47,,
97,Sub_Saharan Africa,Sierra Leone,Vegetables,Offline,C,29/06/2016,90.93,228779.10,93748.05,,
98,North America,Mexico,Personal Care,Offline,M,08/08/2015,56.67,471336.91,144521.02,,


In [4]:
df.isnull().sum()

Region              0
Country             0
Item_Type           0
Sales_Channel       0
Order_Priority      0
Ship_Date           0
Unit_Cost           0
Total_Revenue       0
Total_Profit        0
Unnamed: 9        100
Unnamed: 10       100
dtype: int64

In [5]:
df = df.drop(columns=['Unnamed: 9', 'Unnamed: 10'])

In [6]:
df = df.dropna()

In [7]:
X = df.drop(columns=['Total_Revenue'])

In [8]:
y = df['Total_Revenue']

In [9]:
y

0     2533654.00
1      576782.80
2     1158502.59
3       75591.66
4     3296425.02
         ...    
95      97040.64
96      58471.11
97     228779.10
98     471336.91
99    3586605.09
Name: Total_Revenue, Length: 100, dtype: float64

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

In [11]:
X = pd.get_dummies(X, drop_first=True)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
alphas = [0.111] + list(range(1, 13))
results = []

In [16]:

for alpha in alphas:
    model = Lasso(alpha=alpha, max_iter=10000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    results.append({
        'Alpha': alpha,
        'MSE': mse,
        'MAE': mae,
        'RMSE': rmse
    })

In [17]:
results_df = pd.DataFrame(results)

In [18]:
results_df = results_df.sort_values(by='RMSE')

In [20]:
results_df

Unnamed: 0,Alpha,MSE,MAE,RMSE
11,11.0,132268200000.0,231921.435307,363686.978132
10,10.0,132571600000.0,231486.793915,364103.881919
12,12.0,134019400000.0,235150.331151,366086.644157
9,9.0,135989800000.0,230547.054873,368767.892708
8,8.0,145193400000.0,230589.02073,381042.539744
7,7.0,145430100000.0,230649.072793,381353.037459
6,6.0,145452000000.0,230665.300652,381381.636225
5,5.0,145474900000.0,230680.672523,381411.782612
4,4.0,249031000000.0,313282.305972,499030.078533
3,3.0,350761500000.0,395855.204954,592251.255652


In [21]:
best_alpha = results_df.iloc[0]['Alpha']

In [24]:
print(f" Best alpha (lowest RMSE): {best_alpha}")

 Best alpha (lowest RMSE): 11.0
