In [14]:
import pandas as pd

data = pd.read_csv("retail_sales_dataset.csv")

data

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100
...,...,...,...,...,...,...,...,...,...
995,996,2023-05-16,CUST996,Male,62,Clothing,1,50,50
996,997,2023-11-17,CUST997,Male,52,Beauty,3,30,90
997,998,2023-10-29,CUST998,Female,23,Beauty,4,25,100
998,999,2023-12-05,CUST999,Female,36,Electronics,3,50,150


In [18]:
data1 = data[["Price per Unit","Quantity","Age","Gender","Product Category","Total Amount"]]

data1

Unnamed: 0,Price per Unit,Quantity,Age,Gender,Product Category,Total Amount
0,50,3,34,Male,Beauty,150
1,500,2,26,Female,Clothing,1000
2,30,1,50,Male,Electronics,30
3,500,1,37,Male,Clothing,500
4,50,2,30,Male,Beauty,100
...,...,...,...,...,...,...
995,50,1,62,Male,Clothing,50
996,30,3,52,Male,Beauty,90
997,25,4,23,Female,Beauty,100
998,50,3,36,Female,Electronics,150


In [20]:
# one hot encoding for Product cat

data2 = pd.get_dummies(data1, columns=["Product Category"], drop_first=True)

data2


Unnamed: 0,Price per Unit,Quantity,Age,Gender,Total Amount,Product Category_Clothing,Product Category_Electronics
0,50,3,34,Male,150,False,False
1,500,2,26,Female,1000,True,False
2,30,1,50,Male,30,False,True
3,500,1,37,Male,500,True,False
4,50,2,30,Male,100,False,False
...,...,...,...,...,...,...,...
995,50,1,62,Male,50,True,False
996,30,3,52,Male,90,False,False
997,25,4,23,Female,100,False,False
998,50,3,36,Female,150,False,True


In [22]:
# Lable encoading for Gender

data2["Gender"] = data2["Gender"].map({"Male": 0, "Female": 1})

data2

Unnamed: 0,Price per Unit,Quantity,Age,Gender,Total Amount,Product Category_Clothing,Product Category_Electronics
0,50,3,34,0,150,False,False
1,500,2,26,1,1000,True,False
2,30,1,50,0,30,False,True
3,500,1,37,0,500,True,False
4,50,2,30,0,100,False,False
...,...,...,...,...,...,...,...
995,50,1,62,0,50,True,False
996,30,3,52,0,90,False,False
997,25,4,23,1,100,False,False
998,50,3,36,1,150,False,True


In [24]:
data2.head()

Unnamed: 0,Price per Unit,Quantity,Age,Gender,Total Amount,Product Category_Clothing,Product Category_Electronics
0,50,3,34,0,150,False,False
1,500,2,26,1,1000,True,False
2,30,1,50,0,30,False,True
3,500,1,37,0,500,True,False
4,50,2,30,0,100,False,False


In [26]:
data2.isnull().sum()

Price per Unit                  0
Quantity                        0
Age                             0
Gender                          0
Total Amount                    0
Product Category_Clothing       0
Product Category_Electronics    0
dtype: int64

In [28]:
# outlier

Q1 = data2[["Price per Unit","Quantity","Age"]].quantile(0.25)

Q3 = data2[["Price per Unit","Quantity","Age"]].quantile(0.75)

IQR = Q3 - Q1

IQR


Price per Unit    270.0
Quantity            3.0
Age                24.0
dtype: float64

In [30]:
# Identify outliers

Outliers = ((data2[["Price per Unit","Quantity","Age"]] < (Q1 -  1.5 * IQR)) | (data2[["Price per Unit","Quantity","Age"]] > (Q3 + 1.5 * IQR)))

Outliers

Unnamed: 0,Price per Unit,Quantity,Age
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
995,False,False,False
996,False,False,False
997,False,False,False
998,False,False,False


In [32]:
data3 = data2[~Outliers.any(axis=1)]

data3

Unnamed: 0,Price per Unit,Quantity,Age,Gender,Total Amount,Product Category_Clothing,Product Category_Electronics
0,50,3,34,0,150,False,False
1,500,2,26,1,1000,True,False
2,30,1,50,0,30,False,True
3,500,1,37,0,500,True,False
4,50,2,30,0,100,False,False
...,...,...,...,...,...,...,...
995,50,1,62,0,50,True,False
996,30,3,52,0,90,False,False
997,25,4,23,1,100,False,False
998,50,3,36,1,150,False,True


In [34]:
print(data3.info())
print(data3.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   Price per Unit                1000 non-null   int64
 1   Quantity                      1000 non-null   int64
 2   Age                           1000 non-null   int64
 3   Gender                        1000 non-null   int64
 4   Total Amount                  1000 non-null   int64
 5   Product Category_Clothing     1000 non-null   bool 
 6   Product Category_Electronics  1000 non-null   bool 
dtypes: bool(2), int64(5)
memory usage: 41.1 KB
None
   Price per Unit  Quantity  Age  Gender  Total Amount  \
0              50         3   34       0           150   
1             500         2   26       1          1000   
2              30         1   50       0            30   
3             500         1   37       0           500   
4              50         2   30       

In [44]:
from sklearn.model_selection import train_test_split

In [42]:
x = data3.drop('Total Amount', axis=1)
y = data3['Total Amount']

x,y

(     Price per Unit  Quantity  Age  Gender  Product Category_Clothing  \
 0                50         3   34       0                      False   
 1               500         2   26       1                       True   
 2                30         1   50       0                      False   
 3               500         1   37       0                       True   
 4                50         2   30       0                      False   
 ..              ...       ...  ...     ...                        ...   
 995              50         1   62       0                       True   
 996              30         3   52       0                      False   
 997              25         4   23       1                      False   
 998              50         3   36       1                      False   
 999              30         4   47       0                      False   
 
      Product Category_Electronics  
 0                           False  
 1                           False  

In [45]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

x_train, x_test, y_train, y_test

(     Price per Unit  Quantity  Age  Gender  Product Category_Clothing  \
 29              300         3   39       1                      False   
 535              30         4   55       1                      False   
 695              50         4   50       1                       True   
 557              25         1   41       1                       True   
 836              30         3   18       0                      False   
 ..              ...       ...  ...     ...                        ...   
 106             300         4   21       1                       True   
 270              30         4   62       1                      False   
 860              30         3   41       1                       True   
 435              30         4   57       1                       True   
 102              25         1   59       1                       True   
 
      Product Category_Electronics  
 29                          False  
 535                         False  

In [50]:
print(f"Training set size: {x_train.shape[0]} samples")

Training set size: 800 samples


In [52]:
print(f"Testing set size: {x_test.shape[0]} samples")

Testing set size: 200 samples


In [54]:
print(f"Total number of samples: {data3.shape[0]}")

Total number of samples: 1000


In [144]:
x_test

Unnamed: 0,Price per Unit,Quantity,Age,Gender,Product Category_Clothing,Product Category_Electronics
521,500,3,46,0,False,False
737,50,2,41,0,True,False
740,300,1,48,0,True,False
660,25,4,44,1,True,False
411,500,4,19,1,False,True
...,...,...,...,...,...,...
408,300,3,21,1,False,True
332,300,4,54,1,False,True
208,50,4,30,1,False,True
613,300,4,39,1,False,False


In [152]:
y_test


521    1500
737     100
740     300
660     100
411    2000
       ... 
408     900
332    1200
208     200
613    1200
78      300
Name: Total Amount, Length: 200, dtype: int64

In [146]:
# training the model

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

RFSF_model = RandomForestRegressor(n_estimators=100, random_state=42)

RFSF_model


In [148]:
RFSF_model.fit(x_train, y_train)

result1 = RFSF_model.predict(x_test)

result1



array([1500.,  100.,  300.,  100., 2000.,   90.,   50.,  300.,  200.,
       1000.,   75.,  100.,  600.,   30.,  300.,   60., 2000.,   60.,
         50., 1200.,  900.,  100., 1500.,   60., 1500.,   25.,   50.,
        900.,  200.,   75.,   25.,   30.,   75.,  100.,   25., 1500.,
        100.,   25.,   50.,  150.,  100., 1000., 1500.,  200.,  200.,
        100.,  300.,  300.,  600.,  600.,   50., 1000.,  120.,   30.,
        300., 1200.,   50.,  100.,  120.,  300., 1000.,  120.,  600.,
        200., 1000., 1500., 1200.,  500.,  100.,   25.,   75.,  200.,
        100., 2000.,   90.,  600.,   75.,   50.,  900., 1200.,  900.,
       1200.,  100.,  300.,  500.,  100.,  150.,  100.,  100.,  100.,
         90., 1500.,  300.,   90.,  900., 1200.,  200., 2000.,  100.,
        900.,  500., 1000.,  500., 2000., 1500.,  120.,  300.,   60.,
       1500.,   90.,  150.,  120., 1000.,  200.,  100.,  200.,   25.,
         25.,   90.,  900.,   50.,  150.,  500.,   50.,  500.,   50.,
        200.,   50.,

In [150]:
# evaluation

#MAE

mae = mean_absolute_error(y_test, result1)

#MSE

mse = mean_squared_error(y_test, result1)

# R2

r2 = mse ** 0.5


print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R2: {r2}")

MAE: 0.0
MSE: 0.0
R2: 0.0


In [132]:
predictions_df = pd.DataFrame({'Actual': y_test,'Predicted': result1})

print(predictions_df.head())

     Actual  Predicted
521    1500     1500.0
737     100      100.0
740     300      300.0
660     100      100.0
411    2000     2000.0
