In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [32]:
dataset = pd.read_csv("retail_sales_dataset.csv")
dataset.head()

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100


In [33]:
dataset.isna().sum()

Transaction ID      0
Date                0
Customer ID         0
Gender              0
Age                 0
Product Category    0
Quantity            0
Price per Unit      0
Total Amount        0
dtype: int64

In [34]:
dataset.dtypes

Transaction ID       int64
Date                object
Customer ID         object
Gender              object
Age                  int64
Product Category    object
Quantity             int64
Price per Unit       int64
Total Amount         int64
dtype: object

In [35]:
modified_dataset = dataset.apply(pd.to_numeric, errors='coerce').astype("float")
modified_dataset

modified_dataset.dtypes

Transaction ID      float64
Date                float64
Customer ID         float64
Gender              float64
Age                 float64
Product Category    float64
Quantity            float64
Price per Unit      float64
Total Amount        float64
dtype: object

In [36]:
modified_dataset

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,1.0,,,,34.0,,3.0,50.0,150.0
1,2.0,,,,26.0,,2.0,500.0,1000.0
2,3.0,,,,50.0,,1.0,30.0,30.0
3,4.0,,,,37.0,,1.0,500.0,500.0
4,5.0,,,,30.0,,2.0,50.0,100.0
...,...,...,...,...,...,...,...,...,...
995,996.0,,,,62.0,,1.0,50.0,50.0
996,997.0,,,,52.0,,3.0,30.0,90.0
997,998.0,,,,23.0,,4.0,25.0,100.0
998,999.0,,,,36.0,,3.0,50.0,150.0


In [37]:
modified_dataset.fillna(value=0, inplace=True)
modified_dataset

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,1.0,0.0,0.0,0.0,34.0,0.0,3.0,50.0,150.0
1,2.0,0.0,0.0,0.0,26.0,0.0,2.0,500.0,1000.0
2,3.0,0.0,0.0,0.0,50.0,0.0,1.0,30.0,30.0
3,4.0,0.0,0.0,0.0,37.0,0.0,1.0,500.0,500.0
4,5.0,0.0,0.0,0.0,30.0,0.0,2.0,50.0,100.0
...,...,...,...,...,...,...,...,...,...
995,996.0,0.0,0.0,0.0,62.0,0.0,1.0,50.0,50.0
996,997.0,0.0,0.0,0.0,52.0,0.0,3.0,30.0,90.0
997,998.0,0.0,0.0,0.0,23.0,0.0,4.0,25.0,100.0
998,999.0,0.0,0.0,0.0,36.0,0.0,3.0,50.0,150.0


In [38]:
X = modified_dataset.drop(columns="Total Amount")
X

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit
0,1.0,0.0,0.0,0.0,34.0,0.0,3.0,50.0
1,2.0,0.0,0.0,0.0,26.0,0.0,2.0,500.0
2,3.0,0.0,0.0,0.0,50.0,0.0,1.0,30.0
3,4.0,0.0,0.0,0.0,37.0,0.0,1.0,500.0
4,5.0,0.0,0.0,0.0,30.0,0.0,2.0,50.0
...,...,...,...,...,...,...,...,...
995,996.0,0.0,0.0,0.0,62.0,0.0,1.0,50.0
996,997.0,0.0,0.0,0.0,52.0,0.0,3.0,30.0
997,998.0,0.0,0.0,0.0,23.0,0.0,4.0,25.0
998,999.0,0.0,0.0,0.0,36.0,0.0,3.0,50.0


In [39]:
y = modified_dataset["Total Amount"]
y

0       150.0
1      1000.0
2        30.0
3       500.0
4       100.0
        ...  
995      50.0
996      90.0
997     100.0
998     150.0
999     120.0
Name: Total Amount, Length: 1000, dtype: float64

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [41]:
model = LinearRegression()

In [42]:
model.fit(X_train, y_train)

In [43]:
model_predictions = model.predict(X_test)

In [44]:
r2_score = r2_score(y_test, model_predictions)
r2_score

0.8600704946025333