In [1]:
# Import standard libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [2]:
# Raw retail sales CSV, addressed relative to the notebook's working directory
# (one level up, then data/raw).
file_path = '../data/raw/retail_sales_dataset.csv'

# Read the whole file into a DataFrame
retail_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to sanity-check the columns
print(retail_data.head(n=5))

   Transaction ID        Date Customer ID  Gender  Age Product Category  \
0               1  2023-11-24     CUST001    Male   34           Beauty   
1               2  2023-02-27     CUST002  Female   26         Clothing   
2               3  2023-01-13     CUST003    Male   50      Electronics   
3               4  2023-05-21     CUST004    Male   37         Clothing   
4               5  2023-05-06     CUST005    Male   30           Beauty   

   Quantity  Price per Unit  Total Amount  
0         3              50           150  
1         2             500          1000  
2         1              30            30  
3         1             500           500  
4         2              50           100  


In [3]:
# Parse the 'Date' column into pandas datetime values
retail_data['Date'] = retail_data['Date'].pipe(pd.to_datetime)

In [4]:
# Derive calendar features from the 'Date' column via its datetime accessor.
# dayofweek is numbered Monday=0 through Sunday=6.
_date_parts = retail_data['Date'].dt
retail_data['month'] = _date_parts.month
retail_data['day_of_week'] = _date_parts.dayofweek
retail_data['year'] = _date_parts.year

# Preview the frame with the three new columns appended
print(retail_data.head(n=5))

   Transaction ID       Date Customer ID  Gender  Age Product Category  \
0               1 2023-11-24     CUST001    Male   34           Beauty   
1               2 2023-02-27     CUST002  Female   26         Clothing   
2               3 2023-01-13     CUST003    Male   50      Electronics   
3               4 2023-05-21     CUST004    Male   37         Clothing   
4               5 2023-05-06     CUST005    Male   30           Beauty   

   Quantity  Price per Unit  Total Amount  month  day_of_week  year  
0         3              50           150     11            4  2023  
1         2             500          1000      2            0  2023  
2         1              30            30      1            4  2023  
3         1             500           500      5            6  2023  
4         2              50           100      5            5  2023  


In [9]:
# Total sales per calendar month, ranked from highest to lowest.
# Grouping on both 'year' and 'month' keeps the same month of different years
# apart (e.g. January 2023 vs. January 2024).
# Written as one method chain with no inplace=True, so the table is built
# in a single expression instead of being rebound and mutated step by step.
# NOTE(review): the 2024-01 total in the output is far smaller than every
# other month -- presumably a partial month; confirm before comparing it.
retail_data_monthly_sales = (
    retail_data
    .groupby(['year', 'month'])['Total Amount']
    .sum()
    .reset_index()  # turn the 'year'/'month' group keys back into columns
    .sort_values(by='Total Amount', ascending=False)  # largest total first
    .reset_index(drop=True)  # renumber rows 0..n-1 in ranked order
)

# Display the ranked table
print(retail_data_monthly_sales)

    year  month  Total Amount
0   2023      5         53150
1   2023     10         46580
2   2023     12         44690
3   2023      2         44060
4   2023      8         36960
5   2023      6         36715
6   2023      7         35465
7   2023      1         35450
8   2023     11         34920
9   2023      4         33870
10  2023      3         28990
11  2023      9         23620
12  2024      1          1530
