In [20]:
import pandas as pd

# Load the CSV files into DataFrames
df_sales = pd.read_csv('data/sales.csv')
df_products = pd.read_csv('data/products.csv')
df_orders = pd.read_csv('data/orders.csv')
df_customers = pd.read_csv('data/customers.csv')

# standardize all ids to zero indexing
df_sales["order_id"] = df_sales["order_id"]-1
df_orders["order_id"] = df_orders["order_id"]-1
df_orders["customer_id"] = df_orders["customer_id"]-1
df_customers["customer_id"] = df_customers["customer_id"]-1

df_products.rename(columns={'product_ID': 'product_id'}, inplace=True)

# print(df_sales.head())
# print(df_sales.shape)
# print(df_products.head())
# print(df_products.shape)
# print(df_orders.head())
# print(df_orders.shape)
# print(df_customers.head())
# print(df_customers.shape)

df_merged = pd.merge(df_sales, df_products, on='product_id', how='inner')
df_merged = pd.merge(df_merged, df_orders, on='order_id', how='inner')
df_merged = pd.merge(df_merged, df_customers, on='customer_id', how='inner')


print(df_merged.head())
df_merged.to_csv('data/merged_data_raw.csv', index=False)
print(df_merged.shape)

   sales_id  order_id  product_id  price_per_unit  quantity_x  total_price  \
0         0         0         218             106           2          212   
1         1         0         481             118           1          118   
2         2         0           2              96           3          288   
3         3         0        1002             106           2          212   
4         4         0         691             113           3          339   

  product_type  product_name size  colour  ...  order_date  delivery_date  \
0        Shirt      Chambray    L  orange  ...   2021-8-30     2021-09-24   
1       Jacket        Puffer    S  indigo  ...   2021-8-30     2021-09-24   
2        Shirt  Oxford Cloth    M     red  ...   2021-8-30     2021-09-24   
3     Trousers          Wool    M    blue  ...   2021-8-30     2021-09-24   
4       Jacket         Parka    S  indigo  ...   2021-8-30     2021-09-24   

      customer_name       gender  age               home_address zip

In [None]:
# Check for missing values
# print(df_merged.isnull().sum())

# Check data types
# print(df_merged.dtypes)

# data cleaning
df_merged = df_merged.drop_duplicates()

# Convert 'order_date' and 'delivery_date' to datetime
df_merged['order_date'] = pd.to_datetime(df_merged['order_date'])
df_merged['delivery_date'] = pd.to_datetime(df_merged['delivery_date'])

# Example of a new feature: Days to deliver
df_merged['days_to_deliver'] = (df_merged['delivery_date'] - df_merged['order_date']).dt.days

# Display the adjusted DataFrame
print(df_merged.head())
df_merged.to_csv('data/merged_data.csv', index=False)



In [None]:
numerical_vars = df_merged[['price_per_unit', 'quantity_x', 'total_price', 'payment', 'age', 'days_to_deliver']]

# Calculate correlation matrix
correlation_matrix = numerical_vars.corr()

print("Correlation Matrix:")
print(correlation_matrix)

# todo: non numerical variables

import seaborn as sns
import matplotlib.pyplot as plt

# Plot heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix Heatmap')
plt.show()



In [16]:

import scipy.stats as stats
from itertools import combinations

# Load your data
df = pd.read_csv('data/merged_data.csv')

# some transformation ideas
# Convert date to datetime and extract month
if 'order_date' in df.columns:
    df['order_date'] = pd.to_datetime(df['order_date'])
    df['month'] = df['order_date'].dt.month

# Categorize age into groups
if 'age' in df.columns:
    df['age_group'] = pd.cut(df['age'], bins=[0, 18, 35, 65, 100], labels=['Youth', 'Young Adult', 'Adult', 'Senior'])

# ANOVA
def perform_anova(df, numerical, categorical):
    grouped = df.groupby(categorical)[numerical].apply(list)
    f_val, p_val = stats.f_oneway(*grouped)
    return f_val, p_val

# Chi-square test
def perform_chi_square(df, cat1, cat2):
    table = pd.crosstab(df[cat1], df[cat2])
    chi2, p = stats.chi2_contingency(table)[:2]
    return chi2, p

# ANOVA on price per unit across product types
f_val, p_val = perform_anova(df, 'price_per_unit', 'product_type')
print(f"ANOVA on price per unit across product types: F-value={f_val}, p-value={p_val}")

# Chi-square for product type and demographic groups
chi2, p = perform_chi_square(df, 'product_type', 'gender')
print(f"Chi-square for product type and gender: Chi2={chi2}, p-value={p}")

# Chi-square for product type across cities
if 'city' in df.columns:
    chi2, p = perform_chi_square(df, 'product_type', 'city')
    print(f"Chi-square for product type and city: Chi2={chi2}, p-value={p}")

# ANOVA for monthly sales
if 'month' in df.columns:
    f_val, p_val = perform_anova(df, 'quantity_x', 'month')
    print(f"ANOVA for quantity sold across months: F-value={f_val}, p-value={p_val}")


  import scipy.stats as stats


ANOVA on price per unit across product types: F-value=158.66390457543474, p-value=1.558232496968769e-67
Chi-square for product type and gender: Chi2=19.01887688289363, p-value=0.16422920443332853
Chi-square for product type and city: Chi2=1193.6851198108263, p-value=0.5940681608239126
ANOVA for quantity sold across months: F-value=0.7068397334048305, p-value=0.7032156607390802


In [None]:
import numpy as np
from plotly import express as px

ids, id_counts = np.unique(df["product_id"], return_counts=True)

fig = px.violin(df["product_id"], points="all")
fig.show()

ages, age_counts = np.unique(df["age"], return_counts=True)

fig = px.violin(df["age"], points="all")
fig.show()

genders, gender_counts = np.unique(df["gender"], return_counts=True)

fig = px.violin(df["gender"], points="all")
fig.show()




In [14]:
import numpy as np

num_customers = df_customers.shape[0]
num_products = df_products.shape[0]

print(df_sales["order_id"][0])
print(df_orders["customer_id"][1])

print(num_customers, "customers and", num_products, "products")

user_item_matrix = np.nan((num_customers, num_products))

display(df_sales.head())

for i in range(df_sales.shape[0]):
    order_id = df_sales["order_id"][i]-1
    customer_id = df_orders["customer_id"][order_id]
    product_id = df_sales["product_id"][i]

    user_item_matrix[customer_id, product_id] += 1

print(user_item_matrix.shape)
print(user_item_matrix[:5, :])




1
473
1000 customers and 1260 products


TypeError: 'float' object is not callable