In [20]:
import pandas as pd

# Read the four source tables. products.csv spells its key 'product_ID',
# so it is renamed at load time to match the 'product_id' key in sales.csv.
df_sales = pd.read_csv('data/sales.csv')
df_products = pd.read_csv('data/products.csv').rename(columns={'product_ID': 'product_id'})
df_orders = pd.read_csv('data/orders.csv')
df_customers = pd.read_csv('data/customers.csv')

# Optional sanity check of the inputs (uncomment to inspect):
# for frame in (df_sales, df_products, df_orders, df_customers):
#     print(frame.head())
#     print(frame.shape)

# Chain the joins: sales -> products -> orders -> customers.
# Every step is an inner join, so rows without a match on the key are dropped.
df_merged = (
    df_sales
    .merge(df_products, on='product_id', how='inner')
    .merge(df_orders, on='order_id', how='inner')
    .merge(df_customers, on='customer_id', how='inner')
)

# Preview the result, persist the raw merge, then report its dimensions.
print(df_merged.head())
df_merged.to_csv('data/merged_data_raw.csv', index=False)
print(df_merged.shape)

   sales_id  order_id  product_id  price_per_unit  quantity_sold  total_price   
0         0         1         218             106              2          212  \
1         1         1         481             118              1          118   
2         2         1           2              96              3          288   
3         3         1        1002             106              2          212   
4         4         1         691             113              3          339   

  product_type  product_name size  colour  ...  order_date  delivery_date   
0        Shirt      Chambray    L  orange  ...   2021-8-30     2021-09-24  \
1       Jacket        Puffer    S  indigo  ...   2021-8-30     2021-09-24   
2        Shirt  Oxford Cloth    M     red  ...   2021-8-30     2021-09-24   
3     Trousers          Wool    M    blue  ...   2021-8-30     2021-09-24   
4       Jacket         Parka    S  indigo  ...   2021-8-30     2021-09-24   

      customer_name       gender  age             

In [21]:
# # Check for missing values
# print(df_merged.isnull().sum())

# # Check data types
# print(df_merged.dtypes)

# Data cleaning and feature engineering on the merged sales table.
# NOTE: this cell rebinds df_merged and removes 'delivery_date', which it also
# reads, so re-running it without first re-running the merge cell raises KeyError.
df_merged = df_merged.drop_duplicates()

# Parse both date columns once; the datetime dtype is needed for the
# subtraction and the .dt accessor below.
df_merged['order_date'] = pd.to_datetime(df_merged['order_date'])
df_merged['delivery_date'] = pd.to_datetime(df_merged['delivery_date'])

# New feature: whole days between placing the order and its delivery.
df_merged['days_to_deliver'] = (df_merged['delivery_date'] - df_merged['order_date']).dt.days

# 'season' stores the calendar quarter (1-4) of the order date.
df_merged['season'] = df_merged['order_date'].dt.quarter

# Remove columns that are not wanted downstream, in a single pass.
df_merged = df_merged.drop(columns=[
    'customer_name',  # customer full name
    'delivery_date',  # superseded by days_to_deliver
    'description',    # from products.csv
    'total_price',    # from sales.csv; equals quantity_sold * price_per_unit in the sample rows
])

# Display the adjusted DataFrame, then write the cleaned table to disk once.
print(df_merged)
df_merged.to_csv('data/merged_data.csv', index=False)



      sales_id  order_id  product_id  price_per_unit  quantity_sold   
0            0         1         218             106              2  \
1            1         1         481             118              1   
2            2         1           2              96              3   
3            3         1        1002             106              2   
4            4         1         691             113              3   
...        ...       ...         ...             ...            ...   
4995      2605       522         433             107              3   
4996      3422       690         919              96              3   
4997      3420       690         454             107              1   
4998      3421       690         296             105              3   
4999      3423       690         304             105              1   

     product_type  product_name size  colour  price  ...  order_date   
0           Shirt      Chambray    L  orange    105  ...  2021-08-30  \
1  

In [22]:
# Disabled exploratory cell: correlation matrix and heatmap of numeric columns.
# NOTE(review): 'total_price' is dropped from df_merged in the cleaning cell, so
# the column selection below would raise KeyError if re-enabled as written.
# numerical_vars = df_merged[['price_per_unit', 'quantity_sold', 'total_price', 'payment', 'age', 'days_to_deliver']]

# # Calculate correlation matrix
# correlation_matrix = numerical_vars.corr()

# print("Correlation Matrix:")
# print(correlation_matrix)

# # TODO: non-numerical variables

# import seaborn as sns
# import matplotlib.pyplot as plt

# # Plot heatmap of the correlation matrix
# plt.figure(figsize=(10, 8))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
# plt.title('Correlation Matrix Heatmap')
# plt.show()



In [23]:

# Disabled exploratory cell: ANOVA and chi-square tests on the cleaned data.
# It reloads 'data/merged_data.csv' (written by the cleaning cell) rather than
# reusing df_merged. NOTE(review): `combinations` is imported but never used below.
# import scipy.stats as stats
# from itertools import combinations

# # Load your data
# df = pd.read_csv('data/merged_data.csv')

# # some transformation ideas
# # Convert date to datetime and extract month
# if 'order_date' in df.columns:
#     df['order_date'] = pd.to_datetime(df['order_date'])
#     df['month'] = df['order_date'].dt.month

# # Categorize age into groups
# if 'age' in df.columns:
#     df['age_group'] = pd.cut(df['age'], bins=[0, 18, 35, 65, 100], labels=['Youth', 'Young Adult', 'Adult', 'Senior'])

# # ANOVA
# def perform_anova(df, numerical, categorical):
#     grouped = df.groupby(categorical)[numerical].apply(list)
#     f_val, p_val = stats.f_oneway(*grouped)
#     return f_val, p_val

# # Chi-square test
# def perform_chi_square(df, cat1, cat2):
#     table = pd.crosstab(df[cat1], df[cat2])
#     chi2, p = stats.chi2_contingency(table)[:2]
#     return chi2, p

# # ANOVA on price per unit across product types
# f_val, p_val = perform_anova(df, 'price_per_unit', 'product_type')
# print(f"ANOVA on price per unit across product types: F-value={f_val}, p-value={p_val}")

# # Chi-square for product type and demographic groups
# chi2, p = perform_chi_square(df, 'product_type', 'gender')
# print(f"Chi-square for product type and gender: Chi2={chi2}, p-value={p}")

# # Chi-square for product type across cities
# if 'city' in df.columns:
#     chi2, p = perform_chi_square(df, 'product_type', 'city')
#     print(f"Chi-square for product type and city: Chi2={chi2}, p-value={p}")

# # ANOVA for monthly sales
# if 'month' in df.columns:
#     f_val, p_val = perform_anova(df, 'quantity_sold', 'month')
#     print(f"ANOVA for quantity sold across months: F-value={f_val}, p-value={p_val}")
