# Analysis of company furniture data

We aim to get 
1. Top products
2. Top clients
3. Overall revenue performance

In [1]:
import pandas as pd

In [7]:
# Load the datasets
customers = pd.read_csv('datasets/data1/customers.csv')
products = pd.read_csv('datasets/data1/products.csv')
ratings = pd.read_csv('datasets/data1/ratings.csv')
orders = pd.read_csv('datasets/data1/orders.csv')
print(customers.head())
print(orders.head())
print(products.head())

   customer_id        name
0            1  Customer_1
1            2  Customer_2
2            3  Customer_3
3            4  Customer_4
4            5  Customer_5
   order_id  customer_id  product_id                  order_date  quantity
0         1           66          38  2023-08-09 08:39:23.971834         4
1         2           10          29  2023-09-08 08:39:23.971834         4
2         3           58           9  2023-07-29 08:39:23.971834         3
3         4           33          44  2023-09-13 08:39:23.971834         4
4         5           32          47  2023-07-24 08:39:23.971834         2
   product_id      product_name  price           category
0           1       Ektorp Sofa    694  Sofas & Armchairs
1           2    Poäng Armchair    569  Sofas & Armchairs
2           3  Klippan Loveseat    639  Sofas & Armchairs
3           4    Malm Bed Frame    202               Beds
4           5     Hemnes Daybed    845               Beds


In [4]:
# Merge datasets to get all relevant information
merged_data = pd.merge(orders, customers, on='customer_id')
merged_data = pd.merge(merged_data, products, on='product_id')
merged_data

Unnamed: 0,order_id,customer_id,product_id,order_date,quantity,name,product_name,price,category
0,1,66,38,2023-08-09 08:39:23.971834,4,Customer_66,Bekant Conference Table,441,Tables & Desks
1,429,66,38,2023-09-10 08:39:23.971834,3,Customer_66,Bekant Conference Table,441,Tables & Desks
2,199,24,38,2023-08-16 08:39:23.971834,1,Customer_24,Bekant Conference Table,441,Tables & Desks
3,326,29,38,2023-08-23 08:39:23.971834,3,Customer_29,Bekant Conference Table,441,Tables & Desks
4,237,35,38,2023-09-05 08:39:23.971834,1,Customer_35,Bekant Conference Table,441,Tables & Desks
...,...,...,...,...,...,...,...,...,...
995,311,72,14,2023-09-12 08:39:23.971834,4,Customer_72,Ingolf Bar Stool,609,Chairs
996,405,88,14,2023-08-16 08:39:23.971834,4,Customer_88,Ingolf Bar Stool,609,Chairs
997,679,88,14,2023-07-27 08:39:23.971834,3,Customer_88,Ingolf Bar Stool,609,Chairs
998,745,9,14,2023-08-29 08:39:23.971834,1,Customer_9,Ingolf Bar Stool,609,Chairs


In [17]:
# Convert order_date to datetime and filter data for the last month
merged_data['order_date'] = pd.to_datetime(merged_data['order_date'])
last_month_data = merged_data[merged_data['order_date'] >= pd.Timestamp.now() - pd.DateOffset(months=10)]

In [18]:
# Calculate revenue for each product
last_month_data['revenue'] = last_month_data['price'] * last_month_data['quantity']

In [19]:
# Calculate total quantity sold for each product
product_quantity_sold = last_month_data.groupby('product_id')['quantity'].sum().reset_index()

In [20]:
# Calculate total revenue for each product
product_revenue = last_month_data.groupby('product_id')['revenue'].sum().reset_index()

In [21]:
# Identify top performing products based on revenue
top_products_revenue = product_revenue.sort_values(by='revenue', ascending=False).head()
top_products_revenue

Unnamed: 0,product_id,revenue
5,6,59521
41,42,51688
28,29,50445
25,26,46386
30,31,45646


In [22]:
# Identify top performing products based on quantity sold
top_products_quantity = product_quantity_sold.sort_values(by='quantity', ascending=False).head()

In [23]:
# Identify top clients for the last month
top_clients = last_month_data.groupby('customer_id')['revenue'].sum().reset_index()
top_clients = top_clients.sort_values(by='revenue', ascending=False).head()

In [24]:
print("Top performing products by revenue:")
print(top_products_revenue)
print("\nTop performing products by quantity sold:")
print(top_products_quantity)
print("\nTop clients for the last month:")
print(top_clients)

Top performing products by revenue:
    product_id  revenue
5            6    59521
41          42    51688
28          29    50445
25          26    46386
30          31    45646

Top performing products by quantity sold:
    product_id  quantity
38          39        80
5            6        77
48          49        63
1            2        63
37          38        62

Top clients for the last month:
    customer_id  revenue
0             1    27081
43           44    26529
3             4    25717
28           29    24997
51           52    24055


# Customer Segmentation

In [25]:
# Calculate total spend per customer
customer_total_spend = merged_data.groupby('customer_id')['price'].sum().reset_index()
customer_total_spend.columns = ['customer_id', 'total_spend']

# Calculate frequency of purchase per customer
customer_purchase_frequency = merged_data.groupby('customer_id')['order_id'].count().reset_index()
customer_purchase_frequency.columns = ['customer_id', 'purchase_frequency']

# Identify most purchased product categories per customer
customer_product_preferences = merged_data.groupby(['customer_id', 'category'])['order_id'].count().reset_index()
customer_product_preferences = customer_product_preferences.sort_values(by=['customer_id', 'order_id'], ascending=[True, False])
customer_product_preferences = customer_product_preferences.groupby('customer_id').head(1).reset_index(drop=True)

# Merge all calculated metrics
customer_metrics = pd.merge(customer_total_spend, customer_purchase_frequency, on='customer_id')
customer_metrics = pd.merge(customer_metrics, customer_product_preferences, on='customer_id')

# Customer Segmentation based on spend and frequency
# You can adjust these thresholds based on your specific business needs
high_value_customers = customer_metrics[(customer_metrics['total_spend'] > 1000) & (customer_metrics['purchase_frequency'] > 5)]
medium_value_customers = customer_metrics[(customer_metrics['total_spend'] <= 1000) & (customer_metrics['purchase_frequency'] > 1)]
low_value_customers = customer_metrics[customer_metrics['purchase_frequency'] <= 1]

# Print out the segmented customers
print("High-Value Customers:")
print(high_value_customers.head())
print("\nMedium-Value Customers:")
print(medium_value_customers.head())
print("\nLow-Value Customers:")
print(low_value_customers.head())

High-Value Customers:
   customer_id  total_spend  purchase_frequency           category  order_id
0            1        12106                  18              Decor         4
1            2         4243                   9              Decor         2
2            3         7395                  13  Sofas & Armchairs         3
3            4        10865                  22             Chairs         5
4            5         7175                  11             Chairs         3

Medium-Value Customers:
Empty DataFrame
Columns: [customer_id, total_spend, purchase_frequency, category, order_id]
Index: []

Low-Value Customers:
Empty DataFrame
Columns: [customer_id, total_spend, purchase_frequency, category, order_id]
Index: []
