In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build the sample product table column by column: every list holds one
# field, and position i across the lists describes the same product.
df = pd.DataFrame({
    'product_id': [23, 96, 97, 15, 87],
    'name': ['computer', 'Python Workout', 'Pandas Workout',
             'banana', 'sandwich'],
    # 0.5 in this list makes the whole column floating point
    'wholesale_price': [500, 35, 35, 0.5, 3],
    'retail_price': [1000, 75, 75, 1, 5],
    'sales': [100, 1000, 500, 200, 300],
    'department': ['electronics', 'books', 'books', 'food', 'food'],
})

In [3]:
# Display the single combined table (product details plus a sales total per product)
df

Unnamed: 0,product_id,name,wholesale_price,retail_price,sales,department
0,23,computer,500.0,1000,100,electronics
1,96,Python Workout,35.0,75,1000,books
2,97,Pandas Workout,35.0,75,500,books
3,15,banana,0.5,1,200,food
4,87,sandwich,3.0,5,300,food


But now consider that, rather than keeping track of sales numbers in this data frame, we break the data into two parts:

- One data frame will describe each of the products we sell.
- A second data frame will describe each sale we make.

In [4]:
# Product catalogue: one row per product, described column by column.
# Position i in each list refers to the same product.
products_df = pd.DataFrame({
    'product_id': [23, 96, 97, 15, 87],
    'name': ['computer', 'Python Workout', 'Pandas Workout',
             'banana', 'sandwich'],
    # 0.5 in this list makes the whole column floating point
    'wholesale_price': [500, 35, 35, 0.5, 3],
    'retail_price': [1000, 75, 75, 1, 5],
    'department': ['electronics', 'books', 'books', 'food', 'food'],
})
# Sales log: one (product_id, date, quantity) record per individual sale.
# A product_id may appear many times, even more than once on the same date.
sales_df = pd.DataFrame(
    [
        (23, '2021-August-10', 1),
        (96, '2021-August-10', 5),
        (15, '2021-August-10', 3),
        (87, '2021-August-10', 2),
        (15, '2021-August-11', 1),
        (96, '2021-August-11', 1),
        (23, '2021-August-11', 2),
        (87, '2021-August-12', 2),
        (97, '2021-August-12', 6),
        (97, '2021-August-12', 1),
        (87, '2021-August-13', 2),
        (23, '2021-August-13', 1),
        (15, '2021-August-14', 2),
    ],
    columns=['product_id', 'date', 'quantity'],
)

In [5]:
# Display the product catalogue (one row per product, no sales figures)
products_df

Unnamed: 0,product_id,name,wholesale_price,retail_price,department
0,23,computer,500.0,1000,electronics
1,96,Python Workout,35.0,75,books
2,97,Pandas Workout,35.0,75,books
3,15,banana,0.5,1,food
4,87,sandwich,3.0,5,food


In [6]:
# Display the sales log (one row per sale; product_id is still an ordinary column here)
sales_df

Unnamed: 0,product_id,date,quantity
0,23,2021-August-10,1
1,96,2021-August-10,5
2,15,2021-August-10,3
3,87,2021-August-10,2
4,15,2021-August-11,1
5,96,2021-August-11,1
6,23,2021-August-11,2
7,87,2021-August-12,2
8,97,2021-August-12,6
9,97,2021-August-12,1


This is all well and good, but how can we describe how much of each product has been sold? This is where joining comes in: we can combine products_df and sales_df into a new, single data frame that contains all the columns from both of the input data frames.

But wait a second—how does pandas know which rows on the left should be joined with which rows on the right? The answer, at least by default, is that it uses the index. Wherever the index of the left side matches the index of the right side, pandas combines the two rows into a single row containing the columns from both data frames.

In [8]:
# Move product_id into the index of both frames so that join(), which
# aligns rows on the index by default, can match each sale to its product.
#
# The column checks make this cell safe to re-run: once product_id has
# become the index it is no longer a column, and calling
# set_index('product_id') a second time would raise a KeyError.
if 'product_id' in sales_df.columns:
    sales_df = sales_df.set_index('product_id')
if 'product_id' in products_df.columns:
    products_df = products_df.set_index('product_id')

In [9]:
# Inspect the index of sales_df: product_id values repeat, once per recorded sale
sales_df.index

Index([23, 96, 15, 87, 15, 96, 23, 87, 97, 97, 87, 23, 15], dtype='int64', name='product_id')

In [10]:
# Inspect the index of products_df: each product_id appears exactly once
products_df.index

Index([23, 96, 97, 15, 87], dtype='int64', name='product_id')

In [15]:
# Join on the shared product_id index: each product's columns are repeated
# once for every sale row in sales_df that carries the same product_id.
products_df.join(sales_df)

Unnamed: 0_level_0,name,wholesale_price,retail_price,department,date,quantity
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
23,computer,500.0,1000,electronics,2021-August-10,1
23,computer,500.0,1000,electronics,2021-August-11,2
23,computer,500.0,1000,electronics,2021-August-13,1
96,Python Workout,35.0,75,books,2021-August-10,5
96,Python Workout,35.0,75,books,2021-August-11,1
97,Pandas Workout,35.0,75,books,2021-August-12,6
97,Pandas Workout,35.0,75,books,2021-August-12,1
15,banana,0.5,1,food,2021-August-10,3
15,banana,0.5,1,food,2021-August-11,1
15,banana,0.5,1,food,2021-August-14,2


We can now perform whatever queries we like on this new, combined data frame. For example, we can determine how many of each product have been sold:

In [20]:
# Total units sold per product: attach the sales rows to their products,
# bucket the combined rows by product name, and add up the quantities.
(
    products_df
    .join(sales_df)
    .groupby('name')['quantity']
    .sum()
)

name
Pandas Workout    7
Python Workout    6
banana            6
computer          4
sandwich          6
Name: quantity, dtype: int64

In [21]:
# Number of sale rows per product (how many separate sales were recorded,
# as opposed to how many units were sold).
(
    products_df
    .join(sales_df)
    .groupby('name')['quantity']
    .count()
)

name
Pandas Workout    2
Python Workout    2
banana            3
computer          3
sandwich          3
Name: quantity, dtype: int64

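Or we can get a rough picture of how much income each product brings in, and then sort the products from lowest to highest. Note that this query adds up `retail_price` once per sale row and does not take `quantity` into account, so it equals the true income only when every sale is for a single unit; for the exact figure, multiply `retail_price` by `quantity` before summing: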

In [23]:
# Sum of retail_price per product, sorted ascending.
# NOTE(review): retail_price is added once per sale row; the quantity column
# is not used, so a sale of 5 units counts the same as a sale of 1 unit.
(
    products_df
    .join(sales_df)
    .groupby('name')['retail_price']
    .sum()
    .sort_values()
)

name
banana               3
sandwich            15
Python Workout     150
Pandas Workout     150
computer          3000
Name: retail_price, dtype: int64

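We can even do the same on a per-day basis. As before, this sums `retail_price` once per sale row and ignores `quantity`, so it approximates the income for each individual day rather than computing it exactly: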

In [25]:
# Sum of retail_price per date, sorted ascending.
# NOTE(review): retail_price is added once per sale row; the quantity column
# is not used in this total.
(
    products_df
    .join(sales_df)
    .groupby('date')['retail_price']
    .sum()
    .sort_values()
)

date
2021-August-14       1
2021-August-12     155
2021-August-13    1005
2021-August-11    1076
2021-August-10    1081
Name: retail_price, dtype: int64

In [26]:
# Number of sale rows recorded on each date, sorted from quietest to busiest day.
(
    products_df
    .join(sales_df)
    .groupby('date')['retail_price']
    .count()
    .sort_values()
)

date
2021-August-14    1
2021-August-13    2
2021-August-12    3
2021-August-11    3
2021-August-10    4
Name: retail_price, dtype: int64

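And although our data set is tiny, we can break the totals down by both day and product. Here, too, `retail_price` is summed once per sale row without multiplying by `quantity`, so the result shows each product's summed retail price per day rather than its exact contribution to income: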

In [32]:
# Sum of retail_price for every (date, product name) pair.
# NOTE(review): retail_price is added once per sale row; the quantity column
# is not used in this total.
(
    products_df
    .join(sales_df)
    .groupby(['date', 'name'])['retail_price']
    .sum()
)

date            name          
2021-August-10  Python Workout      75
                banana               1
                computer          1000
                sandwich             5
2021-August-11  Python Workout      75
                banana               1
                computer          1000
2021-August-12  Pandas Workout     150
                sandwich             5
2021-August-13  computer          1000
                sandwich             5
2021-August-14  banana               1
Name: retail_price, dtype: int64

In [35]:
# Two named aggregations per (date, product name) pair:
#   total_retail - sum of retail_price over the pair's sale rows
#                  (once per row; not multiplied by quantity)
#   total_amount - sum of quantity, i.e. units sold
(
    products_df
    .join(sales_df)
    .groupby(['date', 'name'])
    .agg(
        total_retail=pd.NamedAgg(column='retail_price', aggfunc='sum'),
        total_amount=pd.NamedAgg(column='quantity', aggfunc='sum'),
    )
)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_retail,total_amount
date,name,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-August-10,Python Workout,75,5
2021-August-10,banana,1,3
2021-August-10,computer,1000,1
2021-August-10,sandwich,5,2
2021-August-11,Python Workout,75,1
2021-August-11,banana,1,1
2021-August-11,computer,1000,2
2021-August-12,Pandas Workout,150,7
2021-August-12,sandwich,5,2
2021-August-13,computer,1000,1
