# Merging DataFrames

* Combining DataFrames based on the values in the shared columns
* Similar to SQL Joins

<img src="Assets/merge.png" class="juno_ui_theme_light" style="width:800px">

## Exercise 1 - create DataFrames

In [1]:
import numpy as np
import pandas as pd


# Seed NumPy's global generator so that Restart & Run All reproduces the same
# random ages/scores here and in every later np.random call of the notebook.
np.random.seed(42)

# People table. Note that id 10 has no counterpart in `scores`.
names = pd.DataFrame(
    {
        "id": [1, 2, 3, 4, 10],
        "name": ["Emily", "Jane", "Joe", "Matt", "Lucas"],
        "age": np.random.randint(20, 30, size=5),
    }
)

# Scores table with ids 1-7, so ids 5-7 have no counterpart in `names`.
scores = pd.DataFrame(
    {
        "id": np.arange(1, 8),
        "score": np.random.randint(80, 100, size=7),
        "group": list("ABCAACA"),
    }
)

In [2]:
# Display DataFrames side-by-side
# Search for Jupyter notebook display two pandas tables side by side

from IPython.display import display_html
from itertools import chain,cycle

def display_side_by_side(*args, titles=cycle([''])):
    """Render several DataFrames next to each other in one notebook output.

    Parameters
    ----------
    *args
        Objects exposing ``to_html()`` (e.g. DataFrames), rendered in order.
    titles : iterable of str, optional
        Heading placed above each table, paired with ``args`` in order.
        Defaults to empty headings. When fewer titles than tables are given,
        the remaining tables get a ``</br>`` placeholder heading.

    Returns
    -------
    None
        The HTML is handed to ``display_html`` for rendering.
    """
    fragments = []
    for df, title in zip(args, chain(titles, cycle(['</br>']))):
        fragments.append('<th style="text-align:center"><td style="vertical-align:top">')
        fragments.append(f'<h2 style="text-align: center;">{title}</h2>')
        # Style only the opening <table ...> tag. Replacing every occurrence of
        # "table" also rewrote the closing tag and any cell text containing
        # that word (e.g. "vegetable").
        fragments.append(
            df.to_html().replace('<table', '<table style="display:inline"', 1)
        )
        fragments.append('</td></th>')
    display_html(''.join(fragments), raw=True)

In [3]:
# Show the two input tables next to each other before merging them.
display_side_by_side(names, scores, titles=["names", "scores"])

Unnamed: 0,id,name,age
0,1,Emily,23
1,2,Jane,26
2,3,Joe,26
3,4,Matt,28
4,10,Lucas,21

Unnamed: 0,id,score,group
0,1,91,A
1,2,95,B
2,3,81,C
3,4,99,A
4,5,81,A
5,6,97,C
6,7,97,A


## Exercise 2 - merge function

In [4]:
# Merge on the shared "id" column. `how` is left at its default ("inner"),
# so only ids present in both frames (1-4) appear in the result.
merged_df = names.merge(scores, on="id")

display_side_by_side(names, scores, merged_df, titles=["names", "scores", "merged_df"])

Unnamed: 0,id,name,age
0,1,Emily,23
1,2,Jane,26
2,3,Joe,26
3,4,Matt,28
4,10,Lucas,21

Unnamed: 0,id,score,group
0,1,91,A
1,2,95,B
2,3,81,C
3,4,99,A
4,5,81,A
5,6,97,C
6,7,97,A

Unnamed: 0,id,name,age,score,group
0,1,Emily,23,91,A
1,2,Jane,26,95,B
2,3,Joe,26,81,C
3,4,Matt,28,99,A


## Exercise 3 - how parameter

* Indicates the type of merge to perform: `inner`, `left`, `right` or `outer` (the column or columns to match on are given by `on`)
* Default value is inner

In [5]:
# Spelling out how="inner" returns the same rows as the default merge.
names.merge(scores, on="id", how="inner")

Unnamed: 0,id,name,age,score,group
0,1,Emily,23,91,A
1,2,Jane,26,95,B
2,3,Joe,26,81,C
3,4,Matt,28,99,A


In [6]:
# Same inner merge, this time bound to a name so the result can be reused.
df_merged = names.merge(scores, on="id", how="inner")

df_merged

Unnamed: 0,id,name,age,score,group
0,1,Emily,23,91,A
1,2,Jane,26,95,B
2,3,Joe,26,81,C
3,4,Matt,28,99,A


## Exercise 4 - left

In [7]:
# Left join: keep every row of `names`. The id missing from `scores` (10)
# gets NaN in the right-hand columns, which turns `score` into floats.
merged_df = names.merge(scores, on="id", how="left")

display_side_by_side(names, scores, merged_df, titles=["names", "scores", "merged_df"])

Unnamed: 0,id,name,age
0,1,Emily,23
1,2,Jane,26
2,3,Joe,26
3,4,Matt,28
4,10,Lucas,21

Unnamed: 0,id,score,group
0,1,91,A
1,2,95,B
2,3,81,C
3,4,99,A
4,5,81,A
5,6,97,C
6,7,97,A

Unnamed: 0,id,name,age,score,group
0,1,Emily,23,91.0,A
1,2,Jane,26,95.0,B
2,3,Joe,26,81.0,C
3,4,Matt,28,99.0,A
4,10,Lucas,21,,


In [8]:
# Outer join: keep the ids of both frames; cells with no match become NaN.
merged_df = names.merge(scores, on="id", how="outer")

display_side_by_side(names, scores, merged_df, titles=["names", "scores", "merged_df"])

Unnamed: 0,id,name,age
0,1,Emily,23
1,2,Jane,26
2,3,Joe,26
3,4,Matt,28
4,10,Lucas,21

Unnamed: 0,id,score,group
0,1,91,A
1,2,95,B
2,3,81,C
3,4,99,A
4,5,81,A
5,6,97,C
6,7,97,A

Unnamed: 0,id,name,age,score,group
0,1,Emily,23.0,91.0,A
1,2,Jane,26.0,95.0,B
2,3,Joe,26.0,81.0,C
3,4,Matt,28.0,99.0,A
4,10,Lucas,21.0,,
5,5,,,81.0,A
6,6,,,97.0,C
7,7,,,97.0,A


## Exercise 5 - right

In [9]:
# Right join: keep every row of `scores`; ids 5-7 have no name/age.
names.merge(scores, on="id", how="right")

Unnamed: 0,id,name,age,score,group
0,1,Emily,23.0,91,A
1,2,Jane,26.0,95,B
2,3,Joe,26.0,81,C
3,4,Matt,28.0,99,A
4,5,,,81,A
5,6,,,97,C
6,7,,,97,A


* Using `how="right"` is not recommended
* Same thing can be achieved by changing the order of the DataFrames

In [10]:
# Equivalent to names.merge(scores, on="id", how="right"): swapping the frames
# and using how="left" returns the same rows; only the column order differs.
scores.merge(names, on="id", how="left")

Unnamed: 0,id,score,group,name,age
0,1,91,A,Emily,23.0
1,2,95,B,Jane,26.0
2,3,81,C,Joe,26.0
3,4,99,A,Matt,28.0
4,5,81,A,,
5,6,97,C,,
6,7,97,A,,


## Exercise 6 - indicator parameter

In [11]:
# Show the inputs again as a reference for the indicator examples.
display_side_by_side(names, scores, titles=["names", "scores"])

Unnamed: 0,id,name,age
0,1,Emily,23
1,2,Jane,26
2,3,Joe,26
3,4,Matt,28
4,10,Lucas,21

Unnamed: 0,id,score,group
0,1,91,A
1,2,95,B
2,3,81,C
3,4,99,A
4,5,81,A
5,6,97,C
6,7,97,A


In [12]:
# indicator=True appends a `_merge` column telling where each row came from.
# With the default inner merge every row is "both".
names.merge(scores, on="id", indicator=True)

Unnamed: 0,id,name,age,score,group,_merge
0,1,Emily,23,91,A,both
1,2,Jane,26,95,B,both
2,3,Joe,26,81,C,both
3,4,Matt,28,99,A,both


## Exercise 7 - indicator parameter

In [13]:
# With an outer join the indicator distinguishes "both", "left_only"
# (only in `names`) and "right_only" (only in `scores`).
merged_df = names.merge(scores, on="id", how="outer", indicator=True)

display_side_by_side(names, scores, merged_df, titles=["names", "scores", "merged_df"])

Unnamed: 0,id,name,age
0,1,Emily,23
1,2,Jane,26
2,3,Joe,26
3,4,Matt,28
4,10,Lucas,21

Unnamed: 0,id,score,group
0,1,91,A
1,2,95,B
2,3,81,C
3,4,99,A
4,5,81,A
5,6,97,C
6,7,97,A

Unnamed: 0,id,name,age,score,group,_merge
0,1,Emily,23.0,91.0,A,both
1,2,Jane,26.0,95.0,B,both
2,3,Joe,26.0,81.0,C,both
3,4,Matt,28.0,99.0,A,both
4,10,Lucas,21.0,,,left_only
5,5,,,81.0,A,right_only
6,6,,,97.0,C,right_only
7,7,,,97.0,A,right_only


## Exercise 8 - indicator parameter

In [14]:
# Passing a string instead of True uses it as the indicator column's name
# ("source" here instead of "_merge").
merged_df = names.merge(scores, on="id", how="left", indicator="source")

display_side_by_side(names, scores, merged_df, titles=["names", "scores", "merged_df"])

Unnamed: 0,id,name,age
0,1,Emily,23
1,2,Jane,26
2,3,Joe,26
3,4,Matt,28
4,10,Lucas,21

Unnamed: 0,id,score,group
0,1,91,A
1,2,95,B
2,3,81,C
3,4,99,A
4,5,81,A
5,6,97,C
6,7,97,A

Unnamed: 0,id,name,age,score,group,source
0,1,Emily,23,91.0,A,both
1,2,Jane,26,95.0,B,both
2,3,Joe,26,81.0,C,both
3,4,Matt,28,99.0,A,both
4,10,Lucas,21,,,left_only


## Exercise 9 - outer

In [15]:
# Outer join: rows from both frames are kept; unmatched cells are NaN.
merged_df = names.merge(scores, on="id", how="outer")

display_side_by_side(names, scores, merged_df, titles=["names", "scores", "merged_df"])

Unnamed: 0,id,name,age
0,1,Emily,23
1,2,Jane,26
2,3,Joe,26
3,4,Matt,28
4,10,Lucas,21

Unnamed: 0,id,score,group
0,1,91,A
1,2,95,B
2,3,81,C
3,4,99,A
4,5,81,A
5,6,97,C
6,7,97,A

Unnamed: 0,id,name,age,score,group
0,1,Emily,23.0,91.0,A
1,2,Jane,26.0,95.0,B
2,3,Joe,26.0,81.0,C
3,4,Matt,28.0,99.0,A
4,10,Lucas,21.0,,
5,5,,,81.0,A
6,6,,,97.0,C
7,7,,,97.0,A


## Exercise 10 - merge on different column names

In [16]:
# Give the key column a different name in `scores` to set up the
# left_on/right_on examples.
# NOTE: `scores` is rebound here, so earlier cells that merge with on="id"
# will fail if they are re-run after this cell.
scores = scores.rename(columns={"id": "id_number"})

scores

Unnamed: 0,id_number,score,group
0,1,91,A
1,2,95,B
2,3,81,C
3,4,99,A
4,5,81,A
5,6,97,C
6,7,97,A


In [17]:
# The key is named differently in each frame, so name it per side.
# Both key columns ("id" and "id_number") are kept in the result.
merged_df = names.merge(scores, left_on="id", right_on="id_number")

display_side_by_side(names, scores, merged_df, titles=["names", "scores", "merged_df"])

Unnamed: 0,id,name,age
0,1,Emily,23
1,2,Jane,26
2,3,Joe,26
3,4,Matt,28
4,10,Lucas,21

Unnamed: 0,id_number,score,group
0,1,91,A
1,2,95,B
2,3,81,C
3,4,99,A
4,5,81,A
5,6,97,C
6,7,97,A

Unnamed: 0,id,name,age,id_number,score,group
0,1,Emily,23,1,91,A
1,2,Jane,26,2,95,B
2,3,Joe,26,3,81,C
3,4,Matt,28,4,99,A


## Exercise 11 - merge on different column names

In [18]:
# how="left" keeps the unmatched `names` row (id 10); its `id_number`,
# `score` and `group` are NaN.
names.merge(scores, left_on="id", right_on="id_number", how="left")

Unnamed: 0,id,name,age,id_number,score,group
0,1,Emily,23,1.0,91.0,A
1,2,Jane,26,2.0,95.0,B
2,3,Joe,26,3.0,81.0,C
3,4,Matt,28,4.0,99.0,A
4,10,Lucas,21,,,


## Exercise 12 - create DataFrames

In [19]:
# Product catalogue: a product is identified by the pair (pg, id).
# `price` and `cost` are random integers drawn on every run.
products = pd.DataFrame({
    "pg": list("AAABBB"),
    "id": [101, 102, 103, 101, 102, 104],
    "price": np.random.randint(50, 80, size=6),
    "cost": np.random.randint(40, 50, size=6),
    "discount": [0.1, 0.1, 0.0, 0.0, 0.2, 0.0],
})

# Sales records: same (pg, id) key, but ids 105 and 106 are not in `products`.
# `discount` exists in both frames and will need suffixes after merging.
sales = pd.DataFrame({
    "pg": list("AAABBB"),
    "id": [101, 102, 105, 101, 102, 106],
    "sales_qty": np.random.randint(1, 10, size=6),
    "discount": [0.0, 0.1, 0.1, 0.2, 0.0, 0.0],
})

In [20]:
# Show the two input tables next to each other before merging them.
display_side_by_side(products, sales, titles=["products", "sales"])

Unnamed: 0,pg,id,price,cost,discount
0,A,101,58,42,0.1
1,A,102,70,42,0.1
2,A,103,50,45,0.0
3,B,101,73,48,0.0
4,B,102,55,46,0.2
5,B,104,50,43,0.0

Unnamed: 0,pg,id,sales_qty,discount
0,A,101,3,0.0
1,A,102,1,0.1
2,A,105,5,0.1
3,B,101,2,0.2
4,B,102,6,0.0
5,B,106,2,0.0


## Exercise 13 - merge on multiple columns

In [21]:
# Merge on two key columns at once. `discount` exists in both frames but is
# not a key, so it is split into `discount_x` (left) and `discount_y` (right).
merged_df = products.merge(sales, on=["pg", "id"])

display_side_by_side(products, sales, merged_df, titles=["products", "sales", "merged_df"])

Unnamed: 0,pg,id,price,cost,discount
0,A,101,58,42,0.1
1,A,102,70,42,0.1
2,A,103,50,45,0.0
3,B,101,73,48,0.0
4,B,102,55,46,0.2
5,B,104,50,43,0.0

Unnamed: 0,pg,id,sales_qty,discount
0,A,101,3,0.0
1,A,102,1,0.1
2,A,105,5,0.1
3,B,101,2,0.2
4,B,102,6,0.0
5,B,106,2,0.0

Unnamed: 0,pg,id,price,cost,discount_x,sales_qty,discount_y
0,A,101,58,42,0.1,3,0.0
1,A,102,70,42,0.1,1,0.1
2,B,101,73,48,0.0,2,0.2
3,B,102,55,46,0.2,6,0.0


## Exercise 14 - suffixes

In [22]:
# `suffixes` replaces the default _x/_y endings of overlapping column names
# with more descriptive ones (left suffix first, then right).
merged_df = products.merge(sales, on=["pg", "id"], suffixes=["_products", "_sales"])

display_side_by_side(products, sales, merged_df, titles=["products", "sales", "merged_df"])

Unnamed: 0,pg,id,price,cost,discount
0,A,101,58,42,0.1
1,A,102,70,42,0.1
2,A,103,50,45,0.0
3,B,101,73,48,0.0
4,B,102,55,46,0.2
5,B,104,50,43,0.0

Unnamed: 0,pg,id,sales_qty,discount
0,A,101,3,0.0
1,A,102,1,0.1
2,A,105,5,0.1
3,B,101,2,0.2
4,B,102,6,0.0
5,B,106,2,0.0

Unnamed: 0,pg,id,price,cost,discount_products,sales_qty,discount_sales
0,A,101,58,42,0.1,3,0.0
1,A,102,70,42,0.1,1,0.1
2,B,101,73,48,0.0,2,0.2
3,B,102,55,46,0.2,6,0.0


In [23]:
# Left join with custom suffixes: products without sales (A/103, B/104)
# are kept with NaN in the sales columns.
products.merge(sales, on=["pg", "id"], how="left", suffixes=["_products", "_sales"])

Unnamed: 0,pg,id,price,cost,discount_products,sales_qty,discount_sales
0,A,101,58,42,0.1,3.0,0.0
1,A,102,70,42,0.1,1.0,0.1
2,A,103,50,45,0.0,,
3,B,101,73,48,0.0,2.0,0.2
4,B,102,55,46,0.2,6.0,0.0
5,B,104,50,43,0.0,,


## Exercise 15 - different column names

In [24]:
# Rename the key in `sales` to set up a multi-column left_on/right_on merge.
# NOTE: `sales` is rebound here, so earlier cells that merge with
# on=["pg", "id"] will fail if they are re-run after this cell.
sales = sales.rename(columns={"id": "product_id"})

sales

Unnamed: 0,pg,product_id,sales_qty,discount
0,A,101,3,0.0
1,A,102,1,0.1
2,A,105,5,0.1
3,B,101,2,0.2
4,B,102,6,0.0
5,B,106,2,0.0


In [25]:
# left_on/right_on accept lists; the entries are paired by position
# ("pg" with "pg", "id" with "product_id"). The shared "pg" column appears
# once in the result, while "id" and "product_id" are both kept.
merged_df = products.merge(
    sales, 
    left_on=["pg", "id"], 
    right_on=["pg", "product_id"],
    how="left",
    suffixes=["_products", "_sales"]
)

display_side_by_side(products, sales, merged_df, titles=["products", "sales", "merged_df"])

Unnamed: 0,pg,id,price,cost,discount
0,A,101,58,42,0.1
1,A,102,70,42,0.1
2,A,103,50,45,0.0
3,B,101,73,48,0.0
4,B,102,55,46,0.2
5,B,104,50,43,0.0

Unnamed: 0,pg,product_id,sales_qty,discount
0,A,101,3,0.0
1,A,102,1,0.1
2,A,105,5,0.1
3,B,101,2,0.2
4,B,102,6,0.0
5,B,106,2,0.0

Unnamed: 0,pg,id,price,cost,discount_products,product_id,sales_qty,discount_sales
0,A,101,58,42,0.1,101.0,3.0,0.0
1,A,102,70,42,0.1,102.0,1.0,0.1
2,A,103,50,45,0.0,,,
3,B,101,73,48,0.0,101.0,2.0,0.2
4,B,102,55,46,0.2,102.0,6.0,0.0
5,B,104,50,43,0.0,,,


## Exercise 16 - create DataFrames

In [26]:
# Two 5x4 frames of random digits. df1 keeps the default index (0-4) while
# df2 is labelled 2-6, so the indexes overlap only on 2, 3 and 4.
left_digits = np.random.randint(0, 10, size=(5, 4))
df1 = pd.DataFrame(left_digits, columns=list("ABCD"))

right_digits = np.random.randint(0, 10, size=(5, 4))
df2 = pd.DataFrame(right_digits, columns=list("EFGH"), index=[2, 3, 4, 5, 6])

In [27]:
# Show the two input tables next to each other before merging them.
display_side_by_side(df1, df2, titles=["df1", "df2"])

Unnamed: 0,A,B,C,D
0,0,5,7,4
1,1,9,7,6
2,7,2,9,2
3,5,6,0,3
4,6,0,7,2

Unnamed: 0,E,F,G,H
2,8,7,0,6
3,3,3,9,2
4,1,7,2,3
5,4,2,2,7
6,9,0,2,6


## Exercise 17 - merge on index

In [28]:
# Use the row labels as the merge key on both sides. With the default inner
# join only the labels present in both indexes (2, 3, 4) remain.
merged_df = df1.merge(df2, left_index=True, right_index=True)

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,A,B,C,D
0,0,5,7,4
1,1,9,7,6
2,7,2,9,2
3,5,6,0,3
4,6,0,7,2

Unnamed: 0,E,F,G,H
2,8,7,0,6
3,3,3,9,2
4,1,7,2,3
5,4,2,2,7
6,9,0,2,6

Unnamed: 0,A,B,C,D,E,F,G,H
2,7,2,9,2,8,7,0,6
3,5,6,0,3,3,3,9,2
4,6,0,7,2,1,7,2,3


## Exercise 18 - merge on index

In [29]:
# Left join on the index: every df1 label is kept; labels 0 and 1 have no
# partner in df2 and get NaN in columns E-H.
merged_df = df1.merge(df2, left_index=True, right_index=True, how="left")

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,A,B,C,D
0,0,5,7,4
1,1,9,7,6
2,7,2,9,2
3,5,6,0,3
4,6,0,7,2

Unnamed: 0,E,F,G,H
2,8,7,0,6
3,3,3,9,2
4,1,7,2,3
5,4,2,2,7
6,9,0,2,6

Unnamed: 0,A,B,C,D,E,F,G,H
0,0,5,7,4,,,,
1,1,9,7,6,,,,
2,7,2,9,2,8.0,7.0,0.0,6.0
3,5,6,0,3,3.0,3.0,9.0,2.0
4,6,0,7,2,1.0,7.0,2.0,3.0


In [30]:
# Outer join on the index: labels 0-6 from both frames, NaN where one is missing.
df1.merge(df2, left_index=True, right_index=True, how="outer")

Unnamed: 0,A,B,C,D,E,F,G,H
0,0.0,5.0,7.0,4.0,,,,
1,1.0,9.0,7.0,6.0,,,,
2,7.0,2.0,9.0,2.0,8.0,7.0,0.0,6.0
3,5.0,6.0,0.0,3.0,3.0,3.0,9.0,2.0
4,6.0,0.0,7.0,2.0,1.0,7.0,2.0,3.0
5,,,,,4.0,2.0,2.0,7.0
6,,,,,9.0,0.0,2.0,6.0


# Merge "not matching" time-series data

* Time-series data might include measurements taken at very short time periods (e.g. at the level of seconds).
* Therefore, when we merge two DataFrames consisting of time-series data, we may encounter measurements that are off by a second or two.
* For such cases, Pandas provides a “smart” way of merging done by `merge_asof`.
* Assume we are merging DataFrames A (left) and B (right). If a row in A does not have an exactly matching row in B, `merge_asof` takes the row in B whose key is closest to the key in A, according to the chosen search direction.

## Exercise 19 - create date range

In [31]:
# Ten timestamps one second apart. Lowercase "s" is the supported alias for
# seconds; the uppercase "S" spelling is deprecated since pandas 2.2.
pd.date_range(start="2022-12-09", periods=10, freq="s")

DatetimeIndex(['2022-12-09 00:00:00', '2022-12-09 00:00:01',
               '2022-12-09 00:00:02', '2022-12-09 00:00:03',
               '2022-12-09 00:00:04', '2022-12-09 00:00:05',
               '2022-12-09 00:00:06', '2022-12-09 00:00:07',
               '2022-12-09 00:00:08', '2022-12-09 00:00:09'],
              dtype='datetime64[ns]', freq='S')

In [32]:
# Twenty timestamps two seconds apart. A multiplier can prefix the alias;
# lowercase "s" replaces the uppercase "S" deprecated since pandas 2.2.
pd.date_range(start="2022-12-09", periods=20, freq="2s")

DatetimeIndex(['2022-12-09 00:00:00', '2022-12-09 00:00:02',
               '2022-12-09 00:00:04', '2022-12-09 00:00:06',
               '2022-12-09 00:00:08', '2022-12-09 00:00:10',
               '2022-12-09 00:00:12', '2022-12-09 00:00:14',
               '2022-12-09 00:00:16', '2022-12-09 00:00:18',
               '2022-12-09 00:00:20', '2022-12-09 00:00:22',
               '2022-12-09 00:00:24', '2022-12-09 00:00:26',
               '2022-12-09 00:00:28', '2022-12-09 00:00:30',
               '2022-12-09 00:00:32', '2022-12-09 00:00:34',
               '2022-12-09 00:00:36', '2022-12-09 00:00:38'],
              dtype='datetime64[ns]', freq='2S')

## Exercise 20 - create DataFrames

In [33]:
# Left series: one measurement every 2 seconds (7 rows).
# Lowercase "s" is used because the uppercase "S" frequency alias is
# deprecated since pandas 2.2.
df1 = pd.DataFrame(
    {
        "time": pd.date_range(start="2022-12-09", periods=7, freq="2s"),
        "left_value": np.round(np.random.random(7), 2),
    }
)

# Right series: one measurement every 3 seconds (6 rows), so its timestamps
# only occasionally coincide with those of df1.
df2 = pd.DataFrame(
    {
        "time": pd.date_range(start="2022-12-09", periods=6, freq="3s"),
        "right_value": np.round(np.random.random(6), 2),
    }
)

In [34]:
# Show the two time series next to each other before merging them.
display_side_by_side(df1, df2, titles=["df1", "df2"])

Unnamed: 0,time,left_value
0,2022-12-09 00:00:00,0.3
1,2022-12-09 00:00:02,0.04
2,2022-12-09 00:00:04,0.62
3,2022-12-09 00:00:06,0.7
4,2022-12-09 00:00:08,0.43
5,2022-12-09 00:00:10,0.72
6,2022-12-09 00:00:12,0.28

Unnamed: 0,time,right_value
0,2022-12-09 00:00:00,0.47
1,2022-12-09 00:00:03,0.72
2,2022-12-09 00:00:06,0.39
3,2022-12-09 00:00:09,0.46
4,2022-12-09 00:00:12,0.18
5,2022-12-09 00:00:15,0.25


## Exercise 21 - merge_asof

For each row in the left DataFrame:

* A “backward” search selects the last row in the right DataFrame whose ‘on’ key is less than or equal to the left’s key.

* A “forward” search selects the first row in the right DataFrame whose ‘on’ key is greater than or equal to the left’s key.

* A “nearest” search selects the row in the right DataFrame whose ‘on’ key is closest in absolute distance to the left’s key.

In [35]:
# Show the inputs again as a reference for the merge_asof examples.
display_side_by_side(df1, df2, titles=["df1", "df2"])

Unnamed: 0,time,left_value
0,2022-12-09 00:00:00,0.3
1,2022-12-09 00:00:02,0.04
2,2022-12-09 00:00:04,0.62
3,2022-12-09 00:00:06,0.7
4,2022-12-09 00:00:08,0.43
5,2022-12-09 00:00:10,0.72
6,2022-12-09 00:00:12,0.28

Unnamed: 0,time,right_value
0,2022-12-09 00:00:00,0.47
1,2022-12-09 00:00:03,0.72
2,2022-12-09 00:00:06,0.39
3,2022-12-09 00:00:09,0.46
4,2022-12-09 00:00:12,0.18
5,2022-12-09 00:00:15,0.25


In [36]:
# Default (backward) search: each df1 row receives the last df2 row whose
# time is less than or equal to its own, e.g. 00:00:02 is paired with 00:00:00.
merged_df = pd.merge_asof(df1, df2, on="time")

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,time,left_value
0,2022-12-09 00:00:00,0.3
1,2022-12-09 00:00:02,0.04
2,2022-12-09 00:00:04,0.62
3,2022-12-09 00:00:06,0.7
4,2022-12-09 00:00:08,0.43
5,2022-12-09 00:00:10,0.72
6,2022-12-09 00:00:12,0.28

Unnamed: 0,time,right_value
0,2022-12-09 00:00:00,0.47
1,2022-12-09 00:00:03,0.72
2,2022-12-09 00:00:06,0.39
3,2022-12-09 00:00:09,0.46
4,2022-12-09 00:00:12,0.18
5,2022-12-09 00:00:15,0.25

Unnamed: 0,time,left_value,right_value
0,2022-12-09 00:00:00,0.3,0.47
1,2022-12-09 00:00:02,0.04,0.47
2,2022-12-09 00:00:04,0.62,0.72
3,2022-12-09 00:00:06,0.7,0.39
4,2022-12-09 00:00:08,0.43,0.39
5,2022-12-09 00:00:10,0.72,0.46
6,2022-12-09 00:00:12,0.28,0.18


## Exercise 22 - default direction

In [37]:
# Passing direction="backward" explicitly gives the same result as the default.
merged_df = pd.merge_asof(df1, df2, on="time", direction="backward")

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,time,left_value
0,2022-12-09 00:00:00,0.3
1,2022-12-09 00:00:02,0.04
2,2022-12-09 00:00:04,0.62
3,2022-12-09 00:00:06,0.7
4,2022-12-09 00:00:08,0.43
5,2022-12-09 00:00:10,0.72
6,2022-12-09 00:00:12,0.28

Unnamed: 0,time,right_value
0,2022-12-09 00:00:00,0.47
1,2022-12-09 00:00:03,0.72
2,2022-12-09 00:00:06,0.39
3,2022-12-09 00:00:09,0.46
4,2022-12-09 00:00:12,0.18
5,2022-12-09 00:00:15,0.25

Unnamed: 0,time,left_value,right_value
0,2022-12-09 00:00:00,0.3,0.47
1,2022-12-09 00:00:02,0.04,0.47
2,2022-12-09 00:00:04,0.62,0.72
3,2022-12-09 00:00:06,0.7,0.39
4,2022-12-09 00:00:08,0.43,0.39
5,2022-12-09 00:00:10,0.72,0.46
6,2022-12-09 00:00:12,0.28,0.18


## Exercise 23 - different column names

In [38]:
# Temporarily give df2's key a different name to demonstrate left_on/right_on.
df2 = df2.rename(columns={"time": "record_time"})

In [39]:
# Name the key per side. Both time columns are kept, so `record_time` shows
# which df2 row was matched to each df1 row.
merged_df = pd.merge_asof(df1, df2, left_on="time", right_on="record_time")

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,time,left_value
0,2022-12-09 00:00:00,0.3
1,2022-12-09 00:00:02,0.04
2,2022-12-09 00:00:04,0.62
3,2022-12-09 00:00:06,0.7
4,2022-12-09 00:00:08,0.43
5,2022-12-09 00:00:10,0.72
6,2022-12-09 00:00:12,0.28

Unnamed: 0,record_time,right_value
0,2022-12-09 00:00:00,0.47
1,2022-12-09 00:00:03,0.72
2,2022-12-09 00:00:06,0.39
3,2022-12-09 00:00:09,0.46
4,2022-12-09 00:00:12,0.18
5,2022-12-09 00:00:15,0.25

Unnamed: 0,time,left_value,record_time,right_value
0,2022-12-09 00:00:00,0.3,2022-12-09 00:00:00,0.47
1,2022-12-09 00:00:02,0.04,2022-12-09 00:00:00,0.47
2,2022-12-09 00:00:04,0.62,2022-12-09 00:00:03,0.72
3,2022-12-09 00:00:06,0.7,2022-12-09 00:00:06,0.39
4,2022-12-09 00:00:08,0.43,2022-12-09 00:00:06,0.39
5,2022-12-09 00:00:10,0.72,2022-12-09 00:00:09,0.46
6,2022-12-09 00:00:12,0.28,2022-12-09 00:00:12,0.18


In [40]:
# Restore the original column name so the following cells can use on="time".
df2 = df2.rename(columns={"record_time": "time"})

## Exercise 24 - Timedelta

In [41]:
# A duration of one second.
pd.Timedelta("1s")

Timedelta('0 days 00:00:01')

In [42]:
# A duration of two seconds.
pd.Timedelta("2s")

Timedelta('0 days 00:00:02')

In [43]:
# In a Timedelta string "m" stands for minutes: this is one minute.
pd.Timedelta("1m")

Timedelta('0 days 00:01:00')

## Exercise 25 - tolerance

In [44]:
# Baseline without a tolerance: every df1 row gets a match, however far back.
merged_df = pd.merge_asof(df1, df2, on="time")

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,time,left_value
0,2022-12-09 00:00:00,0.3
1,2022-12-09 00:00:02,0.04
2,2022-12-09 00:00:04,0.62
3,2022-12-09 00:00:06,0.7
4,2022-12-09 00:00:08,0.43
5,2022-12-09 00:00:10,0.72
6,2022-12-09 00:00:12,0.28

Unnamed: 0,time,right_value
0,2022-12-09 00:00:00,0.47
1,2022-12-09 00:00:03,0.72
2,2022-12-09 00:00:06,0.39
3,2022-12-09 00:00:09,0.46
4,2022-12-09 00:00:12,0.18
5,2022-12-09 00:00:15,0.25

Unnamed: 0,time,left_value,right_value
0,2022-12-09 00:00:00,0.3,0.47
1,2022-12-09 00:00:02,0.04,0.47
2,2022-12-09 00:00:04,0.62,0.72
3,2022-12-09 00:00:06,0.7,0.39
4,2022-12-09 00:00:08,0.43,0.39
5,2022-12-09 00:00:10,0.72,0.46
6,2022-12-09 00:00:12,0.28,0.18


In [45]:
# Accept a match only if it is at most 1 second away. The rows at 00:00:02
# and 00:00:08 sit 2 seconds after the previous df2 row, so they get NaN.
merged_df = pd.merge_asof(df1, df2, on="time", tolerance=pd.Timedelta("1s"))

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,time,left_value
0,2022-12-09 00:00:00,0.3
1,2022-12-09 00:00:02,0.04
2,2022-12-09 00:00:04,0.62
3,2022-12-09 00:00:06,0.7
4,2022-12-09 00:00:08,0.43
5,2022-12-09 00:00:10,0.72
6,2022-12-09 00:00:12,0.28

Unnamed: 0,time,right_value
0,2022-12-09 00:00:00,0.47
1,2022-12-09 00:00:03,0.72
2,2022-12-09 00:00:06,0.39
3,2022-12-09 00:00:09,0.46
4,2022-12-09 00:00:12,0.18
5,2022-12-09 00:00:15,0.25

Unnamed: 0,time,left_value,right_value
0,2022-12-09 00:00:00,0.3,0.47
1,2022-12-09 00:00:02,0.04,
2,2022-12-09 00:00:04,0.62,0.72
3,2022-12-09 00:00:06,0.7,0.39
4,2022-12-09 00:00:08,0.43,
5,2022-12-09 00:00:10,0.72,0.46
6,2022-12-09 00:00:12,0.28,0.18


## Exercise 26 - direction parameter

* Whether to search for prior (backward), subsequent (forward), or closest (nearest) matches.
* Default value is backward

In [46]:
# Forward search: each df1 row receives the first df2 row whose time is
# greater than or equal to its own, e.g. 00:00:02 is paired with 00:00:03.
merged_df = pd.merge_asof(df1, df2, on="time", direction="forward")

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,time,left_value
0,2022-12-09 00:00:00,0.3
1,2022-12-09 00:00:02,0.04
2,2022-12-09 00:00:04,0.62
3,2022-12-09 00:00:06,0.7
4,2022-12-09 00:00:08,0.43
5,2022-12-09 00:00:10,0.72
6,2022-12-09 00:00:12,0.28

Unnamed: 0,time,right_value
0,2022-12-09 00:00:00,0.47
1,2022-12-09 00:00:03,0.72
2,2022-12-09 00:00:06,0.39
3,2022-12-09 00:00:09,0.46
4,2022-12-09 00:00:12,0.18
5,2022-12-09 00:00:15,0.25

Unnamed: 0,time,left_value,right_value
0,2022-12-09 00:00:00,0.3,0.47
1,2022-12-09 00:00:02,0.04,0.72
2,2022-12-09 00:00:04,0.62,0.39
3,2022-12-09 00:00:06,0.7,0.39
4,2022-12-09 00:00:08,0.43,0.46
5,2022-12-09 00:00:10,0.72,0.18
6,2022-12-09 00:00:12,0.28,0.18


## Exercise 27 - direction parameter

In [47]:
# Nearest search: each df1 row receives the df2 row closest in time,
# whether it lies before or after.
merged_df = pd.merge_asof(df1, df2, on="time", direction="nearest")

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,time,left_value
0,2022-12-09 00:00:00,0.3
1,2022-12-09 00:00:02,0.04
2,2022-12-09 00:00:04,0.62
3,2022-12-09 00:00:06,0.7
4,2022-12-09 00:00:08,0.43
5,2022-12-09 00:00:10,0.72
6,2022-12-09 00:00:12,0.28

Unnamed: 0,time,right_value
0,2022-12-09 00:00:00,0.47
1,2022-12-09 00:00:03,0.72
2,2022-12-09 00:00:06,0.39
3,2022-12-09 00:00:09,0.46
4,2022-12-09 00:00:12,0.18
5,2022-12-09 00:00:15,0.25

Unnamed: 0,time,left_value,right_value
0,2022-12-09 00:00:00,0.3,0.47
1,2022-12-09 00:00:02,0.04,0.72
2,2022-12-09 00:00:04,0.62,0.72
3,2022-12-09 00:00:06,0.7,0.39
4,2022-12-09 00:00:08,0.43,0.46
5,2022-12-09 00:00:10,0.72,0.46
6,2022-12-09 00:00:12,0.28,0.18


## Exercise 28 - direction and tolerance

In [48]:
# Forward search limited to 1 second: the rows at 00:00:04 and 00:00:10 are
# 2 seconds before the next df2 row, so they stay unmatched (NaN).
merged_df = pd.merge_asof(
    df1, 
    df2, 
    on="time", 
    direction="forward", 
    tolerance=pd.Timedelta("1s")
)

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,time,left_value
0,2022-12-09 00:00:00,0.3
1,2022-12-09 00:00:02,0.04
2,2022-12-09 00:00:04,0.62
3,2022-12-09 00:00:06,0.7
4,2022-12-09 00:00:08,0.43
5,2022-12-09 00:00:10,0.72
6,2022-12-09 00:00:12,0.28

Unnamed: 0,time,right_value
0,2022-12-09 00:00:00,0.47
1,2022-12-09 00:00:03,0.72
2,2022-12-09 00:00:06,0.39
3,2022-12-09 00:00:09,0.46
4,2022-12-09 00:00:12,0.18
5,2022-12-09 00:00:15,0.25

Unnamed: 0,time,left_value,right_value
0,2022-12-09 00:00:00,0.3,0.47
1,2022-12-09 00:00:02,0.04,0.72
2,2022-12-09 00:00:04,0.62,
3,2022-12-09 00:00:06,0.7,0.39
4,2022-12-09 00:00:08,0.43,0.46
5,2022-12-09 00:00:10,0.72,
6,2022-12-09 00:00:12,0.28,0.18


In [49]:
# Nearest search limited to 1 second: with this data every df1 row has a
# df2 row at most 1 second away, so no value is dropped.
merged_df = pd.merge_asof(
    df1, 
    df2, 
    on="time", 
    direction="nearest", 
    tolerance=pd.Timedelta("1s")
)

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,time,left_value
0,2022-12-09 00:00:00,0.3
1,2022-12-09 00:00:02,0.04
2,2022-12-09 00:00:04,0.62
3,2022-12-09 00:00:06,0.7
4,2022-12-09 00:00:08,0.43
5,2022-12-09 00:00:10,0.72
6,2022-12-09 00:00:12,0.28

Unnamed: 0,time,right_value
0,2022-12-09 00:00:00,0.47
1,2022-12-09 00:00:03,0.72
2,2022-12-09 00:00:06,0.39
3,2022-12-09 00:00:09,0.46
4,2022-12-09 00:00:12,0.18
5,2022-12-09 00:00:15,0.25

Unnamed: 0,time,left_value,right_value
0,2022-12-09 00:00:00,0.3,0.47
1,2022-12-09 00:00:02,0.04,0.72
2,2022-12-09 00:00:04,0.62,0.72
3,2022-12-09 00:00:06,0.7,0.39
4,2022-12-09 00:00:08,0.43,0.46
5,2022-12-09 00:00:10,0.72,0.46
6,2022-12-09 00:00:12,0.28,0.18


## Exercise 29 - allow_exact_matches

* Default value is True
* If True, allow matching with the same ‘on’ value (i.e. less-than-or-equal-to / greater-than-or-equal-to)
* If False, don’t match the same ‘on’ value (i.e., strictly less-than / strictly greater-than).

In [50]:
# Exclude equal timestamps: the (default backward) search now needs a strictly
# earlier df2 row. The first row has none and gets NaN; 00:00:06 falls back
# to the df2 row at 00:00:03.
merged_df = pd.merge_asof(
    df1, 
    df2, 
    on="time", 
    allow_exact_matches=False
)

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,time,left_value
0,2022-12-09 00:00:00,0.3
1,2022-12-09 00:00:02,0.04
2,2022-12-09 00:00:04,0.62
3,2022-12-09 00:00:06,0.7
4,2022-12-09 00:00:08,0.43
5,2022-12-09 00:00:10,0.72
6,2022-12-09 00:00:12,0.28

Unnamed: 0,time,right_value
0,2022-12-09 00:00:00,0.47
1,2022-12-09 00:00:03,0.72
2,2022-12-09 00:00:06,0.39
3,2022-12-09 00:00:09,0.46
4,2022-12-09 00:00:12,0.18
5,2022-12-09 00:00:15,0.25

Unnamed: 0,time,left_value,right_value
0,2022-12-09 00:00:00,0.3,
1,2022-12-09 00:00:02,0.04,0.47
2,2022-12-09 00:00:04,0.62,0.72
3,2022-12-09 00:00:06,0.7,0.72
4,2022-12-09 00:00:08,0.43,0.39
5,2022-12-09 00:00:10,0.72,0.46
6,2022-12-09 00:00:12,0.28,0.46


## Exercise 30 - by parameter

In [51]:
# Label every measurement with a group: df1 gets 4 x "AA" then 3 x "BB",
# df2 gets 3 x "AA" then 3 x "BB". The labels are used by the `by` parameter.
df1 = df1.assign(group=["AA"] * 4 + ["BB"] * 3)
df2 = df2.assign(group=["AA"] * 3 + ["BB"] * 3)

display_side_by_side(df1, df2, titles=["df1", "df2"])

Unnamed: 0,time,left_value,group
0,2022-12-09 00:00:00,0.3,AA
1,2022-12-09 00:00:02,0.04,AA
2,2022-12-09 00:00:04,0.62,AA
3,2022-12-09 00:00:06,0.7,AA
4,2022-12-09 00:00:08,0.43,BB
5,2022-12-09 00:00:10,0.72,BB
6,2022-12-09 00:00:12,0.28,BB

Unnamed: 0,time,right_value,group
0,2022-12-09 00:00:00,0.47,AA
1,2022-12-09 00:00:03,0.72,AA
2,2022-12-09 00:00:06,0.39,AA
3,2022-12-09 00:00:09,0.46,BB
4,2022-12-09 00:00:12,0.18,BB
5,2022-12-09 00:00:15,0.25,BB


In [52]:
# `by` restricts the as-of search to rows with the same group label.
# The BB row at 00:00:08 has no earlier BB row in df2, so it gets NaN.
merged_df = pd.merge_asof(df1, df2, on="time", by="group")

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,time,left_value,group
0,2022-12-09 00:00:00,0.3,AA
1,2022-12-09 00:00:02,0.04,AA
2,2022-12-09 00:00:04,0.62,AA
3,2022-12-09 00:00:06,0.7,AA
4,2022-12-09 00:00:08,0.43,BB
5,2022-12-09 00:00:10,0.72,BB
6,2022-12-09 00:00:12,0.28,BB

Unnamed: 0,time,right_value,group
0,2022-12-09 00:00:00,0.47,AA
1,2022-12-09 00:00:03,0.72,AA
2,2022-12-09 00:00:06,0.39,AA
3,2022-12-09 00:00:09,0.46,BB
4,2022-12-09 00:00:12,0.18,BB
5,2022-12-09 00:00:15,0.25,BB

Unnamed: 0,time,left_value,group,right_value
0,2022-12-09 00:00:00,0.3,AA,0.47
1,2022-12-09 00:00:02,0.04,AA,0.47
2,2022-12-09 00:00:04,0.62,AA,0.72
3,2022-12-09 00:00:06,0.7,AA,0.39
4,2022-12-09 00:00:08,0.43,BB,
5,2022-12-09 00:00:10,0.72,BB,0.46
6,2022-12-09 00:00:12,0.28,BB,0.18


## Exercise 31 - Financial data

* This exercise is from the pandas documentation

In [53]:
# Quote feed: bid/ask snapshots per ticker, listed in time order.
# All timestamps share the same second; only the milliseconds differ.
quotes = pd.DataFrame(
    {
        "time": [
            pd.Timestamp(f"2016-05-25 13:30:00.{ms:03d}")
            for ms in (23, 23, 30, 41, 48, 49, 72, 75)
        ],
        "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"],
        "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
        "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
    }
)

# Executed trades, also listed in time order.
trades = pd.DataFrame(
    {
        "time": [
            pd.Timestamp(f"2016-05-25 13:30:00.{ms:03d}")
            for ms in (23, 38, 48, 48, 48)
        ],
        "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
        "price": [51.95, 51.95, 720.77, 720.92, 98.0],
        "quantity": [75, 155, 100, 100, 100],
    }
)

In [54]:
# Show trades and quotes next to each other before merging them.
display_side_by_side(trades, quotes, titles=["trades", "quotes"])

Unnamed: 0,time,ticker,price,quantity
0,2016-05-25 13:30:00.023,MSFT,51.95,75
1,2016-05-25 13:30:00.038,MSFT,51.95,155
2,2016-05-25 13:30:00.048,GOOG,720.77,100
3,2016-05-25 13:30:00.048,GOOG,720.92,100
4,2016-05-25 13:30:00.048,AAPL,98.0,100

Unnamed: 0,time,ticker,bid,ask
0,2016-05-25 13:30:00.023,GOOG,720.5,720.93
1,2016-05-25 13:30:00.023,MSFT,51.95,51.96
2,2016-05-25 13:30:00.030,MSFT,51.97,51.98
3,2016-05-25 13:30:00.041,MSFT,51.99,52.0
4,2016-05-25 13:30:00.048,GOOG,720.5,720.93
5,2016-05-25 13:30:00.049,AAPL,97.99,98.01
6,2016-05-25 13:30:00.072,GOOG,720.5,720.88
7,2016-05-25 13:30:00.075,MSFT,52.01,52.03


In [55]:
# Attach to each trade the most recent quote of the same ticker.
# The only AAPL quote (…00.049) comes after the AAPL trade (…00.048), so that
# trade gets NaN bid/ask.
pd.merge_asof(trades, quotes, on="time", by="ticker")

Unnamed: 0,time,ticker,price,quantity,bid,ask
0,2016-05-25 13:30:00.023,MSFT,51.95,75,51.95,51.96
1,2016-05-25 13:30:00.038,MSFT,51.95,155,51.97,51.98
2,2016-05-25 13:30:00.048,GOOG,720.77,100,720.5,720.93
3,2016-05-25 13:30:00.048,GOOG,720.92,100,720.5,720.93
4,2016-05-25 13:30:00.048,AAPL,98.0,100,,


## Exercise 32 - Financial data

In [56]:
# Only accept quotes at most 5 ms older than the trade. The MSFT trade at
# …00.038 loses its match because the latest MSFT quote (…00.030) is 8 ms old.
merged_df = pd.merge_asof(
    trades, 
    quotes, 
    on="time", 
    by="ticker",
    tolerance=pd.Timedelta("5ms")
)

merged_df

Unnamed: 0,time,ticker,price,quantity,bid,ask
0,2016-05-25 13:30:00.023,MSFT,51.95,75,51.95,51.96
1,2016-05-25 13:30:00.038,MSFT,51.95,155,,
2,2016-05-25 13:30:00.048,GOOG,720.77,100,720.5,720.93
3,2016-05-25 13:30:00.048,GOOG,720.92,100,720.5,720.93
4,2016-05-25 13:30:00.048,AAPL,98.0,100,,


## Exercise 33 - Financial data

In [57]:
# Show the inputs again as a reference for the allow_exact_matches example.
display_side_by_side(trades, quotes, titles=["trades", "quotes"])

Unnamed: 0,time,ticker,price,quantity
0,2016-05-25 13:30:00.023,MSFT,51.95,75
1,2016-05-25 13:30:00.038,MSFT,51.95,155
2,2016-05-25 13:30:00.048,GOOG,720.77,100
3,2016-05-25 13:30:00.048,GOOG,720.92,100
4,2016-05-25 13:30:00.048,AAPL,98.0,100

Unnamed: 0,time,ticker,bid,ask
0,2016-05-25 13:30:00.023,GOOG,720.5,720.93
1,2016-05-25 13:30:00.023,MSFT,51.95,51.96
2,2016-05-25 13:30:00.030,MSFT,51.97,51.98
3,2016-05-25 13:30:00.041,MSFT,51.99,52.0
4,2016-05-25 13:30:00.048,GOOG,720.5,720.93
5,2016-05-25 13:30:00.049,AAPL,97.99,98.01
6,2016-05-25 13:30:00.072,GOOG,720.5,720.88
7,2016-05-25 13:30:00.075,MSFT,52.01,52.03


In [58]:
# Ignore quotes stamped at exactly the trade time. The MSFT trade at …00.023
# has no strictly earlier MSFT quote and gets NaN; the GOOG trades at …00.048
# fall back to the GOOG quote at …00.023, which carries the same bid/ask.
merged_df = pd.merge_asof(
    trades, 
    quotes, 
    on="time", 
    by="ticker",
    allow_exact_matches=False
)

merged_df

Unnamed: 0,time,ticker,price,quantity,bid,ask
0,2016-05-25 13:30:00.023,MSFT,51.95,75,,
1,2016-05-25 13:30:00.038,MSFT,51.95,155,51.97,51.98
2,2016-05-25 13:30:00.048,GOOG,720.77,100,720.5,720.93
3,2016-05-25 13:30:00.048,GOOG,720.92,100,720.5,720.93
4,2016-05-25 13:30:00.048,AAPL,98.0,100,,


## Exercise 34 - merge_ordered

* Perform a merge for ordered data with optional filling/interpolation.

In [59]:
# Show the two time series (now carrying a `group` column) before merging.
display_side_by_side(df1, df2, titles=["df1", "df2"])

Unnamed: 0,time,left_value,group
0,2022-12-09 00:00:00,0.3,AA
1,2022-12-09 00:00:02,0.04,AA
2,2022-12-09 00:00:04,0.62,AA
3,2022-12-09 00:00:06,0.7,AA
4,2022-12-09 00:00:08,0.43,BB
5,2022-12-09 00:00:10,0.72,BB
6,2022-12-09 00:00:12,0.28,BB

Unnamed: 0,time,right_value,group
0,2022-12-09 00:00:00,0.47,AA
1,2022-12-09 00:00:03,0.72,AA
2,2022-12-09 00:00:06,0.39,AA
3,2022-12-09 00:00:09,0.46,BB
4,2022-12-09 00:00:12,0.18,BB
5,2022-12-09 00:00:15,0.25,BB


In [60]:
# No `on` is given, so the frames are merged on the columns they share
# (`time` and `group`). Rows from both frames are kept in time order, and
# values missing on one side are NaN.
merged_df = pd.merge_ordered(df1, df2)

display_side_by_side(df1, df2, merged_df, titles=["df1", "df2", "merged_df"])

Unnamed: 0,time,left_value,group
0,2022-12-09 00:00:00,0.3,AA
1,2022-12-09 00:00:02,0.04,AA
2,2022-12-09 00:00:04,0.62,AA
3,2022-12-09 00:00:06,0.7,AA
4,2022-12-09 00:00:08,0.43,BB
5,2022-12-09 00:00:10,0.72,BB
6,2022-12-09 00:00:12,0.28,BB

Unnamed: 0,time,right_value,group
0,2022-12-09 00:00:00,0.47,AA
1,2022-12-09 00:00:03,0.72,AA
2,2022-12-09 00:00:06,0.39,AA
3,2022-12-09 00:00:09,0.46,BB
4,2022-12-09 00:00:12,0.18,BB
5,2022-12-09 00:00:15,0.25,BB

Unnamed: 0,time,left_value,group,right_value
0,2022-12-09 00:00:00,0.3,AA,0.47
1,2022-12-09 00:00:02,0.04,AA,
2,2022-12-09 00:00:03,,AA,0.72
3,2022-12-09 00:00:04,0.62,AA,
4,2022-12-09 00:00:06,0.7,AA,0.39
5,2022-12-09 00:00:08,0.43,BB,
6,2022-12-09 00:00:09,,BB,0.46
7,2022-12-09 00:00:10,0.72,BB,
8,2022-12-09 00:00:12,0.28,BB,0.18
9,2022-12-09 00:00:15,,BB,0.25


## Exercise 35 - merge_ordered - fill_method parameter

* Interpolation method for data.
* Default value is None.
* The only supported option is `"ffill"` (forward fill).

In [61]:
# Inputs for the fill_method comparison below.
display_side_by_side(
    df1,
    df2,
    titles=["df1", "df2"],
)

Unnamed: 0,time,left_value,group
0,2022-12-09 00:00:00,0.3,AA
1,2022-12-09 00:00:02,0.04,AA
2,2022-12-09 00:00:04,0.62,AA
3,2022-12-09 00:00:06,0.7,AA
4,2022-12-09 00:00:08,0.43,BB
5,2022-12-09 00:00:10,0.72,BB
6,2022-12-09 00:00:12,0.28,BB

Unnamed: 0,time,right_value,group
0,2022-12-09 00:00:00,0.47,AA
1,2022-12-09 00:00:03,0.72,AA
2,2022-12-09 00:00:06,0.39,AA
3,2022-12-09 00:00:09,0.46,BB
4,2022-12-09 00:00:12,0.18,BB
5,2022-12-09 00:00:15,0.25,BB


In [62]:
# Baseline for comparison: fill_method=None is the default, so gaps created
# by rows that exist on only one side are left as NaN.
merged_df = pd.merge_ordered(left=df1, right=df2, fill_method=None)

display_side_by_side(
    df1,
    df2,
    merged_df,
    titles=["df1", "df2", "merged_df"],
)

Unnamed: 0,time,left_value,group
0,2022-12-09 00:00:00,0.3,AA
1,2022-12-09 00:00:02,0.04,AA
2,2022-12-09 00:00:04,0.62,AA
3,2022-12-09 00:00:06,0.7,AA
4,2022-12-09 00:00:08,0.43,BB
5,2022-12-09 00:00:10,0.72,BB
6,2022-12-09 00:00:12,0.28,BB

Unnamed: 0,time,right_value,group
0,2022-12-09 00:00:00,0.47,AA
1,2022-12-09 00:00:03,0.72,AA
2,2022-12-09 00:00:06,0.39,AA
3,2022-12-09 00:00:09,0.46,BB
4,2022-12-09 00:00:12,0.18,BB
5,2022-12-09 00:00:15,0.25,BB

Unnamed: 0,time,left_value,group,right_value
0,2022-12-09 00:00:00,0.3,AA,0.47
1,2022-12-09 00:00:02,0.04,AA,
2,2022-12-09 00:00:03,,AA,0.72
3,2022-12-09 00:00:04,0.62,AA,
4,2022-12-09 00:00:06,0.7,AA,0.39
5,2022-12-09 00:00:08,0.43,BB,
6,2022-12-09 00:00:09,,BB,0.46
7,2022-12-09 00:00:10,0.72,BB,
8,2022-12-09 00:00:12,0.28,BB,0.18
9,2022-12-09 00:00:15,,BB,0.25


In [63]:
# Same merge, but forward-fill the gaps: each missing value takes the most
# recent earlier value in its column. No grouping is requested here, so a
# value may be carried across a change in "group".
merged_df = pd.merge_ordered(
    left=df1,
    right=df2,
    fill_method="ffill",
)

display_side_by_side(
    df1,
    df2,
    merged_df,
    titles=["df1", "df2", "merged_df"],
)

Unnamed: 0,time,left_value,group
0,2022-12-09 00:00:00,0.3,AA
1,2022-12-09 00:00:02,0.04,AA
2,2022-12-09 00:00:04,0.62,AA
3,2022-12-09 00:00:06,0.7,AA
4,2022-12-09 00:00:08,0.43,BB
5,2022-12-09 00:00:10,0.72,BB
6,2022-12-09 00:00:12,0.28,BB

Unnamed: 0,time,right_value,group
0,2022-12-09 00:00:00,0.47,AA
1,2022-12-09 00:00:03,0.72,AA
2,2022-12-09 00:00:06,0.39,AA
3,2022-12-09 00:00:09,0.46,BB
4,2022-12-09 00:00:12,0.18,BB
5,2022-12-09 00:00:15,0.25,BB

Unnamed: 0,time,left_value,group,right_value
0,2022-12-09 00:00:00,0.3,AA,0.47
1,2022-12-09 00:00:02,0.04,AA,0.47
2,2022-12-09 00:00:03,0.04,AA,0.72
3,2022-12-09 00:00:04,0.62,AA,0.72
4,2022-12-09 00:00:06,0.7,AA,0.39
5,2022-12-09 00:00:08,0.43,BB,0.39
6,2022-12-09 00:00:09,0.43,BB,0.46
7,2022-12-09 00:00:10,0.72,BB,0.46
8,2022-12-09 00:00:12,0.28,BB,0.18
9,2022-12-09 00:00:15,0.28,BB,0.25


## Exercise 36 - merge_ordered - left_by and right_by parameters

* left_by: Group left DataFrame by group columns and merge piece by piece with right DataFrame.

* right_by: Group right DataFrame by group columns and merge piece by piece with left DataFrame.

In [64]:
# Inputs for the left_by example below.
display_side_by_side(
    df1,
    df2,
    titles=["df1", "df2"],
)

Unnamed: 0,time,left_value,group
0,2022-12-09 00:00:00,0.3,AA
1,2022-12-09 00:00:02,0.04,AA
2,2022-12-09 00:00:04,0.62,AA
3,2022-12-09 00:00:06,0.7,AA
4,2022-12-09 00:00:08,0.43,BB
5,2022-12-09 00:00:10,0.72,BB
6,2022-12-09 00:00:12,0.28,BB

Unnamed: 0,time,right_value,group
0,2022-12-09 00:00:00,0.47,AA
1,2022-12-09 00:00:03,0.72,AA
2,2022-12-09 00:00:06,0.39,AA
3,2022-12-09 00:00:09,0.46,BB
4,2022-12-09 00:00:12,0.18,BB
5,2022-12-09 00:00:15,0.25,BB


In [65]:
# left_by="group" splits df1 by its "group" column and merges each piece
# separately, so the forward fill restarts for every group. A group's first
# row stays NaN when it has no earlier value to inherit.
merged_df = pd.merge_ordered(
    left=df1,
    right=df2,
    fill_method="ffill",
    left_by="group",
)

display_side_by_side(
    df1,
    df2,
    merged_df,
    titles=["df1", "df2", "merged_df"],
)

Unnamed: 0,time,left_value,group
0,2022-12-09 00:00:00,0.3,AA
1,2022-12-09 00:00:02,0.04,AA
2,2022-12-09 00:00:04,0.62,AA
3,2022-12-09 00:00:06,0.7,AA
4,2022-12-09 00:00:08,0.43,BB
5,2022-12-09 00:00:10,0.72,BB
6,2022-12-09 00:00:12,0.28,BB

Unnamed: 0,time,right_value,group
0,2022-12-09 00:00:00,0.47,AA
1,2022-12-09 00:00:03,0.72,AA
2,2022-12-09 00:00:06,0.39,AA
3,2022-12-09 00:00:09,0.46,BB
4,2022-12-09 00:00:12,0.18,BB
5,2022-12-09 00:00:15,0.25,BB

Unnamed: 0,time,left_value,group,right_value
0,2022-12-09 00:00:00,0.3,AA,0.47
1,2022-12-09 00:00:02,0.04,AA,0.47
2,2022-12-09 00:00:03,0.04,AA,0.72
3,2022-12-09 00:00:04,0.62,AA,0.72
4,2022-12-09 00:00:06,0.7,AA,0.39
5,2022-12-09 00:00:08,0.43,BB,
6,2022-12-09 00:00:09,0.43,BB,0.46
7,2022-12-09 00:00:10,0.72,BB,0.46
8,2022-12-09 00:00:12,0.28,BB,0.18
9,2022-12-09 00:00:15,0.28,BB,0.25
