# Combining DataFrames

## Side-by-side

<img src="Assets/concat_1.png" class="juno_ui_theme_light" style="width:600px">

## On top of each other

<img src="Assets/concat_2.png" class="juno_ui_theme_light" style="width:500px">

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so that Restart & Run All yields the same random
# frames (here and in every later cell that calls np.random.randint).
np.random.seed(42)

# Two 3x3 frames of random integers in [0, 10) with the default 0..2 index
# and disjoint column labels (A-C vs D-F).
df1 = pd.DataFrame(np.random.randint(0, 10, size=(3, 3)), columns=["A","B","C"])
df2 = pd.DataFrame(np.random.randint(0, 10, size=(3, 3)), columns=["D","E","F"])

In [2]:
# Helper for displaying DataFrames side by side.
# For background, search for "Jupyter notebook display two pandas tables side by side".

from IPython.display import display_html
from itertools import chain,cycle

def display_side_by_side(*args,titles=cycle([''])):
    """Show several DataFrames next to each other in a single HTML output.

    Parameters
    ----------
    *args
        Objects exposing ``to_html()`` (e.g. pandas DataFrames), rendered in
        the order given.
    titles : iterable of str, optional
        Heading placed above each frame, in the same order as ``args``.
        Titles are inserted into the markup as-is (they are not escaped).
        When there are fewer titles than frames, the remaining frames get the
        ``'</br>'`` filler as their heading. The default is an endless stream
        of empty strings; it is created once at definition time, which is
        harmless because it always yields the same value.

    Returns
    -------
    None
        The assembled markup is handed to ``display_html(..., raw=True)``.
    """
    pieces = []
    # chain() falls through to the endless filler once `titles` is exhausted,
    # so zip() is bounded only by the number of frames.
    for df, title in zip(args, chain(titles, cycle(['</br>']))):
        # Patch only the opening tag. Replacing the bare word "table" would
        # also rewrite "</table>" and any label or cell text containing it.
        table_html = df.to_html().replace('<table', '<table style="display:inline"')
        pieces.append('<th style="text-align:center"><td style="vertical-align:top">')
        pieces.append(f'<h2 style="text-align: center;">{title}</h2>')
        pieces.append(table_html)
        pieces.append('</td></th>')
    display_html(''.join(pieces), raw=True)

In [3]:
# Show the two input frames next to each other, each under its own title.
display_side_by_side(df1, df2, titles=["df1", "df2"])

Unnamed: 0,A,B,C
0,4,1,1
1,5,1,1
2,4,5,3

Unnamed: 0,D,E,F
0,6,4,1
1,2,6,7
2,2,5,3


## Exercise 1

* Concatenate along columns (axis=1)

In [4]:
# axis=1 puts df2's columns (D-F) to the right of df1's (A-C); both frames share the index 0-2.
pd.concat([df1, df2], axis=1)

Unnamed: 0,A,B,C,D,E,F
0,4,1,1,6,4,1
1,5,1,1,2,6,7
2,4,5,3,2,5,3


In [5]:
# Keep the result in a variable so it can be shown next to its inputs.
combined_df = pd.concat([df1, df2], axis=1)

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C
0,4,1,1
1,5,1,1
2,4,5,3

Unnamed: 0,D,E,F
0,6,4,1
1,2,6,7
2,2,5,3

Unnamed: 0,A,B,C,D,E,F
0,4,1,1,6,4,1
1,5,1,1,2,6,7
2,4,5,3,2,5,3


## Exercise 2

In [6]:
# Redefine df2 with index labels 3-5, which share nothing with df1's 0-2.
df2 = pd.DataFrame(np.random.randint(0, 10, size=(3, 3)), columns=["D","E","F"], index=[3, 4, 5])

df2

Unnamed: 0,D,E,F
3,4,1,3
4,1,8,9
5,8,3,2


In [7]:
# No index label is shared, so every row is NaN on one side.
# Note that the values are shown as floats (4.0, 1.0, ...) once NaN is present.
combined_df = pd.concat([df1, df2], axis=1)

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C
0,4,1,1
1,5,1,1
2,4,5,3

Unnamed: 0,D,E,F
3,4,1,3
4,1,8,9
5,8,3,2

Unnamed: 0,A,B,C,D,E,F
0,4.0,1.0,1.0,,,
1,5.0,1.0,1.0,,,
2,4.0,5.0,3.0,,,
3,,,,4.0,1.0,3.0
4,,,,1.0,8.0,9.0
5,,,,8.0,3.0,2.0


## Exercise 3

In [8]:
# This time df2's index (2-4) overlaps df1's (0-2) in exactly one label: 2.
df2 = pd.DataFrame(np.random.randint(0, 10, size=(3, 3)), columns=["D","E","F"], index=[2, 3, 4])

df2

Unnamed: 0,D,E,F
2,9,9,4
3,9,5,3
4,9,8,4


In [9]:
# Row 2 exists in both frames, so it is the only row with values in all six columns.
combined_df = pd.concat([df1, df2], axis=1)

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C
0,4,1,1
1,5,1,1
2,4,5,3

Unnamed: 0,D,E,F
2,9,9,4
3,9,5,3
4,9,8,4

Unnamed: 0,A,B,C,D,E,F
0,4.0,1.0,1.0,,,
1,5.0,1.0,1.0,,,
2,4.0,5.0,3.0,9.0,9.0,4.0
3,,,,9.0,5.0,3.0
4,,,,9.0,8.0,4.0


## Exercise 4

In [10]:
# join="inner" keeps only the index labels present in both frames (here just row 2).
combined_df = pd.concat([df1, df2], axis=1, join="inner")

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C
0,4,1,1
1,5,1,1
2,4,5,3

Unnamed: 0,D,E,F
2,9,9,4
3,9,5,3
4,9,8,4

Unnamed: 0,A,B,C,D,E,F
2,4,5,3,9,9,4


## Exercise 5

* The default value of the join parameter is outer.

In [11]:
# Spelling out join="outer" gives the same union-of-indexes result as omitting it.
combined_df = pd.concat([df1, df2], axis=1, join="outer")

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C
0,4,1,1
1,5,1,1
2,4,5,3

Unnamed: 0,D,E,F
2,9,9,4
3,9,5,3
4,9,8,4

Unnamed: 0,A,B,C,D,E,F
0,4.0,1.0,1.0,,,
1,5.0,1.0,1.0,,,
2,4.0,5.0,3.0,9.0,9.0,4.0
3,,,,9.0,5.0,3.0
4,,,,9.0,8.0,4.0


## Exercise 6

* Concatenate along rows (axis=0)
* The default value of the axis parameter is 0

In [12]:
# Without an axis argument the frames are stacked vertically.
# The column sets differ (A-C vs D-F), so the result has all six columns with NaN
# where a frame has no data, and index label 2 appears twice.
combined_df = pd.concat([df1, df2])

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C
0,4,1,1
1,5,1,1
2,4,5,3

Unnamed: 0,D,E,F
2,9,9,4
3,9,5,3
4,9,8,4

Unnamed: 0,A,B,C,D,E,F
0,4.0,1.0,1.0,,,
1,5.0,1.0,1.0,,,
2,4.0,5.0,3.0,,,
2,,,,9.0,9.0,4.0
3,,,,9.0,5.0,3.0
4,,,,9.0,8.0,4.0


In [13]:
# df3 holds df2's values under df1's column labels (A-C) ...
df3 = df2.rename(columns={"D":"A", "E":"B", "F":"C"})

# ... so stacking df1 and df3 lines the columns up and produces no NaN.
combined_df = pd.concat([df1, df3])

display_side_by_side(df1, df3, combined_df, titles=["df1", "df3", "combined_df"])

Unnamed: 0,A,B,C
0,4,1,1
1,5,1,1
2,4,5,3

Unnamed: 0,A,B,C
2,9,9,4
3,9,5,3
4,9,8,4

Unnamed: 0,A,B,C
0,4,1,1
1,5,1,1
2,4,5,3
2,9,9,4
3,9,5,3
4,9,8,4


## Exercise 7

In [14]:
# New df2 with columns B, C, D: B and C are shared with df1, D is not. Index is 2-4.
df2 = pd.DataFrame(np.random.randint(0, 10, size=(3, 3)), columns=["B","C","D"], index=[2, 3, 4])

df2

Unnamed: 0,B,C,D
2,6,1,7
3,8,7,1
4,5,7,7


In [15]:
# Stacking with the default outer join: the result has the union of columns (A-D).
# A is NaN for df2's rows and D is NaN for df1's rows.
combined_df = pd.concat([df1, df2])

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C
0,4,1,1
1,5,1,1
2,4,5,3

Unnamed: 0,B,C,D
2,6,1,7
3,8,7,1
4,5,7,7

Unnamed: 0,A,B,C,D
0,4.0,1,1,
1,5.0,1,1,
2,4.0,5,3,
2,,6,1,7.0
3,,8,7,1.0
4,,5,7,7.0


## Exercise 8

In [16]:
# When stacking rows, join="inner" applies to the columns:
# only B and C, which both frames have, are kept.
combined_df = pd.concat([df1, df2], join="inner")

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C
0,4,1,1
1,5,1,1
2,4,5,3

Unnamed: 0,B,C,D
2,6,1,7
3,8,7,1
4,5,7,7

Unnamed: 0,B,C
0,1,1
1,1,1
2,5,3
2,6,1
3,8,7
4,5,7


## Exercise 9

In [17]:
# Fresh 3x4 frames with identical columns (A-D) and the same default index (0-2).
df1 = pd.DataFrame(np.random.randint(0, 10, size=(3, 4)), columns=["A","B","C","D"])
df2 = pd.DataFrame(np.random.randint(0, 10, size=(3, 4)), columns=["A","B","C","D"])

In [18]:
# Each frame keeps its own index labels, so 0, 1, 2 appear twice in the result.
combined_df = pd.concat([df1, df2])

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C,D
0,7,1,0,9
1,0,8,6,4
2,8,4,7,4

Unnamed: 0,A,B,C,D
0,4,9,3,5
1,7,0,8,7
2,4,4,9,1

Unnamed: 0,A,B,C,D
0,7,1,0,9
1,0,8,6,4
2,8,4,7,4
0,4,9,3,5
1,7,0,8,7
2,4,4,9,1


In [19]:
# reset_index(drop=True) replaces the repeated labels with a fresh 0-5 index;
# the old labels are not kept as a column.
combined_df = pd.concat([df1, df2]).reset_index(drop=True)

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C,D
0,7,1,0,9
1,0,8,6,4
2,8,4,7,4

Unnamed: 0,A,B,C,D
0,4,9,3,5
1,7,0,8,7
2,4,4,9,1

Unnamed: 0,A,B,C,D
0,7,1,0,9
1,0,8,6,4
2,8,4,7,4
3,4,9,3,5
4,7,0,8,7
5,4,4,9,1


## Exercise 10

In [20]:
# ignore_index=True produces the same 0-5 index directly, without a separate reset_index call.
combined_df = pd.concat([df1, df2], ignore_index=True)

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C,D
0,7,1,0,9
1,0,8,6,4
2,8,4,7,4

Unnamed: 0,A,B,C,D
0,4,9,3,5
1,7,0,8,7
2,4,4,9,1

Unnamed: 0,A,B,C,D
0,7,1,0,9
1,0,8,6,4
2,8,4,7,4
3,4,9,3,5
4,7,0,8,7
5,4,4,9,1


## Exercise 11

In [21]:
# A third frame with the same columns, used next to concatenate more than two inputs.
df3 = pd.DataFrame(np.random.randint(0, 10, size=(3, 4)), columns=["A","B","C","D"])

df3

Unnamed: 0,A,B,C,D
0,2,6,8,5
1,4,9,8,4
2,0,2,9,2


In [22]:
# All three frames are stacked in list order and renumbered 0-8.
combined_df = pd.concat([df1, df2, df3], ignore_index=True)

display_side_by_side(df1, df2, df3, combined_df, titles=["df1", "df2", "df3", "combined_df"])

Unnamed: 0,A,B,C,D
0,7,1,0,9
1,0,8,6,4
2,8,4,7,4

Unnamed: 0,A,B,C,D
0,4,9,3,5
1,7,0,8,7
2,4,4,9,1

Unnamed: 0,A,B,C,D
0,2,6,8,5
1,4,9,8,4
2,0,2,9,2

Unnamed: 0,A,B,C,D
0,7,1,0,9
1,0,8,6,4
2,8,4,7,4
3,4,9,3,5
4,7,0,8,7
5,4,4,9,1
6,2,6,8,5
7,4,9,8,4
8,0,2,9,2


## Exercise 12

* We can label the pieces so that we know which rows came from which DataFrame: pass one label per DataFrame, in the same order as the list, via the `keys` parameter.

In [23]:
# keys adds an outer index level that names the source frame of each row.
combined_df = pd.concat([df1, df2], keys=["df1", "df2"])

In [24]:
# Each row is now identified by (key, original index label).
combined_df

Unnamed: 0,Unnamed: 1,A,B,C,D
df1,0,7,1,0,9
df1,1,0,8,6,4
df1,2,8,4,7,4
df2,0,4,9,3,5
df2,1,7,0,8,7
df2,2,4,4,9,1


## Exercise 13

* We can select each part using the `loc` indexer.


In [25]:
# Selecting by the outer key returns the rows that came from df1, with their original 0-2 index.
combined_df.loc["df1"]

Unnamed: 0,A,B,C,D
0,7,1,0,9
1,0,8,6,4
2,8,4,7,4


In [26]:
# Same selection for the rows that came from df2.
combined_df.loc["df2"]

Unnamed: 0,A,B,C,D
0,4,9,3,5
1,7,0,8,7
2,4,4,9,1


## Exercise 14

In [27]:
# Reminder of the two-level index before selecting a single row.
combined_df

Unnamed: 0,Unnamed: 1,A,B,C,D
df1,0,7,1,0,9
df1,1,0,8,6,4
df1,2,8,4,7,4
df2,0,4,9,3,5
df2,1,7,0,8,7
df2,2,4,4,9,1


In [28]:
# A (key, label) tuple picks exactly one row; it comes back as a Series named (df1, 1).
combined_df.loc[("df1", 1)]

A    0
B    8
C    6
D    4
Name: (df1, 1), dtype: int64

## Exercise 15

* Keys won't work when ignore_index is True

In [29]:
# ignore_index=True discards the keys: the result has a plain 0-5 index and no df1/df2 level.
combined_df = pd.concat([df1, df2], keys=["df1", "df2"], ignore_index=True)

combined_df

Unnamed: 0,A,B,C,D
0,7,1,0,9
1,0,8,6,4
2,8,4,7,4
3,4,9,3,5
4,7,0,8,7
5,4,4,9,1


## Exercise 16

In [30]:
# df1: 5x6 random integers; rows 2 and 4 are then set to NaN in columns B, D and F (positions 1, 3, 5).
df1 = pd.DataFrame(np.random.randint(0, 10, size=(5, 6)), columns=list("ABCDEF"))
df1.iloc[[2, 4], [1, 3, 5]] = np.nan

# df2: same shape and labels, no missing values.
df2 = pd.DataFrame(np.random.randint(0, 10, size=(5, 6)), columns=list("ABCDEF"))

display_side_by_side(df1, df2, titles=["df1", "df2"])

Unnamed: 0,A,B,C,D,E,F
0,2,6.0,0,5.0,8,0.0
1,6,9.0,4,3.0,9,0.0
2,6,,2,,9,
3,8,4.0,8,5.0,6,2.0
4,8,,9,,9,

Unnamed: 0,A,B,C,D,E,F
0,8,7,2,3,5,4
1,3,7,1,8,0,8
2,3,3,3,9,4,1
3,2,2,5,7,3,1
4,9,8,6,7,8,9


* The `combine_first` function updates NaN values with the values in the same position of the other DataFrame.

In [31]:
# df1's values are kept wherever they exist; only its NaN cells
# (rows 2 and 4 of B, D, F) are taken from df2.
combined_df = df1.combine_first(df2)

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C,D,E,F
0,2,6.0,0,5.0,8,0.0
1,6,9.0,4,3.0,9,0.0
2,6,,2,,9,
3,8,4.0,8,5.0,6,2.0
4,8,,9,,9,

Unnamed: 0,A,B,C,D,E,F
0,8,7,2,3,5,4
1,3,7,1,8,0,8
2,3,3,3,9,4,1
3,2,2,5,7,3,1
4,9,8,6,7,8,9

Unnamed: 0,A,B,C,D,E,F
0,2,6.0,0,5.0,8,0.0
1,6,9.0,4,3.0,9,0.0
2,6,3.0,2,9.0,9,1.0
3,8,4.0,8,5.0,6,2.0
4,8,8.0,9,7.0,9,9.0


## Exercise 17

In [32]:
# df1: 5 rows, with NaN in rows 2 and 4 of columns B, D and F.
df1 = pd.DataFrame(np.random.randint(0, 10, size=(5, 6)), columns=list("ABCDEF"))
df1.iloc[[2, 4], [1, 3, 5]] = np.nan

# df2 has only 4 rows (index 0-3), so df1's row 4 has no counterpart in it.
df2 = pd.DataFrame(np.random.randint(0, 10, size=(4, 6)), columns=list("ABCDEF"))

display_side_by_side(df1, df2, titles=["df1", "df2"])

Unnamed: 0,A,B,C,D,E,F
0,7,5.0,3,1.0,3,2.0
1,0,4.0,4,0.0,7,5.0
2,0,,7,,8,
3,9,7.0,1,9.0,9,5.0
4,7,,6,,3,

Unnamed: 0,A,B,C,D,E,F
0,8,3,7,5,5,4
1,6,2,7,9,3,1
2,9,4,6,9,9,5
3,7,3,3,8,4,9


In [33]:
# Row 2's gaps are filled from df2, but row 4 stays NaN because df2 has no row 4.
combined_df = df1.combine_first(df2)

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C,D,E,F
0,7,5.0,3,1.0,3,2.0
1,0,4.0,4,0.0,7,5.0
2,0,,7,,8,
3,9,7.0,1,9.0,9,5.0
4,7,,6,,3,

Unnamed: 0,A,B,C,D,E,F
0,8,3,7,5,5,4
1,6,2,7,9,3,1
2,9,4,6,9,9,5
3,7,3,3,8,4,9

Unnamed: 0,A,B,C,D,E,F
0,7,5.0,3,1.0,3,2.0
1,0,4.0,4,0.0,7,5.0
2,0,4.0,7,9.0,8,5.0
3,9,7.0,1,9.0,9,5.0
4,7,,6,,3,


## Exercise 18

In [34]:
# Two complete 5x6 frames (no missing values) with the same row and column labels.
df1 = pd.DataFrame(np.random.randint(0, 10, size=(5, 6)), columns=list("ABCDEF"))
df2 = pd.DataFrame(np.random.randint(0, 10, size=(5, 6)), columns=list("ABCDEF"))

display_side_by_side(df1, df2, titles=["df1", "df2"])

Unnamed: 0,A,B,C,D,E,F
0,8,4,9,5,1,4
1,2,2,6,1,4,2
2,1,2,5,2,1,5
3,0,1,8,9,3,0
4,1,6,7,3,5,1

Unnamed: 0,A,B,C,D,E,F
0,1,0,6,9,0,2
1,8,7,7,0,5,1
2,0,4,5,0,7,4
3,5,4,6,6,3,6
4,8,6,2,7,8,5


* The `combine` method merges two DataFrames column by column using the given function; with `np.minimum`, the result holds the smaller of the two values at each position.

In [35]:
# Keep the smaller of the two values at every position.
combined_df = df1.combine(df2, np.minimum)

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C,D,E,F
0,8,4,9,5,1,4
1,2,2,6,1,4,2
2,1,2,5,2,1,5
3,0,1,8,9,3,0
4,1,6,7,3,5,1

Unnamed: 0,A,B,C,D,E,F
0,1,0,6,9,0,2
1,8,7,7,0,5,1
2,0,4,5,0,7,4
3,5,4,6,6,3,6
4,8,6,2,7,8,5

Unnamed: 0,A,B,C,D,E,F
0,1,0,6,5,0,2
1,2,2,6,0,4,1
2,0,2,5,0,1,4
3,0,1,6,6,3,0
4,1,6,2,3,5,1


## Exercise 19

In [36]:
# df1: 5x6 random integers with NaN in rows 2 and 4 of columns B, D and F.
df1 = pd.DataFrame(np.random.randint(0, 10, size=(5, 6)), columns=list("ABCDEF"))
df1.iloc[[2, 4], [1, 3, 5]] = np.nan

# df2: same shape and labels, no missing values.
df2 = pd.DataFrame(np.random.randint(0, 10, size=(5, 6)), columns=list("ABCDEF"))

display_side_by_side(df1, df2, titles=["df1", "df2"])

Unnamed: 0,A,B,C,D,E,F
0,0,3.0,2,6.0,1,3.0
1,6,4.0,9,1.0,8,5.0
2,0,,3,,8,
3,7,4.0,9,4.0,2,1.0
4,5,,3,,8,

Unnamed: 0,A,B,C,D,E,F
0,3,6,3,8,0,4
1,0,5,5,5,5,3
2,9,7,2,9,6,0
3,5,3,2,9,2,3
4,9,9,4,1,1,6


* If one of the two values is NaN (i.e. a missing value), the combined DataFrame has NaN at that position as well, because the minimum of a number and a missing value is itself missing.

In [37]:
# Positions where df1 is NaN (rows 2 and 4 of B, D, F) come out as NaN in the result.
combined_df = df1.combine(df2, np.minimum)

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C,D,E,F
0,0,3.0,2,6.0,1,3.0
1,6,4.0,9,1.0,8,5.0
2,0,,3,,8,
3,7,4.0,9,4.0,2,1.0
4,5,,3,,8,

Unnamed: 0,A,B,C,D,E,F
0,3,6,3,8,0,4
1,0,5,5,5,5,3
2,9,7,2,9,6,0
3,5,3,2,9,2,3
4,9,9,4,1,1,6

Unnamed: 0,A,B,C,D,E,F
0,0,3.0,2,6.0,0,3.0
1,0,4.0,5,1.0,5,3.0
2,0,,2,,6,
3,5,3.0,2,4.0,2,1.0
4,5,,3,,1,


* We can choose a constant value to be used in the case of missing values by using the fill_value parameter. Missing values are filled with this value before comparing them to the values in the other DataFrame.

In [38]:
# df1's NaNs are treated as 3 before the minimum is taken,
# e.g. row 2, column F: min(3, 0) = 0; row 2, column B: min(3, 7) = 3.
combined_df = df1.combine(df2, np.minimum, fill_value=3)

display_side_by_side(df1, df2, combined_df, titles=["df1", "df2", "combined_df"])

Unnamed: 0,A,B,C,D,E,F
0,0,3.0,2,6.0,1,3.0
1,6,4.0,9,1.0,8,5.0
2,0,,3,,8,
3,7,4.0,9,4.0,2,1.0
4,5,,3,,8,

Unnamed: 0,A,B,C,D,E,F
0,3,6,3,8,0,4
1,0,5,5,5,5,3
2,9,7,2,9,6,0
3,5,3,2,9,2,3
4,9,9,4,1,1,6

Unnamed: 0,A,B,C,D,E,F
0,0,3.0,2,6.0,0,3.0
1,0,4.0,5,1.0,5,3.0
2,0,3.0,2,3.0,6,0.0
3,5,3.0,2,4.0,2,1.0
4,5,3.0,3,1.0,1,3.0
