In [1]:
import seaborn as sns
import pandas as pd
import numpy as np

# Pandas Combining DataFrames

In pandas there are 4 main ways (plus a few special cases) to combine data from different frames:

* Merging
* Joining
* Concatenating 
* Appending

Merging and joining are basically redundant, as are concatenating and appending (in fact, `DataFrame.append` was removed in pandas 2.0 in favor of `pd.concat`).

So today we will be going over Merging and Concatenating in pandas. 

Check out the full documentation [here](http://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html), but be warned it is a bit long :)


Okay let's get started.

In [2]:
# Load seaborn's example "tips" dataset as a DataFrame and preview the first three rows.
# NOTE: seaborn fetches example datasets from its online repository unless already cached.
tips = sns.load_dataset('tips')
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


## Merge

Merging is for doing complex column-wise combinations of dataframes in a SQL-like way. If you don't know SQL joins, then check out this resource on [SQL joins](https://www.w3schools.com/sql/sql_join.asp) and comment below.

To merge we need two dataframes, so let's make them below:

In [5]:
# Build the two frames to merge: per-(sex, smoker) group totals, one value column each.
# Selecting the single column with a list keeps the result a DataFrame and avoids
# aggregating both columns twice only to delete one of them in place afterwards.
tips_bill = tips.groupby(['sex', 'smoker'])[['total_bill']].sum()
tips_tip = tips.groupby(['sex', 'smoker'])[['tip']].sum()

In [6]:
# Summed total_bill per (sex, smoker) group; the group keys form the row MultiIndex.
tips_bill

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill
sex,smoker,Unnamed: 2_level_1
Male,Yes,1337.07
Male,No,1919.75
Female,Yes,593.27
Female,No,977.68


In [7]:
# Summed tip per (sex, smoker) group; same MultiIndex as tips_bill.
tips_tip

Unnamed: 0_level_0,Unnamed: 1_level_0,tip
sex,smoker,Unnamed: 2_level_1
Male,Yes,183.07
Male,No,302.0
Female,Yes,96.74
Female,No,149.77


Now that we have two datasets that we want to combine (aka take the tips and combine with the total bill), how do we do it? We merge!

In [8]:
# IPython help syntax: a trailing `?` shows the signature and docstring of pd.merge.
pd.merge?

Signature:
pd.merge(
    left: 'DataFrame | Series',
    right: 'DataFrame | Series',
    how: 'MergeHow' = 'inner',
    on: 'IndexLabel | None' = None,
    left_on: 'IndexLabel | None' = None,
    right_on: 'IndexLabel | None' = None,
    left_index: 'bool' = False,
    right_index: 'bool' = False,
    ...

Notice that there are a ton of options:

In [9]:
# Index-on-index merge: both frames carry the (sex, smoker) MultiIndex, so rows
# are matched by index label. `how` is left at its default, 'inner'.
pd.merge(
    left=tips_bill,
    right=tips_tip,
    left_index=True,
    right_index=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,1337.07,183.07
Male,No,1919.75,302.0
Female,Yes,593.27,96.74
Female,No,977.68,149.77


In [12]:
# Same index-on-index merge, but as an outer join: labels from either frame are kept.
# Both frames have identical index labels here, so the result matches the inner join.
pd.merge(
    left=tips_bill,
    right=tips_tip,
    how='outer',
    left_index=True,
    right_index=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,1337.07,183.07
Male,No,1919.75,302.0
Female,Yes,593.27,96.74
Female,No,977.68,149.77


In [13]:
# reset_index() returns a new frame with the (sex, smoker) index levels as ordinary columns.
tips_bill.reset_index()

Unnamed: 0,sex,smoker,total_bill
0,Male,Yes,1337.07
1,Male,No,1919.75
2,Female,Yes,593.27
3,Female,No,977.68


In [14]:
# Same for the tip totals: (sex, smoker) become ordinary columns next to `tip`.
tips_tip.reset_index()

Unnamed: 0,sex,smoker,tip
0,Male,Yes,183.07
1,Male,No,302.0
2,Female,Yes,96.74
3,Female,No,149.77


In [15]:
# Now it looks like SQL: with the group keys as ordinary columns, join on them by name.
pd.merge(
    left=tips_bill.reset_index(),
    right=tips_tip.reset_index(),
    on=['sex', 'smoker'],
)

Unnamed: 0,sex,smoker,total_bill,tip
0,Male,Yes,1337.07,183.07
1,Male,No,1919.75,302.0
2,Female,Yes,593.27,96.74
3,Female,No,977.68,149.77


In [16]:
# Left operand of the next merge: group keys held in the 'sex' and 'smoker' columns.
tips_bill.reset_index()

Unnamed: 0,sex,smoker,total_bill
0,Male,Yes,1337.07
1,Male,No,1919.75
2,Female,Yes,593.27
3,Female,No,977.68


In [17]:
# Right operand of the next merge: group keys still held in the (sex, smoker) index.
tips_tip

Unnamed: 0_level_0,Unnamed: 1_level_0,tip
sex,smoker,Unnamed: 2_level_1
Male,Yes,183.07
Male,No,302.0
Female,Yes,96.74
Female,No,149.77


In [18]:
# Mixed keys: the left frame's 'sex'/'smoker' columns are matched against the
# right frame's index.
pd.merge(
    left=tips_bill.reset_index(),
    right=tips_tip,
    right_index=True,
    left_on=['sex', 'smoker'],
)

Unnamed: 0,sex,smoker,total_bill,tip
0,Male,Yes,1337.07,183.07
1,Male,No,1919.75,302.0
2,Female,Yes,593.27,96.74
3,Female,No,977.68,149.77


In [19]:
# Move only the outer index level ('sex', i.e. level 0) into a column;
# 'smoker' stays behind as the index.
tips_bill_strange = tips_bill.reset_index(level='sex')
tips_bill_strange

Unnamed: 0_level_0,sex,total_bill
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,Male,1337.07
No,Male,1919.75
Yes,Female,593.27
No,Female,977.68


In [20]:
# The other operand for the next merge: both keys as ordinary columns.
tips_tip.reset_index()

Unnamed: 0,sex,smoker,tip
0,Male,Yes,183.07
1,Male,No,302.0
2,Female,Yes,96.74
3,Female,No,149.77


In [22]:
# `on` accepts column names and index level names alike: in tips_bill_strange
# 'sex' is a column while 'smoker' is the index, and both are used as join keys.
pd.merge(
    left=tips_tip.reset_index(),
    right=tips_bill_strange,
    on=['sex', 'smoker'],
)

Unnamed: 0,sex,smoker,tip,total_bill
0,Male,Yes,183.07,1337.07
1,Male,No,302.0,1919.75
2,Female,Yes,96.74,593.27
3,Female,No,149.77,977.68


In [24]:
# Left outer merge: every row of the left frame is kept; rows with no match in
# the right frame get NaN in the right frame's columns.
# Each operand is built once so that what is displayed is exactly what is merged.
bill_by_group = tips_bill.reset_index()
tip_first_two = tips_tip.reset_index().head(2)
display(bill_by_group, tip_first_two)
pd.merge(
    bill_by_group,
    tip_first_two,
    on=['sex', 'smoker'],  # explicit keys instead of relying on the shared-column default
    how='left',
)

Unnamed: 0,sex,smoker,total_bill
0,Male,Yes,1337.07
1,Male,No,1919.75
2,Female,Yes,593.27
3,Female,No,977.68


Unnamed: 0,sex,smoker,tip
0,Male,Yes,183.07
1,Male,No,302.0


Unnamed: 0,sex,smoker,total_bill,tip
0,Male,Yes,1337.07,183.07
1,Male,No,1919.75,302.0
2,Female,Yes,593.27,
3,Female,No,977.68,


In [25]:
# Inner merge: only key combinations present in BOTH frames survive.
# Each operand is built once so that what is displayed is exactly what is merged.
bill_by_group = tips_bill.reset_index()
tip_first_two = tips_tip.reset_index().head(2)
display(bill_by_group, tip_first_two)
pd.merge(
    bill_by_group,
    tip_first_two,
    on=['sex', 'smoker'],  # explicit keys instead of relying on the shared-column default
    how='inner',
)

Unnamed: 0,sex,smoker,total_bill
0,Male,Yes,1337.07
1,Male,No,1919.75
2,Female,Yes,593.27
3,Female,No,977.68


Unnamed: 0,sex,smoker,tip
0,Male,Yes,183.07
1,Male,No,302.0


Unnamed: 0,sex,smoker,total_bill,tip
0,Male,Yes,1337.07,183.07
1,Male,No,1919.75,302.0


In [27]:
# Outer merge: keys from either frame are kept, with NaN where one side has no match.
# indicator=True adds a `_merge` column saying whether each row came from
# 'left_only', 'right_only' or 'both'.
# Each operand is built once so that what is displayed is exactly what is merged.
bill_last_three = tips_bill.reset_index().tail(3)
tip_first_three = tips_tip.reset_index().head(3)
display(bill_last_three, tip_first_three)
pd.merge(
    bill_last_three,
    tip_first_three,
    on=['sex', 'smoker'],  # explicit keys instead of relying on the shared-column default
    how='outer',
    indicator=True,
)

Unnamed: 0,sex,smoker,total_bill
1,Male,No,1919.75
2,Female,Yes,593.27
3,Female,No,977.68


Unnamed: 0,sex,smoker,tip
0,Male,Yes,183.07
1,Male,No,302.0
2,Female,Yes,96.74


Unnamed: 0,sex,smoker,total_bill,tip,_merge
0,Male,No,1919.75,302.0,both
1,Female,Yes,593.27,96.74,both
2,Female,No,977.68,,left_only
3,Male,Yes,,183.07,right_only


This is one of the most complex parts of pandas - but it is very important to master. So please do check out the exercises below!

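One thing to be careful with here is merging on key columns that have different data types. Strings are not equal to ints (the string `'1'` does not match the integer `1`), so such keys will not line up!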

# Concatenation

Concatenating is for combining two or more dataframes, either column-wise or row-wise. The problem with concatenate is that the combinations it allows you to do are rather simplistic. That's why we need merge.

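Concatenate can take as many dataframes as you want, but it requires that they are constructed in a specific way. It aligns purely on labels, so the dataframes you pass in should share the same index (when combining column-wise) or the same columns (when combining row-wise); anything that does not line up is filled with NaN. So no more using columns as join keys.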

Let's check out basic use below:

In [28]:
# First input to the concat examples: total_bill indexed by (sex, smoker).
tips_bill

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill
sex,smoker,Unnamed: 2_level_1
Male,Yes,1337.07
Male,No,1919.75
Female,Yes,593.27
Female,No,977.68


In [29]:
# Second input to the concat examples: tip indexed by (sex, smoker).
tips_tip

Unnamed: 0_level_0,Unnamed: 1_level_0,tip
sex,smoker,Unnamed: 2_level_1
Male,Yes,183.07
Male,No,302.0
Female,Yes,96.74
Female,No,149.77


In [30]:
# Row-wise concatenation (axis=0, the default): the frames are stacked top to
# bottom, and a column that a frame lacks is filled with NaN for that frame's rows.
pd.concat(
    [tips_bill, tips_bill, tips_tip],
    axis=0,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,1337.07,
Male,No,1919.75,
Female,Yes,593.27,
Female,No,977.68,
Male,Yes,1337.07,
Male,No,1919.75,
Female,Yes,593.27,
Female,No,977.68,
Male,Yes,,183.07
Male,No,,302.0


In [31]:
# Column-wise concatenation: the frames are placed side by side, aligned on their
# shared (sex, smoker) index. Passing tips_bill twice yields two 'total_bill' columns.
pd.concat(
    [tips_bill, tips_bill, tips_tip],
    axis='columns',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,tip
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,Yes,1337.07,1337.07,183.07
Male,No,1919.75,1919.75,302.0
Female,Yes,593.27,593.27,96.74
Female,No,977.68,977.68,149.77


In [38]:
def _labelled_frame(start, stop):
    """Return a frame with columns A-D and rows start..stop-1, each cell being '<column><row label>'."""
    row_labels = list(range(start, stop))
    cells = {col: [f"{col}{row}" for row in row_labels] for col in "ABCD"}
    return pd.DataFrame(cells, index=row_labels)


# Three frames with identical columns but disjoint row labels (0-3, 4-7, 8-11).
df1 = _labelled_frame(0, 4)
df2 = _labelled_frame(4, 8)
df3 = _labelled_frame(8, 12)

frames = [df1, df2, df3]

# Column-wise concat aligns on the row index. No labels overlap, so each frame's
# block is NaN outside its own rows, and the column labels A-D repeat three times.
result = pd.concat(frames, axis=1)

result

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,,,,,,,,
1,A1,B1,C1,D1,,,,,,,,
2,A2,B2,C2,D2,,,,,,,,
3,A3,B3,C3,D3,,,,,,,,
4,,,,,A4,B4,C4,D4,,,,
5,,,,,A5,B5,C5,D5,,,,
6,,,,,A6,B6,C6,D6,,,,
7,,,,,A7,B7,C7,D7,,,,
8,,,,,,,,,A8,B8,C8,D8
9,,,,,,,,,A9,B9,C9,D9


In [36]:
# `result` carries three columns labelled 'A' (one per concatenated frame), so this
# label lookup returns every matching value rather than a single scalar.
# NOTE(review): the saved output below looks like it came from an earlier, different
# `result` (execution count 36 precedes 38) — re-run the notebook to refresh it.
result.loc[0, 'A']

0    A0
0    A4
Name: A, dtype: object

As you can see, there is not a ton of functionality to concat, but it is invaluable if you have more than two dataframes or you are looking to append the rows of one dataframe onto another.

## Conclusion

There are a couple of other ways to merge data, but they are pretty niche (and mainly for time series data).

They are:

* combine_first
* merge_ordered
* merge_asof
