In [2]:
# %load command1.py
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one; several cells below rely on this to show more than one frame.
InteractiveShell.ast_node_interactivity='all'

# Render inline matplotlib figures as SVG.
%config InlineBackend.figure_format='svg'
# Default resolution (dots per inch) for newly created figures.
plt.rcParams['figure.dpi']=120

# Format floats with a thousands separator and two decimal places.
pd.options.display.float_format='{:,.2f}'.format
# Do not truncate the contents of wide columns when displaying frames.
pd.set_option('display.max_colwidth', None)


In [3]:
# Left-hand table for the merge examples: one row per customer (ids 1-4).
df_customer = pd.DataFrame(dict(
    id=[1, 2, 3, 4],
    name=['Tom', 'Jenny', 'James', 'Dan'],
))
df_customer

Unnamed: 0,id,name
0,1,Tom
1,2,Jenny
2,3,James
3,4,Dan


In [4]:
# Right-hand table: age and sex keyed by 'id' (ids 2-5).
df_info = pd.DataFrame(dict(
    id=[2, 3, 4, 5],
    age=[31, 20, 40, 70],
    sex=['F', 'M', 'M', 'F'],
))
df_info

Unnamed: 0,id,age,sex
0,2,31,F
1,3,20,M
2,4,40,M
3,5,70,F


In [7]:
# No key given: pandas joins on the columns both frames share (only 'id'
# here). The module-level function and the DataFrame method are equivalent.
pd.merge(left=df_customer, right=df_info)
df_customer.merge(right=df_info)

Unnamed: 0,id,name,age,sex
0,2,Jenny,31,F
1,3,James,20,M
2,4,Dan,40,M


Unnamed: 0,id,name,age,sex
0,2,Jenny,31,F
1,3,James,20,M
2,4,Dan,40,M


In [8]:
# Specifying the key column explicitly with the `on` argument.

pd.merge(left=df_customer, right=df_info, on='id')

Unnamed: 0,id,name,age,sex
0,2,Jenny,31,F
1,3,James,20,M
2,4,Dan,40,M


In [9]:
# Joining on several key columns at once.
df_order = pd.DataFrame(dict(
    id=[2, 3, 4, 5],
    name=['Jenny', 'James', 'Dan', 'leo'],
    quantity=[2, 4, 6, 10],
))


# A row is kept only when both 'id' and 'name' match on the two sides.
pd.merge(left=df_customer, right=df_order, on=['id', 'name'])

Unnamed: 0,id,name,quantity
0,2,Jenny,2
1,3,James,4
2,4,Dan,6


In [10]:
# Set-up for the left_on / right_on example: the same attributes as before,
# but the key column is called 'customer_id' instead of 'id'.
df_info_2 = pd.DataFrame(dict(
    customer_id=[2, 3, 4, 5],
    age=[31, 20, 40, 70],
    sex=['F', 'M', 'M', 'F'],
))
df_info_2

Unnamed: 0,customer_id,age,sex
0,2,31,F
1,3,20,M
2,4,40,M
3,5,70,F


In [11]:
# Show the left-hand frame again: its key column is named 'id'.
df_customer

Unnamed: 0,id,name
0,1,Tom
1,2,Jenny
2,3,James
3,4,Dan


In [12]:
# The key is named differently on each side, so give one name per frame.
# Both key columns ('id' and 'customer_id') appear in the result.
pd.merge(
    left=df_customer,
    right=df_info_2,
    left_on='id',
    right_on='customer_id',
)

Unnamed: 0,id,name,customer_id,age,sex
0,2,Jenny,2,31,F
1,3,James,3,20,M
2,4,Dan,4,40,M


In [15]:
# Set-up for the `how` (join type) examples: ids 2-4 exist in both frames,
# id 1 only in df_customer and id 5 only in df_info.

df_customer = pd.DataFrame(dict(
    id=[1, 2, 3, 4],
    name=['Tom', 'Jenny', 'James', 'Dan'],
))

df_customer

df_info = pd.DataFrame(dict(
    id=[2, 3, 4, 5],
    age=[31, 20, 40, 70],
    sex=['F', 'M', 'M', 'F'],
))

df_info

Unnamed: 0,id,name
0,1,Tom
1,2,Jenny
2,3,James
3,4,Dan


Unnamed: 0,id,age,sex
0,2,31,F
1,3,20,M
2,4,40,M
3,5,70,F


In [19]:
# inner (the default when `how` is omitted): only ids present in both frames
df_customer.merge(df_info, on='id', how='inner')
# left: every row of df_customer, with missing values where df_info has no match
df_customer.merge(df_info, on='id', how='left')
# right: every row of df_info, with missing values where df_customer has no match
df_customer.merge(df_info, on='id', how='right')
# outer: every id that appears in either frame
df_customer.merge(df_info, on='id', how='outer')

Unnamed: 0,id,name,age,sex
0,2,Jenny,31,F
1,3,James,20,M
2,4,Dan,40,M


Unnamed: 0,id,name,age,sex
0,1,Tom,,
1,2,Jenny,31.0,F
2,3,James,20.0,M
3,4,Dan,40.0,M


Unnamed: 0,id,name,age,sex
0,2,Jenny,31,F
1,3,James,20,M
2,4,Dan,40,M
3,5,,70,F


Unnamed: 0,id,name,age,sex
0,1,Tom,,
1,2,Jenny,31.0,F
2,3,James,20.0,M
3,4,Dan,40.0,M
4,5,,70.0,F


In [22]:
# Set-up for the `validate` examples: df_order_2 holds several orders per
# customer (ids 2 and 4 each appear twice), one order per consecutive day.
df_customer = pd.DataFrame(dict(
    id=[1, 2, 3, 4],
    name=['Tom', 'Jenny', 'James', 'Dan'],
))
df_customer

df_order_2 = pd.DataFrame(dict(
    id=[2, 2, 4, 4],
    product=['A', 'B', 'A', 'C'],
    quantity=[31, 21, 20, 40],
    date=pd.date_range(start='2019-02-24', periods=4, freq='D'),
))
df_order_2

Unnamed: 0,id,name
0,1,Tom
1,2,Jenny
2,3,James
3,4,Dan


Unnamed: 0,id,product,quantity,date
0,2,A,31,2019-02-24
1,2,B,21,2019-02-25
2,4,A,20,2019-02-26
3,4,C,40,2019-02-27


In [23]:
# Left join against a frame with repeated keys: each customer row is repeated
# once per matching order, and customers without orders get missing values.
df_customer.merge(df_order_2, on='id', how='left')

Unnamed: 0,id,name,product,quantity,date
0,1,Tom,,,NaT
1,2,Jenny,A,31.0,2019-02-24
2,2,Jenny,B,21.0,2019-02-25
3,3,James,,,NaT
4,4,Dan,A,20.0,2019-02-26
5,4,Dan,C,40.0,2019-02-27


In [25]:
df_customer = pd.DataFrame(dict(
    id=[1, 2, 3, 4],
    name=['Tom', 'Jenny', 'James', 'Dan'],
))


# NOTE: df_info is rebound here with different contents than before; id 2
# now appears twice, which is the "invalid record" this example is about.
df_info = pd.DataFrame(dict(
    id=[2, 2, 3, 4, 5],
    age=[31, 21, 20, 40, 70],
    sex=['F', 'F', 'M', 'M', 'F'],
))

# Without validation the duplicated key silently yields two rows for Jenny.
df_customer.merge(df_info, on='id', how='left')

Unnamed: 0,id,name,age,sex
0,1,Tom,,
1,2,Jenny,31.0,F
2,2,Jenny,21.0,F
3,3,James,20.0,M
4,4,Dan,40.0,M


In [26]:
# validate='1:1' makes pandas check that the merge key is unique in both
# frames. df_info contains id 2 twice, so the merge raises MergeError instead
# of silently duplicating rows.
# The error is the point of this cell, so it is caught and printed; an
# uncaught exception would abort "Restart Kernel -> Run All".
try:
    pd.merge(df_customer, df_info, how='left', on='id', validate='1:1')
except pd.errors.MergeError as exc:
    print(f'MergeError: {exc}')

MergeError: Merge keys are not unique in right dataset; not a one-to-one merge