In [1]:
import pandas as pd
import numpy as np


In [4]:
"""
    numpy.concatenate()
"""
x, y, z = [1, 2, 3], [4, 5, 6], [7, 8, 9]

# concatenate(): the three lists are joined end-to-end into one flat array
print(np.concatenate([x, y, z]), end=' \n\n')
# vstack(): each list becomes one row of a 2-D array
print(np.vstack([x, y, z]), end=' \n\n')

# Rebind x to a 2-D (nested-list) input
x = [[1, 2],
     [3, 4]]

# axis=0 joins along rows, axis=1 along columns;
# with axis=1 the two copies of x end up side by side.
np.concatenate([x, x], axis=1)


[1 2 3 4 5 6 7 8 9] 

[[1 2 3]
 [4 5 6]
 [7 8 9]] 



array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [5]:
"""
    pandas.concat()
"""

# Two Series whose integer labels do not overlap
ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 5, 6])

# Stack them end-to-end; every value keeps its original label
pd.concat(objs=[ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [7]:
# make_df(): helper that builds a small, self-describing DataFrame

def make_df(cols, ind):
    """Build a DataFrame whose cells spell out their own column and row label.

    Parameters
    ----------
    cols : iterable
        Column labels; iterating a string such as 'ABC' yields one
        column per character.
    ind : iterable
        Row labels. It is iterated once per column and then reused as
        the index, so it must be re-iterable (e.g. a list).

    Returns
    -------
    pandas.DataFrame
        The cell at (row i, column c) holds ``str(c) + str(i)``.
    """
    cells_by_column = {}
    for col in cols:
        cells_by_column[col] = [str(col) + str(row) for row in ind]

    return pd.DataFrame(cells_by_column, index=ind)

# Demonstration: columns A, B, C over the row labels 1 and 2
print(make_df(cols='ABC', ind=[1, 2]))

    A   B   C
1  A1  B1  C1
2  A2  B2  C2


In [8]:
# Concatenate two DataFrames that share columns (A, B) but have different row labels

df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])

print(df1, end=' \n\n')
print(df2, end=' \n\n')

# axis=1 is requested explicitly here (the default is axis=0): the frames are
# placed side by side and aligned on their row labels, so every label that is
# missing from one of the frames is filled with NaN.
print(pd.concat(objs=[df1, df2], axis=1))

    A   B
1  A1  B1
2  A2  B2 

    A   B
3  A3  B3
4  A4  B4 

     A    B    A    B
1   A1   B1  NaN  NaN
2   A2   B2  NaN  NaN
3  NaN  NaN   A3   B3
4  NaN  NaN   A4   B4


In [9]:
# Concatenate along axis=1 (column-wise). df3 and df4 carry the same row
# labels (0 and 1), so their columns line up side by side without NaN padding.

df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])

print(df3, end=' \n\n')
print(df4, end=' \n\n')

print(pd.concat(objs=[df3, df4], axis=1))

    A   B
0  A0  B0
1  A1  B1 

    C   D
0  C0  D0
1  C1  D1 

    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1


In [10]:

"""
    concat() features
"""

# Case 1: duplicate indices -- concat() keeps row labels as they are, even when they repeat

x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])

# Give y the same row labels as x so the two inputs overlap on purpose
y.index = x.index

print(x, end=' \n\n')
print(y, end=' \n\n')

# Row-wise stacking: labels 0 and 1 each show up twice in the result
print(pd.concat(objs=[x, y]))

    A   B
0  A0  B0
1  A1  B1 

    A   B
0  A2  B2
1  A3  B3 

    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3


In [11]:
# Case 2: treat repeated labels as an error.
# With verify_integrity=True, concat() raises ValueError when the inputs'
# indexes overlap; the exception is caught so its message can be shown.

try:
    pd.concat(objs=[x, y], verify_integrity=True)
except ValueError as err:
    print("ValueError: ", err)

ValueError:  Indexes have overlapping values: Int64Index([0, 1], dtype='int64')


In [12]:
# Case 3: discard the incoming row labels.
# ignore_index=True makes concat() number the result rows 0..n-1 instead.

pd.concat(objs=[x, y], ignore_index=True)


Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [13]:

# Case 4: label each input with a key (hierarchical indexing).
# The keys 'x' and 'y' become the outer index level; the original
# row labels stay as the inner level.

pd.concat(objs=[x, y], keys=['x', 'y'])


Unnamed: 0,Unnamed: 1,A,B
x,0,A0,B0
x,1,A1,B1
y,0,A2,B2
y,1,A3,B3


In [14]:
"""
    concat(): join options
"""

# Two frames whose column sets only partly overlap (B and C are shared)
x = make_df('ABC', [1, 2])
y = make_df('BCD', [3, 4])

print(x, end=' \n\n')
print(y, end=' \n\n')

# join='outer' (the default) keeps the union of the columns: A, B, C, D.
# Cells a frame has no value for are filled with NaN.
print(pd.concat(objs=[x, y], join='outer'))

    A   B   C
1  A1  B1  C1
2  A2  B2  C2 

    B   C   D
3  B3  C3  D3
4  B4  C4  D4 

     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4


In [18]:
# join='inner' keeps only the columns present in every input (the intersection)

df5 = make_df('AB', [0, 1])
df6 = make_df('CD', [0, 1])

print(df5, end=' \n\n')
print(df6, end=' \n\n')

# df5 (A, B) and df6 (C, D) have no column in common, so the result has
# no columns at all -- only the stacked row labels remain.
print(pd.concat(objs=[df5, df6], join='inner'))



    A   B
0  A0  B0
1  A1  B1 

    C   D
0  C0  D0
1  C1  D1 

Empty DataFrame
Columns: []
Index: [0, 1, 0, 1]


In [19]:
# append(): stack df2 below df1.
# DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0,
# so df1.append(df2) raises AttributeError on current pandas.
# pd.concat([df1, df2]) produces the same row-wise result on old and new
# versions alike and, like append(), leaves df1 and df2 unchanged.

print(df1, end='\n\n')
print(df2, end='\n\n')
print(pd.concat([df1, df2]))


    A   B
1  A1  B1
2  A2  B2

    A   B
3  A3  B3
4  A4  B4

    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [20]:
"""
    Join operations
"""

# Build the two input frames; both carry an 'employee' column
df1 = pd.DataFrame({
    'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'group': ['Accounting', 'Engineering', 'Engineering', 'HR'],
})

df2 = pd.DataFrame({
    'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
    'hire_date': [2004, 2008, 2012, 2014],
})

print(df1, end=' \n\n')
print(df2)

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR 

  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014


In [21]:
# One-to-one join: no key is named, so pd.merge() matches the two frames on
# the column they have in common ('employee'); each employee appears once
# in each input.

df3 = pd.merge(left=df1, right=df2)

print(df3)


# Many-to-one join: df3 and df4 share the 'group' column. 'Engineering'
# occurs twice in df3, so its supervisor is repeated on both matching rows.

df4 = pd.DataFrame({
    'group': ['Accounting', 'Engineering', 'HR'],
    'supervisor': ['Carly', 'Guido', 'Steve'],
})

print(df3, end=' \n\n')
print(df4, end=' \n\n')

print(pd.merge(left=df3, right=df4))

  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014
  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014 

         group supervisor
0   Accounting      Carly
1  Engineering      Guido
2           HR      Steve 

  employee        group  hire_date supervisor
0      Bob   Accounting       2008      Carly
1     Jake  Engineering       2012      Guido
2     Lisa  Engineering       2004      Guido
3      Sue           HR       2014      Steve
