# Merge two dataframes using Pandas

In [1]:
import pandas as pd
import numpy as np

# Hierarchical Indexing
Hierarchical indexing is the use of two or more index levels on a single axis.

In [2]:
# Build a Series whose index has two levels: an outer letter key and an inner integer key.
srr = pd.Series(
    np.random.randn(8),
    index=pd.MultiIndex.from_arrays([
        list('aaabccdd'),
        [1, 2, 5, 6, 4, 8, 7, 3],
    ]),
)
srr

# A hierarchical index allows 'partial' indexing: selecting a subset of the data
# by naming only the outer-level key(s).

srr[['d', 'c']]       # every row whose outer key is 'd' or 'c'
srr['a':'b']          # label slice over the outer level
srr.loc[['d', 'c']]   # same selection as srr[['d', 'c']], written with .loc


# unstack() pivots the inner index level into columns, turning the Series into a DataFrame
df = srr.unstack()
# stack() is the reverse operation: it folds the columns back into an inner index level
df.stack()



a  1    0.045634
   2   -1.427980
   5   -1.709887
b  6    0.794658
c  4   -0.404283
   8    0.576762
d  3   -0.514015
   7   -0.233811
dtype: float64

In [3]:
# In a DataFrame, either axis (rows or columns) can carry a hierarchical index.
df = pd.DataFrame(np.arange(8).reshape(4, 2),
                  index=[['a', 'a', 'b', 'c'], [1, 2, 2, 4]],
                  columns=[['Ohio', 'California'], ['Green', 'Red']])
# Assigning to 'index.names' / 'columns.names' modifies the DataFrame object itself
# (no copy is returned), similar in effect to an 'inplace=True' argument.
df.index.names = ['Key1', 'Key2']
df.columns.names = ['States', 'Colours']

# 'swaplevel' takes two levels of an axis and exchanges them.
df.swaplevel('Key1', 'Key2')  # swapping alone leaves the row order unaltered, hence
# 'sort_index()' is used afterwards. Its 'level' parameter selects which index level
# the rows are sorted by.
df.swaplevel('Key1', 'Key2').sort_index(level=0)   # level=1 would be the second level

# 'swaplevel' also accepts level numbers instead of level names.
print(df)

# Aggregating by index level: 'DataFrame.sum(level=...)' was removed in pandas 2.0,
# so group on the level and sum each group instead.
df.groupby(level=1).sum()   # adds up the rows that share the same 'Key2' value

# To aggregate by a level of the columns axis, transpose so the column levels become
# row levels, group and sum, then transpose back.
df.T.groupby(level='Colours').sum().T

States     Ohio California
Colours   Green        Red
Key1 Key2                 
a    1        0          1
     2        2          3
b    2        4          5
c    4        6          7


Unnamed: 0_level_0,Colours,Green,Red
Key1,Key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,2,3
b,2,4,5
c,4,6,7


# Merging on Database

*pandas.merge* combines the rows of two DataFrames by matching them on one or more keys. It is the equivalent of the SQL join operation.

*pandas.concat* is used to stack objects together on an axis

In [4]:
# df1 repeats its keys several times; df2 lists each of its keys exactly once.
df1 = pd.DataFrame({'key': list('bbacaab'), 'data1': range(7)})
df2 = pd.DataFrame({'key': list('abd'), 'data2': range(3)})
print(df2)
df1.merge(df2)  # note that keys 'a' and 'b' occur in both tables

# With no key given, pandas merges on the overlapping column names. It is good
# practice to state the key explicitly, hence:

df1.merge(df2, on='key')  # 'on' names the column to merge on

# If the key columns are named differently in the two tables, name each side separately:
# pd.merge(df1, df2, left_on = 'lkey', right_on = 'rkey')


# By default 'merge' keeps the intersection of the keys of the two tables.
# To keep the union of the keys instead:

df1.merge(df2, how='outer')

# The 'how' argument accepts 'inner' (default), 'left' (use all key combinations of the
# left table), 'right' (use all key combinations of the right table) and 'outer'
# (union of the keys of both tables).

# The merges above are what SQL calls a MANY-TO-ONE join.


  key  data2
0   a      0
1   b      1
2   d      2


Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


# Data Combination Method (Concatenating along an Axis)

* NumPy's 'concatenate' function does this for NumPy arrays

In [5]:
arr = np.arange(20).reshape(5, 4)
np.concatenate((arr, arr), axis=1)   # glue the array to itself side by side

# For Series and DataFrame objects the 'concat' function is used instead. Given three
# Series with no index labels in common, all of their elements are simply stacked
# one after another.

s1 = pd.Series([0, 1], index=list('ab'))
s2 = pd.Series([2, 3, 4], index=list('cde'))
s3 = pd.Series([5, 6], index=list('fg'))

pd.concat([s1, s2, s3])  # the objects to combine are passed as one iterable

# By default 'concat' works along axis 0 and produces another Series.
# Passing axis=1 explicitly produces a DataFrame instead.

pd.concat([s1, s2, s3], axis=1).transpose()

Unnamed: 0,a,b,c,d,e,f,g
0,0.0,1.0,,,,,
1,,,2.0,3.0,4.0,,
2,,,,,,5.0,6.0


# Coursera: More Data Processing on Pandas

In [6]:
# Staff table: one 'Age' per person, indexed by 'Name'.
stff_df = pd.DataFrame({
    'Name': ['Kelly', 'Boston', 'James'],
    'Age': [30, 45, 60],
}).set_index('Name')
# Student table: one 'School' per person, indexed by 'Name'.
stu_df = pd.DataFrame({
    'Name': ['James', 'Mike', 'Boston'],
    'School': ['Business', 'Law', 'Engg'],
}).set_index('Name')

print(stff_df)
print(stu_df)

# dataframe_obj.set_index() and dataframe_obj.index.name do different things:
# - set_index('Name') makes the column called 'Name' the index (the column must exist);
# - index.name only gives a new name to the existing index (any name may be used).


        Age
Name       
Kelly    30
Boston   45
James    60
          School
Name            
James   Business
Mike         Law
Boston      Engg


In [7]:
# 'left_index' and 'right_index' tell merge to match rows on the index values
# of the left and right tables rather than on a column.
stff_df.merge(stu_df, how='outer', left_index=True, right_index=True)

# Without 'how', the default join is used, which keeps only the intersection
# of the two indexes.
stff_df.merge(stu_df, left_index=True, right_index=True)

Unnamed: 0_level_0,Age,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Boston,45,Engg
James,60,Business


In [8]:
# To join the two tables relative to one of them, use how='left': every row of the
# left table is returned, and where the right table has a matching entry its values
# are added to that row. how='right' does the same relative to the right table.

stff_df.merge(stu_df, how='left', left_index=True, right_index=True)

Unnamed: 0_level_0,Age,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kelly,30,
Boston,45,Engg
James,60,Business


In [9]:
stff_df.merge(stu_df, how='right', left_index=True, right_index=True)

# The 'on' argument can be used in place of left_index=True and right_index=True.
# 'on' takes the name that is common to both tables ('Name' in this case), hence
stff_df.merge(stu_df, how='right', on='Name')  # will yield the same results

Unnamed: 0_level_0,Age,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Boston,45.0,Engg
James,60.0,Business
Mike,,Law


### In case of conflict of column values
That is, both tables may contain a column with the same name but different values for the same row. In this case the pandas merge function keeps both columns and appends the suffixes '_x' and '_y' to their names to indicate the data from the left and the right table respectively.

In [10]:
# Example: the same two tables, but the second table now has an 'Age' column holding
# string values instead of a 'School' column.

stff_df = pd.DataFrame({
    'Name': ['Kelly', 'Boston', 'James'],
    'Age': [30, 45, 60],
})

stu_df = pd.DataFrame({
    'Name': ['James', 'Mike', 'Boston'],
    'Age': ['44 years', '25 years', '24 years'],
})

stff_df.merge(stu_df, how='left', on='Name')

# In the result, 'Age_x' holds the 'Age' column of the left table and 'Age_y' the
# 'Age' column of the right table: both tables have a non-key column named 'Age',
# so the two are kept apart with suffixes.



Unnamed: 0,Name,Age_x,Age_y
0,Kelly,30,
1,Boston,45,24 years
2,James,60,44 years


## Multi-indexing and multiple columns
This is done by passing a list of the columns to be matched in the 'on' parameter.
Every column name passed in the 'on' parameter has to exist in both tables.
For example, in the two tables above, the first names of a staff member and a student might match while their last names do not; in such cases multi-indexing and multiple columns are used.

If we add another column named 'Last Name', we have to pass a list of the column names in the 'on' parameter.

*pd.merge(stff_df, stu_df, how = 'inner', on = \['First Name', 'Last Name'\])*

# CONCATENATING
### Merging of the tables is done horizontally, while concatenating of the tables is done vertically