In [1]:
import pandas as pd

In [2]:
# Small example frame: one record per animal. Records that have no 'e' key
# end up with NaN in column 'e'.
df4 = pd.DataFrame([
    {'b': 'mammal', 'c': 5.0, 'd': 'horse', 'e': 13},
    {'b': 'mammal', 'c': 1.4, 'd': 'cat', 'e': 11},
    {'b': 'mammal', 'c': 1.2, 'd': 'dog'},
    {'b': 'fish', 'c': 0.1, 'd': 'goldfish'},
    {'b': 'reptile', 'c': 2.3, 'd': 'snake', 'e': 9},
    {'b': 'fish', 'c': 1.8, 'd': 'koi', 'e': 10},
    {'b': 'fish', 'c': 0.8, 'd': 'carp'},
])

## GroupBy and Aggregation

In [3]:
df4

Unnamed: 0,b,c,d,e
0,mammal,5.0,horse,13.0
1,mammal,1.4,cat,11.0
2,mammal,1.2,dog,
3,fish,0.1,goldfish,
4,reptile,2.3,snake,9.0
5,fish,1.8,koi,10.0
6,fish,0.8,carp,


In [4]:
# Group the rows of df4 by the label in column 'b'; nothing is computed until
# an aggregation is requested on the resulting GroupBy object.
gb = df4.groupby(by='b')

In [5]:
gb

<pandas.core.groupby.DataFrameGroupBy object at 0x7fe7bb292d30>

In [6]:
# Per-group mean of the numeric columns only. The group still contains the
# string column 'd'; recent pandas versions raise a TypeError for it instead
# of silently dropping it, so the restriction is made explicit.
gb.mean(numeric_only=True)

Unnamed: 0_level_0,c,e
b,Unnamed: 1_level_1,Unnamed: 2_level_1
fish,0.9,10.0
mammal,2.533333,12.0
reptile,2.3,9.0


In [8]:
# Several aggregations at once: for each group, the minimum and the sum of
# column 'c'. The result has two-level column labels (column, aggregation).
gb.agg({'c': ['min', 'sum']})

Unnamed: 0_level_0,c,c
Unnamed: 0_level_1,min,sum
b,Unnamed: 1_level_2,Unnamed: 2_level_2
fish,0.1,2.7
mammal,1.2,7.6
reptile,2.3,2.3


In [9]:
def func(col):
    """Return the position-weighted sum of ``col``.

    Every element is multiplied by its 0-based position inside ``col``
    (positional, not label-based) and the products are added up:
    ``0*col[0] + 1*col[1] + 2*col[2] + ...``.

    An empty ``col`` yields ``0``; a NaN element makes the result NaN.
    """
    total = 0
    for position in range(len(col)):
        total = total + position * col.iloc[position]
    return total

In [10]:
# Apply the custom aggregation to the numeric columns only. Without the
# column selection, func would also be called on the string column 'd',
# which fails on current pandas versions instead of being skipped.
gb[['c', 'e']].agg(func)

Unnamed: 0_level_0,c,e
b,Unnamed: 1_level_1,Unnamed: 2_level_1
fish,3.4,
mammal,3.8,
reptile,0.0,0.0


In [11]:
df4

Unnamed: 0,b,c,d,e
0,mammal,5.0,horse,13.0
1,mammal,1.4,cat,11.0
2,mammal,1.2,dog,
3,fish,0.1,goldfish,
4,reptile,2.3,snake,9.0
5,fish,1.8,koi,10.0
6,fish,0.8,carp,


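`func` multiplies each value by its 0-based position within its group and sums the products. For column `c` this gives:

$\text{fish} = 0 \cdot 0.1 + 1 \cdot 1.8 + 2 \cdot 0.8 = 3.4$

$\text{reptile} = 0 \cdot 2.3 = 0$

$\text{mammal} = 0 \cdot 5.0 + 1 \cdot 1.4 + 2 \cdot 1.2 = 3.8$

Column `e` is `NaN` for fish and mammal because those groups contain missing values, which propagate through the sum.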

## Merge, Join, etc.

In [16]:
# URL of the CSV file holding the second example frame. It points at the
# repository's `master` branch, so loading it needs network access and the
# remote content may change over time.
fn2 = "https://raw.githubusercontent.com/sebwink/Integrated_Bioinformatics/master/example_data/example_df.csv"

In [17]:
# Load the second example frame; a comma is read_csv's default separator.
df2 = pd.read_csv(fn2)

In [18]:
df2

Unnamed: 0,a,b,c,d,e
0,2,3,5.0,horse,13.0
1,3,1,1.4,cat,11.0
2,4,9,1.2,dog,
3,5,12,0.8,tortoise,
4,6,2,2.3,rabbit,9.0


In [19]:
df4

Unnamed: 0,b,c,d,e
0,mammal,5.0,horse,13.0
1,mammal,1.4,cat,11.0
2,mammal,1.2,dog,
3,fish,0.1,goldfish,
4,reptile,2.3,snake,9.0
5,fish,1.8,koi,10.0
6,fish,0.8,carp,


In [20]:
# Inner join on the animal name 'd': only names present in both frames
# survive. The other shared columns are kept twice, with _x (df4) and
# _y (df2) suffixes.
df4.merge(df2, how='inner', on='d')

Unnamed: 0,b_x,c_x,d,e_x,a,b_y,c_y,e_y
0,mammal,5.0,horse,13.0,2,3,5.0,13.0
1,mammal,1.4,cat,11.0,3,1,1.4,11.0
2,mammal,1.2,dog,,4,9,1.2,


In [22]:
# Inner join on three key columns at once; key columns appear only once in
# the result, so only 'b' still needs the _x/_y suffixes.
merge_keys = ['d', 'c', 'e']
df4.merge(df2, how='inner', on=merge_keys)

Unnamed: 0,b_x,c,d,e,a,b_y
0,mammal,5.0,horse,13.0,2,3
1,mammal,1.4,cat,11.0,3,1
2,mammal,1.2,dog,,4,9


In [23]:
# Joining on 'b' finds no matches: df4.b holds strings, df2.b holds integers.
# Newer pandas versions refuse to merge a string key with an integer key
# (ValueError), so df2.b is cast to str first; the result is still the empty
# frame this cell is meant to demonstrate.
df4.merge(df2.astype({'b': str}), on=['b'])

Unnamed: 0,b,c_x,d_x,e_x,a,c_y,d_y,e_y


In [24]:
# Function form of the same inner join: pd.merge(left, right) is equivalent
# to left.merge(right). The right-hand frame is df2 (no `df3` is defined in
# this notebook).
pd.merge(df4, df2, on=['d'])

Unnamed: 0,b_x,c_x,d,e_x,a,b_y,c_y,e_y
0,mammal,5.0,horse,13.0,2,3,5.0,13.0
1,mammal,1.4,cat,11.0,3,1,1.4,11.0
2,mammal,1.2,dog,,4,9,1.2,


In [25]:
# Outer join: rows without a partner in the other frame are kept as well and
# padded with NaN. The right-hand frame is df2 (no `df3` is defined in this
# notebook).
pd.merge(df4, df2, on=['d'], how='outer')

Unnamed: 0,b_x,c_x,d,e_x,a,b_y,c_y,e_y
0,mammal,5.0,horse,13.0,2.0,3.0,5.0,13.0
1,mammal,1.4,cat,11.0,3.0,1.0,1.4,11.0
2,mammal,1.2,dog,,4.0,9.0,1.2,
3,fish,0.1,goldfish,,,,,
4,reptile,2.3,snake,9.0,,,,
5,fish,1.8,koi,10.0,,,,
6,fish,0.8,carp,,,,,
7,,,tortoise,,5.0,12.0,0.8,
8,,,rabbit,,6.0,2.0,2.3,9.0


In [26]:
# Add a running row number as column 'a'. The range is assigned directly
# (positionally); wrapping it in a pd.Series would align on index labels and
# silently produce NaN if df4's index were not 0..n-1.
df4['a'] = range(len(df4))

In [27]:
df4

Unnamed: 0,b,c,d,e,a
0,mammal,5.0,horse,13.0,0
1,mammal,1.4,cat,11.0,1
2,mammal,1.2,dog,,2
3,fish,0.1,goldfish,,3
4,reptile,2.3,snake,9.0,4
5,fish,1.8,koi,10.0,5
6,fish,0.8,carp,,6


In [29]:
# Stack the rows of df2 below those of df4 (row-wise concatenation). Each
# frame keeps its own row labels, so labels repeat in the result.
pd.concat([df4, df2], axis=0)

Unnamed: 0,a,b,c,d,e
0,0,mammal,5.0,horse,13.0
1,1,mammal,1.4,cat,11.0
2,2,mammal,1.2,dog,
3,3,fish,0.1,goldfish,
4,4,reptile,2.3,snake,9.0
5,5,fish,1.8,koi,10.0
6,6,fish,0.8,carp,
0,2,3,5.0,horse,13.0
1,3,1,1.4,cat,11.0
2,4,9,1.2,dog,


## References

For further information and examples, see the pandas user guide on merging, joining and concatenating: https://pandas.pydata.org/docs/user_guide/merging.html