# Lecture07 Manipulating Data and Joins

In [45]:
# Includes and Standard Magic...
### Standard Magic and startup initializers.

# Load Numpy
import numpy as np
# Load MatPlotLib
import matplotlib
import matplotlib.pyplot as plt
# Load Pandas
import pandas as pd

# This lets us show plots inline and also save PDF plots if we want them
%matplotlib inline
from matplotlib.backends.backend_pdf import PdfPages
matplotlib.style.use('fivethirtyeight')

# These two things are for Pandas, it widens the notebook and lets us display data easily.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Show a ludicrus number of rows and columns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Importance of using np.nan

Note that we have to import NumPy to get `np.nan`, but once we use it to mark missing values we can use `groupby` and other methods without having to worry about what to do with missing data.

For the example below, what happens when we don't use NaNs in the data table?

In [46]:
# Build a data frame from plain Python lists, one list per column.
# The last 'age' entry is the placeholder string '--' instead of a number;
# try replacing '--' with various values and re-run the cells below.

df = pd.DataFrame(dict(
    age=[12.2, 11.0, 15.6, '--'],
    wgt_kg=[42.3, 40.8, 65.3, 84.2],
    hgt_cm=[145.1, 143.8, 165.3, 185.8],
    sex=['male', 'female', 'male', 'male'],
    country=['USA', 'AUS', 'EU', 'USA'],
))
df

Unnamed: 0,age,wgt_kg,hgt_cm,sex,country
0,12.2,42.3,145.1,male,USA
1,11,40.8,143.8,female,AUS
2,15.6,65.3,165.3,male,EU
3,--,84.2,185.8,male,USA


In [47]:
# 'age' contains the string '--', so describe() gives the categorical-style
# summary (count/unique/top/freq) rather than mean, std and quantiles.
df['age'].describe()

count      4.0
unique     4.0
top       15.6
freq       1.0
Name: age, dtype: float64

In [48]:
# Check the column types: 'age' is 'object' (not float64) because of the '--' entry.
df.dtypes

age         object
wgt_kg     float64
hgt_cm     float64
sex         object
country     object
dtype: object

In [None]:
# 'age' is an object column that mixes floats with the string '--', so adding
# its values together fails. Catch the error and print it so the failure is
# still visible but "Restart & Run All" can continue past this cell.
try:
    print(df['age'].sum())
except TypeError as err:
    print("TypeError:", err)

## The Groupby Command

Below we see what happens in Pandas when we use the groupby command.

In [142]:
# Same table as before, but every 'age' entry is now a real number,
# so all three measurement columns are numeric.
df = pd.DataFrame(dict(
    age=[12.2, 11.0, 15.6, 35.1],
    wgt_kg=[42.3, 40.8, 65.3, 84.2],
    hgt_cm=[145.1, 143.8, 165.3, 185.8],
    sex=['male', 'female', 'male', 'male'],
    country=['USA', 'AUS', 'EU', 'USA'],
))
df

Unnamed: 0,age,wgt_kg,hgt_cm,sex,country
0,12.2,42.3,145.1,male,USA
1,11.0,40.8,143.8,female,AUS
2,15.6,65.3,165.3,male,EU
3,35.1,84.2,185.8,male,USA


In [143]:
# Group the rows by 'sex' and summarise each group: describe() reports
# count/mean/std/min/quartiles/max for each numeric column, per group.
# The 'female' group has a single row, so its std comes out as NaN.
df.groupby(['sex']).describe()

# Try describing it: describe() summarises each group.

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,wgt_kg,wgt_kg,wgt_kg,wgt_kg,wgt_kg,wgt_kg,wgt_kg,wgt_kg,hgt_cm,hgt_cm,hgt_cm,hgt_cm,hgt_cm,hgt_cm,hgt_cm,hgt_cm
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
female,1.0,11.0,,11.0,11.0,11.0,11.0,11.0,1.0,40.8,,40.8,40.8,40.8,40.8,40.8,1.0,143.8,,143.8,143.8,143.8,143.8,143.8
male,3.0,20.966667,12.357319,12.2,13.9,15.6,25.35,35.1,3.0,63.933333,20.983406,42.3,53.8,65.3,74.75,84.2,3.0,165.4,20.350184,145.1,155.2,165.3,175.55,185.8


In [144]:
# We can call an aggregation method on the groupby object to reduce each group to one row.
# numeric_only=True restricts the sum to the numeric columns (age, wgt_kg, hgt_cm);
# pandas 2.0+ no longer drops non-numeric columns by default, so without it the
# string column 'country' would be "summed" (concatenated) as well.
df.groupby(['sex']).sum(numeric_only=True)

Unnamed: 0_level_0,age,wgt_kg,hgt_cm
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,11.0,40.8,143.8
male,62.9,191.8,496.2


In [145]:
# Can also group by multiple columns: each distinct (sex, country) pair becomes its own group.
df.groupby(['sex', 'country']).describe()


Unnamed: 0_level_0,Unnamed: 1_level_0,age,age,age,age,age,age,age,age,wgt_kg,wgt_kg,wgt_kg,wgt_kg,wgt_kg,wgt_kg,wgt_kg,wgt_kg,hgt_cm,hgt_cm,hgt_cm,hgt_cm,hgt_cm,hgt_cm,hgt_cm,hgt_cm
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
sex,country,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
female,AUS,1.0,11.0,,11.0,11.0,11.0,11.0,11.0,1.0,40.8,,40.8,40.8,40.8,40.8,40.8,1.0,143.8,,143.8,143.8,143.8,143.8,143.8
male,EU,1.0,15.6,,15.6,15.6,15.6,15.6,15.6,1.0,65.3,,65.3,65.3,65.3,65.3,65.3,1.0,165.3,,165.3,165.3,165.3,165.3,165.3
male,USA,2.0,23.65,16.192745,12.2,17.925,23.65,29.375,35.1,2.0,63.25,29.627774,42.3,52.775,63.25,73.725,84.2,2.0,165.45,28.779246,145.1,155.275,165.45,175.625,185.8


In [146]:
# Keep the GroupBy object in a variable so the following cells can inspect its groups.
grouped = df.groupby(['sex', 'country'])

To get access to a group that we made, use the [get_group() command](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.groupby.GroupBy.get_group.html).

In [147]:
# .groups maps each (sex, country) key to the index labels of the rows in that group.
grouped.groups

{('female', 'AUS'): Int64Index([1], dtype='int64'),
 ('male', 'EU'): Int64Index([2], dtype='int64'),
 ('male', 'USA'): Int64Index([0, 3], dtype='int64')}

In [148]:
# With several grouping columns the group key is a tuple, one value per column.
grouped.get_group(('female', 'AUS'))

Unnamed: 0,age,wgt_kg,hgt_cm,sex,country
1,11.0,40.8,143.8,female,AUS


We can also get access to the actual tuples and their types that are generated by the `groupby` commands.

In [72]:

# The keys of .groups are the (sex, country) tuples; the values are the matching row labels.
grouped.groups

{('female', 'AUS'): Int64Index([1], dtype='int64'),
 ('male', 'EU'): Int64Index([2], dtype='int64'),
 ('male', 'USA'): Int64Index([0, 3], dtype='int64')}

## Joins in Pandas

Let's see a few [joins in Pandas](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.join.html) and how to execute them.

**Note:** We can also use the [merge command in pandas](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html) to do joins as well.  We'll explore this more fully in the lab.  The key idea is that merge lets us pick the columns to merge/join on, while join always joins on the index of the right-hand frame (and, by default, the index of the left-hand frame too).

In [51]:
# Careful here! join() lines rows up by index, so 'ID' has to be the index of both frames.

df1 = pd.DataFrame(
    [(1, 'foo', 3), (2, 'bar', 2), (3, 'foo', 4), (4, 'foo', 3)],
    columns=['ID', 'A', 'B'],
).set_index('ID')
display(df1)

df2 = pd.DataFrame(
    [(1, 1.2), (2, 2.5), (3, 2.3), (5, 8.0)],
    columns=['ID', 'C'],
).set_index('ID')
display(df2)

Unnamed: 0_level_0,A,B
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,foo,3
2,bar,2
3,foo,4
4,foo,3


Unnamed: 0_level_0,C
ID,Unnamed: 1_level_1
1,1.2
2,2.5
3,2.3
5,8.0


In [52]:
# Inner join: keep only the IDs present in both frames (1, 2 and 3).
df1.join(df2, how='inner')

Unnamed: 0_level_0,A,B,C
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,foo,3,1.2
2,bar,2,2.5
3,foo,4,2.3


In [53]:
# Can do inner, outer, left, right.
# A right join keeps every row of df2; ID 5 has no match in df1, so A and B are NaN there.
df1.join(df2, how='right')

Unnamed: 0_level_0,A,B,C
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,foo,3.0,1.2
2,bar,2.0,2.5
3,foo,4.0,2.3
5,,,8.0


If we don't have the same column names we can tell Pandas that we want to join on a specific column.

In [54]:
# The left frame keeps its default integer index; its key lives in the 'leftid' column.
df1 = pd.DataFrame(
    [(1, 'foo', 3), (2, 'bar', 2), (3, 'foo', 4), (4, 'foo', 3)],
    columns=['leftid', 'A', 'B'],
)
display(df1)

# The right frame is indexed by its key column, 'rightid'.
df2 = pd.DataFrame(
    [(1, 1.2), (2, 2.5), (3, 2.3), (5, 8.0)],
    columns=['rightid', 'C'],
).set_index('rightid')
display(df2)

Unnamed: 0,leftid,A,B
0,1,foo,3
1,2,bar,2
2,3,foo,4
3,4,foo,3


Unnamed: 0_level_0,C
rightid,Unnamed: 1_level_1
1,1.2
2,2.5
3,2.3
5,8.0


In [55]:
# Inner join -- but df1 still has its default 0..3 index while df2 is indexed by 'rightid'.
df1.join(df2, how='inner')
# Is this what we wanted?!?!  No: join() matched df1's row labels (0-3) against
# 'rightid' (1, 2, 3, 5), so leftid 2 was paired with the C value for rightid 1, and so on.

Unnamed: 0,leftid,A,B,C
1,2,bar,2,1.2
2,3,foo,4,2.5
3,4,foo,3,2.3


In [56]:
# We could have set the index but we can also tell Pandas to do it for us:
# on='leftid' matches df1's 'leftid' column against df2's index.
df1.join(df2, how='inner', on='leftid')

Unnamed: 0,leftid,A,B,C
0,1,foo,3,1.2
1,2,bar,2,2.5
2,3,foo,4,2.3


## Pandas Merge

Or we can skip all this foolishness by using the [pandas merge](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html) command...

In [149]:
# Or we can skip all this foolishness by using the merge command...
# Both frames keep 'ID' as an ordinary column this time (no set_index needed).
df1 = pd.DataFrame({
    'ID': [1, 2, 3, 4],
    'A': ['foo', 'bar', 'foo', 'foo'],
    'B': [3, 2, 4, 3],
})
display(df1)

df2 = pd.DataFrame({
    'ID': [1, 2, 3, 5],
    'C': [1.2, 2.5, 2.3, 8.0],
})
display(df2)

Unnamed: 0,ID,A,B
0,1,foo,3
1,2,bar,2
2,3,foo,4
3,4,foo,3


Unnamed: 0,ID,C
0,1,1.2
1,2,2.5
2,3,2.3
3,5,8.0


In [156]:
# The key column has the same name in both frames, so a single on="ID" names it for both sides.
df1.merge(df2, on="ID", how='inner')

Unnamed: 0,ID,A,B,C
0,1,foo,3,1.2
1,2,bar,2,2.5
2,3,foo,4,2.3
