## Merging Rows to a data frame 
- Useful when you need to add new entries and some of the values are missing


In [1]:
import pandas as pd

# One record per purchase. The store label is supplied separately as the row
# index; 'Store 1' appears twice, so the index is not unique.
purchases = [
    {'Name': 'Chris', 'Item Purchased': 'Sponge', 'Cost': 22.50},
    {'Name': 'Kevyn', 'Item Purchased': 'Kitty Litter', 'Cost': 2.50},
    {'Name': 'Filip', 'Item Purchased': 'Spoon', 'Cost': 5.00},
]
store_labels = ['Store 1', 'Store 1', 'Store 2']
df = pd.DataFrame(purchases, index=store_labels)
df

Unnamed: 0,Cost,Item Purchased,Name
Store 1,22.5,Sponge,Chris
Store 1,2.5,Kitty Litter,Kevyn
Store 2,5.0,Spoon,Filip


In [5]:
# Add a surname column when the surname is known for only two of the three people.
# First keep the store label as a regular column ('Loc'), then move the old index
# into an 'index' column so the rows are labelled 0..2. reset_index() returns a
# new frame, so `df` is rebound rather than mutated with inplace=True.
df = df.assign(Loc=df.index).reset_index()
# Assigning a Series aligns on the row labels: rows 0 and 2 receive a surname,
# row 1 has no matching label and is left as NaN.
df['Sname'] = pd.Series({0: 'Lynn', 2: 'Tie'})
df

Unnamed: 0,index,Cost,Item Purchased,Name,Loc,Sname
0,Store 1,22.5,Sponge,Chris,Store 1,Lynn
1,Store 1,2.5,Kitty Litter,Kevyn,Store 1,
2,Store 2,5.0,Spoon,Filip,Store 2,Tie


## Joining data frames  
- on index 
- on columns 
- when columns in joined frames have same names, different information


In [6]:
# Staff and students are each indexed by person name; some names occur in both frames.
staff_df = pd.DataFrame([
    {'Name': 'Kelly', 'Role': 'Director of HR'},
    {'Name': 'Sally', 'Role': 'Course liasion'},
    {'Name': 'James', 'Role': 'Grader'},
]).set_index('Name')
student_df = pd.DataFrame([
    {'Name': 'James', 'School': 'Business'},
    {'Name': 'Mike', 'School': 'Law'},
    {'Name': 'Sally', 'School': 'Engineering'},
]).set_index('Name')
print(staff_df.head())
print()
print(student_df.head())

                 Role
Name                 
Kelly  Director of HR
Sally  Course liasion
James          Grader

            School
Name              
James     Business
Mike           Law
Sally  Engineering


In [7]:
# Role and school for everyone: outer join on the shared 'Name' index, so people
# present in only one frame are kept and the missing side is filled with NaN.
staff_df.merge(student_df, how='outer', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Business
Kelly,Director of HR,
Mike,,Law
Sally,Course liasion,Engineering


### Note that joins keep the joining key as unique, and sort them after joining. 
### Need to be careful about joining on index, as it may not be unique

In [8]:
# Join on a column instead of the index: turn 'Name' back into an ordinary column
# in both frames, then left-join so that every staff row is kept.
staff_df, student_df = staff_df.reset_index(), student_df.reset_index()
staff_df.merge(student_df, how='left', on='Name')

Unnamed: 0,Name,Role,School
0,Kelly,Director of HR,
1,Sally,Course liasion,Engineering
2,James,Grader,Business


In [11]:
# Both frames carry a non-key 'Location' column holding different information.
# The merge keeps both, naming them Location_x (left frame) and Location_y (right frame).
staff_df = pd.DataFrame([
    {'Name': 'Kelly', 'Role': 'Director of HR', 'Location': 'State Street'},
    {'Name': 'Sally', 'Role': 'Course liasion', 'Location': 'Washington Avenue'},
    {'Name': 'James', 'Role': 'Grader', 'Location': 'Washington Avenue'},
])
student_df = pd.DataFrame([
    {'Name': 'James', 'School': 'Business', 'Location': '1024 Billiard Avenue'},
    {'Name': 'Mike', 'School': 'Law', 'Location': 'Fraternity House #22'},
    {'Name': 'Sally', 'School': 'Engineering', 'Location': '512 Wilson Crescent'},
])
staff_df.merge(student_df, how='left', on='Name')

Unnamed: 0,Location_x,Name,Role,Location_y,School
0,State Street,Kelly,Director of HR,,
1,Washington Avenue,Sally,Course liasion,512 Wilson Crescent,Engineering
2,Washington Avenue,James,Grader,1024 Billiard Avenue,Business


# Pandas Idioms: Best practices

## 1. Vectorized Code 
- Use vectorized functions instead of loops

## 2. Never Chain Index 
`df[['a','b']][df['b'] > 0]`    
- This is an example of chained indexing, i.e. back-to-back `[][]` lookups. It can unpredictably return 
either a view or a copy, and it is slow.

## 3. Method Chaining, Readability

In [14]:
# Folder holding the course data files.
# NOTE(review): this is an absolute path on the author's machine; point it at
# your own copy of the data before running.
# The folder variable is named DATA_DIR rather than `dir`, which shadowed the
# built-in dir() for the rest of the session.
DATA_DIR = '/Users/sumad/Documents/DS/Python/UM Spcialization/DS_with_Python/'
# read_csv opens and closes the file itself, so no explicit open() is needed.
cs = pd.read_csv(DATA_DIR + 'census.csv')

In [15]:
# Preview the first rows of the census frame to check that it loaded as expected.
cs.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243286,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


In [48]:
# Example : filter, drop na, set hierarchical indexes, rename columns

In [20]:
# Method chaining: wrapping the expression in () lets each step sit on its own
# line, so the pipeline reads top to bottom without intermediate variables.
# Steps: where() turns every row whose SUMLEV is not 50 into NaN, dropna() then
# removes rows containing NaN, set_index() builds a hierarchical
# (STNAME, CTYNAME) index, and rename() relabels the STATE column as STATE_CODE.
# The integer columns are displayed as floats (50.0, 3.0, ...) in the result below.
(cs.where(cs['SUMLEV'] == 50)
 .dropna()
 .set_index(['STNAME','CTYNAME'])
 .rename(columns = {'STATE':'STATE_CODE'})
 .head())

Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,STATE_CODE,COUNTY,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,Autauga County,50.0,3.0,6.0,1.0,1.0,54571.0,54571.0,54660.0,55253.0,55175.0,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
Alabama,Baldwin County,50.0,3.0,6.0,1.0,3.0,182265.0,182265.0,183193.0,186659.0,190396.0,...,14.83296,17.647293,21.845705,19.243286,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
Alabama,Barbour County,50.0,3.0,6.0,1.0,5.0,27457.0,27457.0,27341.0,27226.0,27159.0,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
Alabama,Bibb County,50.0,3.0,6.0,1.0,7.0,22915.0,22919.0,22861.0,22733.0,22642.0,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
Alabama,Blount County,50.0,3.0,6.0,1.0,9.0,57322.0,57322.0,57373.0,57711.0,57776.0,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


## 4. Functional programming
- Use of functional programming to chain operations together 
- typically, use map in Python; the similar method for DataFrame objects is apply 
-  use of lambda functions

In [41]:
# Compute min and max of population estimates across years, the years being columns in the DataFrame
import numpy as np
def min_max_pop(row, cols):
    """Return the selected entries of a row with their minimum and maximum appended.

    Parameters
    ----------
    row : pandas.Series
        One row of a DataFrame, as passed by ``DataFrame.apply(..., axis=1)``.
    cols : list of str
        Labels of the entries to keep and to take the min/max over.

    Returns
    -------
    pandas.Series
        The entries named in ``cols`` followed by two extra entries, ``'min'``
        and ``'max'``. The input ``row`` is not modified.
    """
    selected = row[cols]
    # Compute both statistics from the selected values before anything is
    # appended, so neither result can be influenced by the other new entry.
    lowest, highest = np.min(selected), np.max(selected)
    # Work on an explicit copy so the additions never write into the caller's row.
    result = selected.copy()
    result['min'] = lowest
    result['max'] = highest
    return result
    

In [42]:
# Yearly population-estimate columns, POPESTIMATE2010 through POPESTIMATE2015.
cols_select = ['POPESTIMATE{}'.format(year) for year in range(2010, 2016)]


In [43]:
from functools import partial  
# Pre-fill cols=cols_select so the resulting callable takes only the row,
# which is how DataFrame.apply(..., axis=1) will call it.
new_min_max_pop = partial(min_max_pop, cols = cols_select)

In [44]:
# Run the helper on every row of cs (axis=1 passes rows rather than columns).
df_min_max = cs.apply(new_min_max_pop, axis = 1)

In [45]:
# Check what kind of object the row-wise apply() produced.
type(df_min_max)

pandas.core.frame.DataFrame

In [46]:
# Preview: the selected year columns followed by the appended 'min' and 'max'.
df_min_max.head()

Unnamed: 0,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,min,max
0,4785161,4801108,4816089,4830533,4846411,4858979,4785161,4858979
1,54660,55253,55175,55038,55290,55347,54660,55347
2,183193,186659,190396,195126,199713,203709,183193,203709
3,27341,27226,27159,26973,26815,26489,26489,27341
4,22861,22733,22642,22512,22549,22583,22512,22861


In [51]:
# The same row-wise minimum written as an inline lambda. The lambda returns one
# scalar per row, so the result is a Series (despite the name `new_df`).
new_df = cs.apply(lambda x : np.min(x[cols_select]), axis = 1)
          
new_df.head()

0    4785161
1      54660
2     183193
3      26489
4      22512
dtype: int64

In [56]:
np.cumproduct?

## Group By 
- groupby splits a dataframe into groups. Iterating over the result yields tuples of (key, grouped data frame): the key is one of the unique values of the column used to group, and the grouped data frame holds the rows of the original data frame that have that key value.  
  - Instead of a column, a function can be passed that operates on index or column and creates the key to be split on. 
- split-compute-combine :  groupby has a method agg, that takes a dictionary of column and method to apply, then applies the method on column across grouped data frames, and returns a summarized data frame by key, columnname as name of the operation 
  - A column selection can be applied on grouped data frames by simply passing a list of column names on grouped object. grouped data frames can be of two types, grouped Series (if a single col. is selected)  and grouped dataframes

In [59]:
# Average 2010 census population per state, over the rows with SUMLEV == 50
# (in the preview of cs these are the county rows; SUMLEV 40 is the state total).
# The string 'mean' selects pandas' built-in group aggregation instead of
# passing the NumPy function np.average for pandas to call on each group.
(cs.loc[cs['SUMLEV'] == 50]
   .groupby('STNAME')
   .agg({'CENSUS2010POP': 'mean'})
   .head())

Unnamed: 0_level_0,CENSUS2010POP
STNAME,Unnamed: 1_level_1
Alabama,71339.343284
Alaska,24490.724138
Arizona,426134.466667
Arkansas,38878.906667
California,642309.586207


In [69]:
# Same aggregation, but grouped by the first letter of the state name split into 3 ranges; the rule is written as a function and passed to groupby
def state_grp(x):
    """Map a name to group 1, 2 or 3 according to its first character.

    First characters sorting before 'E' give 1, those from 'E' up to (but not
    including) 'M' give 2, and everything from 'M' onwards gives 3.
    """
    first_letter = x[0]
    if first_letter >= 'M':
        return 3
    return 1 if first_letter < 'E' else 2
        

# With STNAME as the index, groupby() calls state_grp on each index label (the
# state name) and uses the returned 1/2/3 as the group key.
# NOTE(review): unlike the per-state average computed earlier, cs is not filtered
# to SUMLEV == 50 here, so the state-level total rows are averaged together with
# the county rows. Confirm that this is intended.
cs.set_index('STNAME').groupby(state_grp).agg({'CENSUS2010POP':np.average})

Unnamed: 0,CENSUS2010POP
1,377840.778116
2,151103.787774
3,181356.161242


In [72]:
# Apply a set of operations to a set of columns, grouping on index level 0 (STNAME).
# `level` in groupby selects which level of the (multi-level) index to group by.
# Two spellings used previously no longer work in current pandas: selecting the
# columns with a bare tuple (groupby(...)['a', 'b']) and passing agg() a
# {output_name: function} dict to rename the results. The columns are therefore
# selected with a list, and the same two-level (operation, column) header is
# built by concatenating one aggregated frame per operation under its name.
# NOTE(review): cs still contains the state-level total rows, so 'add' counts each
# state's population twice. Filter on SUMLEV == 50 first if that is not intended.
(cs.set_index('STNAME')
   .groupby(level=0)[['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012']]
   .pipe(lambda grouped: pd.concat({'add': grouped.sum(), 'avg': grouped.mean()}, axis=1))
   .head())

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0_level_0,add,add,add,avg,avg,avg
Unnamed: 0_level_1,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012
STNAME,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Alabama,9570322,9602216,9632178,140740.0,141209.1,141649.7
Alaska,1428042,1445440,1462456,47601.4,48181.33,48748.53
Arizona,12816416,12937464,13106524,801026.0,808591.5,819157.8
Arkansas,5844788,5877076,5898998,76905.11,77329.95,77618.39
California,74668158,75400068,76112110,1265562.0,1277967.0,1290036.0
