# pandas: Python Data Analysis Library

*Data Science Training at Urban*

*Python, class 4, 4/13/2017*

*by Jeff Levy (jlevy@urban.org)*

------

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Read the example data set from a CSV file in the working directory into a DataFrame.
df = pd.read_csv('some_cities.csv')
# A bare expression on the last line of a cell is shown as the cell's output.
df

In [None]:
# Assemble one datetime column from the separate 'year', 'month' and 'day' columns.
df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

In [None]:
# Show the first five rows.
df.head()

----

**Creating New Columns, Map, Apply and ApplyMap**

In [None]:
# String columns concatenate element-wise with "+", giving a "City, ST" label per row.
df['citystate'] = (df['city'] + ', ') + df['state']
df.head()

In [None]:
# Element-wise sum of two columns, stored as a new column.
df['v1+v2'] = df['value1'].add(df['value2'])
df.head()

In [None]:
# Three ways to build a boolean "is this row in Michigan?" column.
# 1: the combined "City, ST" label ends with 'MI'.
df['is_michigan_1'] = df['citystate'].str.endswith('MI')
# 2: the label contains 'MI' anywhere (a looser check than endswith).
df['is_michigan_2'] = df['citystate'].str.contains('MI')
# 4: the state column is exactly 'MI'.
df['is_michigan_4'] = df['state'].eq('MI')
df.head()

In [None]:
#applymap: perform a function on every ELEMENT in the specified DATAFRAME
# NOTE: this replaces the numeric values with formatted strings, so the cell
# cannot be run twice without reloading df ('{:.2f}' rejects a string input).
# NOTE(review): recent pandas releases deprecate DataFrame.applymap in favor of
# DataFrame.map - confirm against the installed version.
df[['value1', 'value2']] = df[['value1', 'value2']].applymap('${:.2f}'.format)
df.head()

In [None]:
#map: perform a function on every ELEMENT in a COLUMN or ROW
# Here each total is rounded to one decimal place with Python's built-in round().
df['v1+v2'] = df['v1+v2'].map(lambda total: round(total, 1))
df.head()

In [None]:
#apply: perform a function across an ENTIRE row or column
def classification(row):
    """Label a row with its region and whether its city is a capital.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'city' and 'state' entries.

    Returns
    -------
    str
        '<region> Capital City' or '<region> Major City'. A state that is
        not in the lookup gets the region 'Another Planet I Think??'.
    """
    # 'Sacremento' is the misspelling this function originally checked for;
    # it is kept next to the correct 'Sacramento' so either form is recognized.
    capitals = {'Lansing', 'Springfield', 'Sacramento', 'Sacremento',
                'Harrisburg', 'Washington'}
    regions = {
        'MI': 'Midwest',
        'IL': 'Midwest',
        'PA': 'Northeast',
        'DC': 'South',
        'CA': 'West',
    }

    # The leading space lets the two parts be joined directly below.
    kind = ' Capital City' if row['city'] in capitals else ' Major City'
    region = regions.get(row['state'], 'Another Planet I Think??')

    return region + kind
    
# axis=1 hands the function one row at a time; the function can be passed directly.
df['classification'] = df.apply(classification, axis=1)
df.head()

----

**Pandas Groupby**


In [None]:
# Group the rows by each distinct (state, date) combination.
group = df.groupby(['state', 'date'])

In [None]:
# Displaying the groupby object shows only its repr, not the grouped rows.
group

In [None]:
# Iterating a groupby yields (key, sub-DataFrame) pairs.
for name, grp in group:
    print(name)
    print(grp)
    break  # stop after the first group; one example is enough

In [None]:
# Fetch a single group by its key. The key is a tuple because the grouping uses
# two columns. pd.Timestamp builds the date without a mid-notebook import.
group.get_group(('CA', pd.Timestamp('2005-01-01')))

In [None]:
# Average each numeric column within every group. numeric_only=True skips the
# text columns (city, citystate, ...), which cannot be averaged and make
# recent pandas versions raise a TypeError.
group.mean(numeric_only=True)

In [None]:
# Standard deviation of each numeric column within every group. numeric_only=True
# skips the text columns, which would otherwise raise on recent pandas versions.
group.std(numeric_only=True)

In [None]:
# agg takes a {column: function} mapping. Naming the aggregation as a string
# uses pandas' own implementation and avoids the deprecation warning that
# passing the NumPy callable triggers on recent versions.
group.agg({'v1+v2': 'mean'})

In [None]:
# Largest 'v1+v2' per state. Selecting the column before aggregating avoids
# passing the builtin max to apply, which only worked because pandas swapped
# it for a NumPy reduction behind the scenes.
df.groupby('state')['v1+v2'].max()

In [None]:
# Largest 'v1+v2' per state via apply: the lambda receives each state's sub-DataFrame.
df.groupby('state').apply(lambda g: g['v1+v2'].max())

----

**Time Series**

In [None]:
# Seed NumPy's global generator so the random values are the same on every run.
np.random.seed(0)

# 365 consecutive daily timestamps starting on 2000-01-01.
dates = pd.date_range('1/1/2000', periods=365, freq='D')
# One standard-normal draw per date.
vals = np.random.normal(0, 1, len(dates))
# NOTE: this rebinds df to a new frame indexed by date.
df = pd.DataFrame({'vals': vals, 'vals2': vals * 2}, index=dates)
df.index.name = 'date'
df.head()

In [None]:
# Downsample the daily rows into 'M' (monthly) bins and total each bin.
# NOTE(review): newer pandas versions prefer the 'ME' alias - confirm locally.
df.resample('M').sum().head(20)

In [None]:
# Row-over-row change: each value minus the value one row earlier (first row is NaN).
df['delta'] = df['vals'].diff()
df.head(10)

**Time Series with Groups**

In [None]:
# Seed NumPy's global generator so the random values are the same on every run.
np.random.seed(1)

n_days = 365

# One label per row, in blocks: all of person1's days, then person2's, then person3's.
people = ['person1'] * n_days + ['person2'] * n_days + ['person3'] * n_days

# The same daily range repeated once per person, so each date appears three times.
dates = pd.date_range('1/1/2000', periods=n_days, freq='D')
dates = list(dates) * 3

# One standard-normal draw per row.
vals = np.random.normal(0, 1, len(people))
vals2 = vals * 2

# NOTE: this rebinds df to a new long-format frame with a repeating date index.
df = pd.DataFrame({'person': people, 'vals': vals, 'vals2': vals2}, index=dates)

df.head(20)

In [None]:
# First row of each person's group.
df.groupby('person').head(1)

In [None]:
# Column-wise maximum within each person's rows.
df.groupby('person').max()

In [None]:
# Within each person's rows, downsample into 'Q' (quarterly) bins and average each bin.
# NOTE(review): newer pandas versions prefer the 'QE' alias - confirm locally.
df.groupby('person').resample('Q').mean()

---

**Pivot Long to Wide**

In [None]:
# Reshape long -> wide: the existing index stays as the rows, and every value
# column is spread into one column per person.
df_piv = df.pivot(columns='person')
df_piv.head()

---

**Practice**

Now try using the dataset from the homework, GSS2016.csv, and playing with groupby. You can also try changing the index, subsetting or aggregating. Try things out, and ask questions if you get stuck!