In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plot
import matplotlib.pyplot as plt

In [3]:
# A Series can be built straight from a plain Python list; pandas then
# attaches a default integer index (0, 1, 2, ...).
data = pd.Series([0.25, 0.5, 0.75, 1.0])
print("data looks like a numpy array: ", data)

# Supplying `index=` replaces the default integer labels with our own.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=list('abcd'))
print("data looks like a Python dict: ", data)

# Label lookup reads exactly like a dict access.
print(data['b'])

# A Series can also be created directly from a dict: the keys become the index.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}

population, area = pd.Series(population_dict), pd.Series(area_dict)
print(population)

# What do you think of this line?
# (Slicing by label keeps BOTH endpoints: 'Florida' is part of the result.)
print(population['California':'Florida'])

data looks like a numpy array:  0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
data looks like a Python dict:  a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.5
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
dtype: int64


In [4]:
# Rebuild the letter-indexed Series so this cell does not rely on earlier state.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=list('abcd'))
print(data)

# The same element fetched two ways: by label (.loc) and by position (.iloc).
print(data.loc['b'], data.iloc[1], sep="\n")

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.5
0.5


In [5]:
# Combine the two per-state Series into one table with one column each.
data = pd.DataFrame({
    'area': area,
    'pop': population,
})
print(data)

# Label slices in .loc include their end label, so this keeps every row up to
# 'Illinois' and every column up to 'pop'.
data.loc[:'Illinois', :'pop']

              area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127
Florida     170312  19552860
Illinois    149995  12882135


Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [6]:
def make_df(cols, ind):
    """Build a small demo DataFrame whose cells spell out their own position.

    Each cell holds ``str(column) + str(row)``, e.g. ``'A1'`` for column 'A'
    and row 1.

    Parameters
    ----------
    cols : iterable
        Column labels; one column is created per element (a string such as
        'ABC' yields the columns 'A', 'B', 'C').
    ind : iterable
        Row labels, used both as the DataFrame index and to build the cell text.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``ind`` with one column per element of ``cols``.
    """
    cells = {}
    for col in cols:
        cells[col] = [str(col) + str(row) for row in ind]
    return pd.DataFrame(cells, index=ind)

In [7]:
# NOTE: this cell repeats the make_df definition from the previous cell;
# running it simply rebinds the same name to an equivalent function.
def make_df(cols, ind):
    """Return a DataFrame whose cells read '<column label><row label>'.

    Parameters
    ----------
    cols : iterable
        Column labels; iterating a string such as 'ABC' gives 'A', 'B', 'C'.
    ind : iterable
        Row labels; they become the index and the suffix of every cell.

    Returns
    -------
    pandas.DataFrame
    """
    cells = {}
    for col in cols:
        cells[col] = [str(col) + str(row) for row in ind]
    return pd.DataFrame(cells, index=ind)

In [8]:
# Two frames with the same columns (A, B, C) but disjoint row labels.
df1, df2 = make_df('ABC', [1, 2]), make_df('ABC', [3, 4])

In [9]:
# NOTE: same statements as the previous cell; df1 and df2 are simply rebuilt.
df1, df2 = make_df('ABC', [1, 2]), make_df('ABC', [3, 4])

In [10]:

# Show df1 (rows 1-2, columns A-C) as the cell's output.
df1

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2


In [11]:
# Stack df2 underneath df1; the row labels of both frames are kept as-is.
pd.concat([df1, df2])

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3
4,A4,B4,C4


In [12]:
# Same idea with three rows per frame: labels 1-3 followed by labels 4-6.
df1, df2 = make_df('ABC', [1, 2, 3]), make_df('ABC', [4, 5, 6])
pd.concat([df1, df2])

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3
4,A4,B4,C4
5,A5,B5,C5
6,A6,B6,C6


In [13]:
x, y = make_df('AB', [0, 1]), make_df('AB', [2, 3])
# Give y the very same row labels as x so the two frames collide on index.
y.index = x.index

# A plain concat keeps the duplicated labels (0, 1, 0, 1).
print(pd.concat([x, y]))

# We can ask for hierarchical indexes: `keys` adds an outer level that names
# the frame each row came from.
hdf = pd.concat([x, y], keys=['x', 'y'])
print(hdf)

    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3
      A   B
x 0  A0  B0
  1  A1  B1
y 0  A2  B2
  1  A3  B3


In [14]:
# One row per employee with their department.
df1 = pd.DataFrame({
    'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'department': ['Accounting', 'Engineering', 'Engineering', 'HR'],
})

# The same employees, listed in a different order, each with a 'date' value.
df2 = pd.DataFrame({
    'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
    'date': [2004, 2008, 2012, 2014],
})

# No key is given, so the join uses the only column both frames share:
# 'employee'. Rows are matched by value, not by position.
df3 = df1.merge(df2)
df3

Unnamed: 0,employee,department,date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [15]:
# One supervisor per department.
df4 = pd.DataFrame({
    'department': ['Accounting', 'Engineering', 'HR'],
    'supervisor': ['Carly', 'Guido', 'Steve'],
})

# Several employees share 'Engineering', so its supervisor is repeated for
# each of them in the merged frame.
df3.merge(df4)

Unnamed: 0,employee,department,date,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


In [16]:
# Two competences listed for every department (six rows in total).
df5 = pd.DataFrame({
    'department': [
        'Accounting', 'Accounting',
        'Engineering', 'Engineering',
        'HR', 'HR',
    ],
    'competence': [
        'math', 'spreadsheets',
        'coding', 'linux',
        'spreadsheets', 'organization',
    ],
})

In [17]:
# 'department' is the shared column and it repeats on both sides, so every
# employee is paired with each competence of their department.
df1.merge(df5)

Unnamed: 0,employee,department,competence
0,Bob,Accounting,math
1,Bob,Accounting,spreadsheets
2,Jake,Engineering,coding
3,Jake,Engineering,linux
4,Lisa,Engineering,coding
5,Lisa,Engineering,linux
6,Sue,HR,spreadsheets
7,Sue,HR,organization


In [18]:
# Like the earlier employee table, plus one extra person ('Lea').
df6 = pd.DataFrame({
    'employee': ['Bob', 'Jake', 'Lisa', 'Sue', 'Lea'],
    'department': ['Accounting', 'Engineering', 'Engineering', 'HR', 'Engineering'],
})

# The 'date' table has no entry for 'Lea'.
df2 = pd.DataFrame({
    'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
    'date': [2004, 2008, 2012, 2014],
})

In [19]:
# 'Lea' has no matching row in df2, so she does not appear in the result.
df6.merge(df2)

Unnamed: 0,employee,department,date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [20]:
# Six rows: the keys A, B, C repeated twice, paired with the integers 0..5.
df = pd.DataFrame(
    {'key': list('ABC') * 2, 'data': range(6)},
    columns=['key', 'data'],
)
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [21]:
# Add up 'data' separately for each distinct value of 'key'.
df.groupby(by='key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [22]:
# Fixed seed so the random values (and the numbers printed below) repeat on
# every run.
rng = np.random.RandomState(42)

# A series with five random elements
ser = pd.Series(rng.rand(5))

# Total first, then the average, one value per line.
print(ser.sum(), ser.mean(), sep="\n")

2.811925491708157
0.5623850983416314


In [23]:
# Two columns of five random draws each, taken from the `rng` created earlier
# ('A' is drawn before 'B').
df = pd.DataFrame({
    'A': rng.rand(5),
    'B': rng.rand(5),
})

# Per column first, then per row (axis=1 is the positional spelling of
# axis='columns').
print(df.mean(), df.mean(axis=1), sep="\n")

A    0.477888
B    0.443420
dtype: float64
0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64


In [24]:
# Keys A, B, C repeated twice, with two numeric columns to aggregate.
df = pd.DataFrame({
    'key': list('ABC') * 2,
    'data1': range(6),
    'data2': [10, 11, 10, 9, 10, 10],
})
print(df)

  key  data1  data2
0   A      0     10
1   B      1     11
2   C      2     10
3   A      3      9
4   B      4     10
5   C      5     10


In [25]:
# Keep the grouping in a variable so that several aggregations can reuse it.
gb = df.groupby(by='key')

In [26]:
# Per-key totals followed by per-key averages, each printed as its own table.
print(gb.sum(), gb.mean(), sep="\n")

     data1  data2
key              
A        3     19
B        5     21
C        7     20
     data1  data2
key              
A      1.5    9.5
B      2.5   10.5
C      3.5   10.0
