# Introduction to Data Structures 

https://pandas.pydata.org/docs/user_guide/dsintro.html 

In [1]:
import numpy as np
import pandas as pd

## Series 

A **series** is a one-dimensional labelled array that can hold any (single) data type. The axis labels are called the **index**. 

In [2]:
# Creating a series from an ndarray
# What is an ndarray:
# https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html

s = pd.Series(np.random.randn(5), index=['a','b','c','d','e'])
# End the cell with the bare expression so the notebook displays it,
# rather than routing it through print().
s

a   -1.058399
b   -0.085628
c    1.373823
d    0.735940
e    0.084058
dtype: float64


In [3]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [4]:
# Without specifying the index, the labels default to 0..len(data)-1
pd.Series(np.random.randn(10))

0   -1.550420
1   -0.632872
2   -0.661243
3   -0.129520
4   -0.845718
5   -0.761466
6    0.299369
7    0.906608
8   -0.258728
9   -1.300462
dtype: float64

In [5]:
# Creating a series from a dict: the keys become the index labels

d = dict(a=0., b=1., c=2.)
pd.Series(d)

a    0.0
b    1.0
c    2.0
dtype: float64

In [6]:
# Passing an explicit index reorders the dict's entries and adds a label
# ('d') that has no corresponding data, so its value is NaN

pd.Series(d, index=['b','c','d','a'])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

In [7]:
# Creating a series from a single scalar using broadcasting

pd.Series(5., index=['a','b','c','d','e'])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

In [8]:
# Using numpy ndarray operations and slicing

s

a   -1.058399
b   -0.085628
c    1.373823
d    0.735940
e    0.084058
dtype: float64

### Series are ndarray-like

In [9]:
# Positional access: use .iloc explicitly. Plain s[0] on a string-labelled
# Series relies on an integer-as-position fallback that pandas 2.x deprecates.
s.iloc[0]

-1.0583988600619376

In [10]:
s[:3]

a   -1.058399
b   -0.085628
c    1.373823
dtype: float64

In [11]:
s[s > s.median()]

c    1.373823
d    0.735940
dtype: float64

In [12]:
# Subsetting by passing a LIST of positions (rows 4, 3, 1, in that order).
# .iloc states the positional intent explicitly; integer keys passed to
# s[...] on a string-labelled Series are deprecated in pandas 2.x.

s.iloc[[4, 3, 1]]

e    0.084058
d    0.735940
b   -0.085628
dtype: float64

In [13]:
# NP functions are vectorized 

np.exp(s)

a    0.347011
b    0.917935
c    3.950422
d    2.087444
e    1.087692
dtype: float64

### Series are also dict-like

In [14]:
s

a   -1.058399
b   -0.085628
c    1.373823
d    0.735940
e    0.084058
dtype: float64

In [15]:
s['a']

-1.0583988600619376

In [16]:
s['e']

0.08405829428236071

In [17]:
'e' in s

True

In [18]:
'f' in s

False

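So there are TWO ways to access series data: 

1. By label, passing the index value (here a string), e.g. `s['a']`
2. By position, passing the integer location, e.g. `s.iloc[0]` (plain `s[0]` on a string-labelled Series is a positional fallback that is deprecated in pandas 2.x)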

In [19]:
# Access by integer position; .iloc is the explicit, non-deprecated spelling
# of what s[0] does on a string-labelled Series.
s.iloc[0]

-1.0583988600619376

In [20]:
s['a']

-1.0583988600619376

## DataFrames

A DataFrame is a 2-dimensional labelled data structure with columns of potentially different types. It can also be thought of as a dict of Series. 

### DataFrame from dict of Series (or dict of Dicts)

In [21]:
# Two Series whose labels only partly overlap ('a' and 'b' are shared).
series_one = pd.Series(np.random.randn(5), index=list('abcde'))
series_two = pd.Series(np.random.randn(3), index=list('abx'))
d = {'one': series_one, 'two': series_two}

# The dict keys become the column labels of the frame.
df = pd.DataFrame(d)

In [22]:
df

Unnamed: 0,one,two
a,-1.479847,0.321168
b,0.75668,0.40706
c,1.829063,
d,-1.297602,
e,-0.81262,
x,,-1.544631


The index is the union of the indices of the Series. 

In [23]:
pd.DataFrame(d, index=['a','b','c'])

Unnamed: 0,one,two
a,-1.479847,0.321168
b,0.75668,0.40706
c,1.829063,


In [24]:
pd.DataFrame(d, index=['a', 'Blue', 'Yellow'], columns = ['one', 'three'])

Unnamed: 0,one,three
a,-1.479847,
Blue,,
Yellow,,


In [25]:
# Another example: a DataFrame built from a dict of plain dicts

s1 = dict(a=10, b=20, c=30)
s2 = dict(John=1.1, Paul=2.2, George=3.3, Ringo=4.4)

d2 = dict(one=s1, two=s2)
df2 = pd.DataFrame(d2)
df2

Unnamed: 0,one,two
George,,3.3
John,,1.1
Paul,,2.2
Ringo,,4.4
a,10.0,
b,20.0,
c,30.0,


In [26]:
# 'c' has no entry in s3, so df3 has a missing value in column 's3' for that row
s3 = {'a': 3.1, 'b':4.5}
df3 = pd.DataFrame({'s1': s1, 's3':s3})
df3

Unnamed: 0,s1,s3
a,10,3.1
b,20,4.5
c,30,


In [27]:
# Creates a new data frame from df3, using the existing 'a' index from df3
# plus a new index 'x' that is undefined 
pd.DataFrame(df3, index=['a', 'x'])

Unnamed: 0,s1,s3
a,10.0,3.1
x,,


In [28]:
# New data frame using contents of df3
# Two existing indices and a new undefined one
# One existing column and a new undefined one 

pd.DataFrame(df3, index=['b','c','z'], columns=['blue', 's3'])

Unnamed: 0,blue,s3
b,,4.5
c,,
z,,


In [29]:
df.columns

Index(['one', 'two'], dtype='object')

In [30]:
df.index

Index(['a', 'b', 'c', 'd', 'e', 'x'], dtype='object')

### DataFrame from dict of ndarrays or lists 

In [31]:
# Each key is a column label; each list supplies that column's values.
d = dict(
    one=[1., 2., 3., 4.],
    two=[4., 3., 2., 1],
)
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [32]:
pd.DataFrame(d, index=['a','b','c','d'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


So, in dict-of-lists form

- The keys of the dict are the column labels
- The list for each key holds that column's values, one per row 

### DataFrame from list of dicts

In [33]:
data2 = [{'a':1, 'b':2}, {'a':5, 'b':10, 'c': 20}]
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In list-of-dicts format, each dict is one **row**: its keys are (still) the column labels and its values are that row's entries. A key missing from a dict (like `'c'` in the first one) becomes NaN in that row.

### Column selection, addition, and deletion

In [34]:
# 'one' covers only the first three labels, so row 'd' is missing there.
labels = ['a', 'b', 'c', 'd']
d = {
    'one': pd.Series([1., 2., 3.], index=labels[:3]),
    'two': pd.Series([1., 2., 3., 4.], index=labels),
}

df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [35]:
# Selects the column

df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [36]:
df['two']

a    1.0
b    2.0
c    3.0
d    4.0
Name: two, dtype: float64

In [37]:
df['three'] = df['one'] * df['two']
df

Unnamed: 0,one,two,three
a,1.0,1.0,1.0
b,2.0,2.0,4.0
c,3.0,3.0,9.0
d,,4.0,


In [38]:
df['flag'] = df['one'] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


So `df['name']` selects the **column** `name` from `df`. This differs from a Series, where the same syntax selects a single **element** (row) by its label: 

In [39]:
s = pd.Series({'x': 5, 'y': 10, 'z': 20})
s['x']

5

In [40]:
df['foo'] = 'bar'
df

Unnamed: 0,one,two,three,flag,foo
a,1.0,1.0,1.0,False,bar
b,2.0,2.0,4.0,False,bar
c,3.0,3.0,9.0,True,bar
d,,4.0,,False,bar


To get a **row**: This is what `loc` and `iloc` are for. 

### Row selection

In [41]:
df

Unnamed: 0,one,two,three,flag,foo
a,1.0,1.0,1.0,False,bar
b,2.0,2.0,4.0,False,bar
c,3.0,3.0,9.0,True,bar
d,,4.0,,False,bar


In [42]:
df.loc['a']

one          1
two          1
three        1
flag     False
foo        bar
Name: a, dtype: object

In [43]:
df.iloc[2]

one         3
two         3
three       9
flag     True
foo       bar
Name: c, dtype: object

BUT we can use slicing without the `loc` or `iloc` to get rows: 

In [44]:
df[2:4]

Unnamed: 0,one,two,three,flag,foo
c,3.0,3.0,9.0,True,bar
d,,4.0,,False,bar


In [45]:
df[:-1]

Unnamed: 0,one,two,three,flag,foo
a,1.0,1.0,1.0,False,bar
b,2.0,2.0,4.0,False,bar
c,3.0,3.0,9.0,True,bar


...or by passing a boolean vector

In [46]:
df[[True,False,True,True]]

Unnamed: 0,one,two,three,flag,foo
a,1.0,1.0,1.0,False,bar
c,3.0,3.0,9.0,True,bar
d,,4.0,,False,bar


From the documentation with extras:

| Operation | Syntax | Result | 
|:----------|:-------|:-------|
|Select column | `df['colname']` | Series | 
|Select column by name (attribute access) | `df.Name` | Series |
|Select row by label | `df.loc['rowname']` | Series |
|Select row by integer location | `df.iloc[int]` | Series | 
|Select a slice of rows | `df[a:b]` | DataFrame | 
|Select rows by boolean vector | `df[boolvect]` | DataFrame| 
|Select a selection of columns | `df[list of cols]` | DataFrame | 
|Select a slice of columns | `df.loc[:, a:b]` | DataFrame | 


Example of the last two:


In [47]:
df.loc[:, 'two':'flag']

Unnamed: 0,two,three,flag
a,1.0,1.0,False
b,2.0,4.0,False
c,3.0,9.0,True
d,4.0,,False


In [48]:
df[['two', 'flag']]

Unnamed: 0,two,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,4.0,False


### DataFrame Arithmetic

In [52]:
# Two frames of random draws with different shapes: df is 10x4, while df2 is
# 7x3 and has no 'D' column.
abcd = ['A', 'B', 'C', 'D']
df = pd.DataFrame(np.random.randn(10, 4), columns=abcd)
df2 = pd.DataFrame(np.random.randn(7, 3), columns=abcd[:3])

In [53]:
df

Unnamed: 0,A,B,C,D
0,1.044538,-0.217392,0.070399,0.59799
1,0.383078,1.096602,1.899365,-1.691479
2,-1.333413,-0.786211,0.96389,-0.997049
3,-0.732239,0.027029,0.01551,-1.444083
4,0.703614,0.679537,0.339525,0.002012
5,0.373104,-0.302029,1.484094,0.344695
6,-1.357078,1.398729,0.066742,-0.495384
7,0.716465,-0.95843,-0.190507,-0.706965
8,1.080062,-0.819891,2.080093,1.398244
9,0.847034,-0.534891,-0.044211,0.205606


In [54]:
df2

Unnamed: 0,A,B,C
0,1.095805,-0.466587,-0.321851
1,-0.673696,-0.824631,-1.778407
2,-1.410752,-1.340187,1.269449
3,0.271052,0.358754,-0.332981
4,1.078408,-0.056481,0.662686
5,-0.113831,0.017959,-0.579881
6,-2.824457,0.50775,-0.082788


In [56]:
# Alignment takes place along both axes: rows and columns are matched by
# label, and any label present in only one frame (column 'D', rows 7-9)
# comes out as NaN in the result.
df + df2

Unnamed: 0,A,B,C,D
0,2.140342,-0.683978,-0.251452,
1,-0.290618,0.271971,0.120958,
2,-2.744165,-2.126398,2.233339,
3,-0.461187,0.385784,-0.317471,
4,1.782022,0.623056,1.002211,
5,0.259273,-0.284069,0.904212,
6,-4.181535,1.906479,-0.016045,
7,,,,
8,,,,
9,,,,


In [57]:
3*df

Unnamed: 0,A,B,C,D
0,3.133613,-0.652175,0.211198,1.793969
1,1.149234,3.289805,5.698095,-5.074438
2,-4.000239,-2.358634,2.89167,-2.991148
3,-2.196717,0.081088,0.04653,-4.332249
4,2.110842,2.038611,1.018575,0.006036
5,1.119312,-0.906086,4.452281,1.034086
6,-4.071233,4.196187,0.200227,-1.486151
7,2.149395,-2.87529,-0.571522,-2.120896
8,3.240185,-2.459673,6.240278,4.194731
9,2.541101,-1.604673,-0.132633,0.616817


In [58]:
# Numpy functions work too: build a small 5-row frame to apply them to

day_columns = ['Sunday', 'Monday']
df3 = pd.DataFrame(np.random.randn(5, len(day_columns)), columns=day_columns)

In [59]:
df3

Unnamed: 0,Sunday,Monday
0,0.416552,0.885032
1,0.318968,-0.730178
2,-0.357217,-0.896566
3,-0.190814,-0.409839
4,-0.307335,1.507348


In [60]:
np.exp(df3)

Unnamed: 0,Sunday,Monday
0,1.516723,2.423061
1,1.375708,0.481823
2,0.699621,0.407968
3,0.826286,0.663757
4,0.735404,4.514741


In [62]:
# log(exp(x)) should give back x, so this difference is zero up to
# floating-point rounding (hence the ~1e-16 entries in the output).
np.log(np.exp(df3)) - df3

Unnamed: 0,Sunday,Monday
0,5.5511150000000004e-17,-1.110223e-16
1,0.0,0.0
2,0.0,0.0
3,5.5511150000000004e-17,0.0
4,5.5511150000000004e-17,0.0


### Accessing columns in a method-like way

In [64]:
df.A

0    1.044538
1    0.383078
2   -1.333413
3   -0.732239
4    0.703614
5    0.373104
6   -1.357078
7    0.716465
8    1.080062
9    0.847034
Name: A, dtype: float64

In [65]:
df3

Unnamed: 0,Sunday,Monday
0,0.416552,0.885032
1,0.318968,-0.730178
2,-0.357217,-0.896566
3,-0.190814,-0.409839
4,-0.307335,1.507348


In [70]:
type(df3.Monday)

pandas.core.series.Series

In [69]:
type(df3['Monday'])

pandas.core.series.Series