# Pandas Data Structures

## 1 Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
from pandas import Series, DataFrame

In [3]:
# A Series built from a plain list gets a default integer index (0, 1, 2, ...)
obj = pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [5]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
obj.index[1]

1

### We can change labels directly

In [7]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [8]:
obj.index = ['Bob', 'James', 'John', 'William']

In [9]:
obj

Bob        4
James      7
John      -5
William    3
dtype: int64

### We can also specify our own labels when creating a Series

In [10]:
# Create a Series with explicit string labels instead of the default integer index
obj2 = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [11]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [12]:
obj2.index[2]

'a'

### We can select values by their index label

In [13]:
obj2['a']

-5

### We can modify values by assigning to a label

In [14]:
obj2['d'] = 6

In [15]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [16]:
obj2[['c', 'a', 'd']]  # note the double brackets: the inner list selects several labels at once

c    3
a   -5
d    6
dtype: int64

In [17]:
obj2[obj2 > 2]

d    6
b    7
c    3
dtype: int64

In [18]:
obj2 * 2  # returns a new Series; obj2 itself is not modified

d    12
b    14
a   -10
c     6
dtype: int64

In [19]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

### Combining with NumPy

In [20]:
# NumPy functions operate element-wise on a Series and keep its index labels
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

### The series can be thought of as a special dictionary

In [21]:
'b' in obj2

True

In [22]:
'f' in obj2

False

### We can build a Series from a dictionary

In [23]:
# Plain dictionary mapping state names to numbers
sdata = dict(Ohio=35000, Texas=71000, Utah=5000)

In [24]:
obj3 = pd.Series(sdata)
obj3

Ohio     35000
Texas    71000
Utah      5000
dtype: int64

### The dictionary keys become the index of the Series; we can override this by passing our own index

In [25]:
states = ['California', 'Ohio', 'Utah', 'Texas']

In [26]:
# Passing an explicit index controls which labels appear and in what order
obj4 = pd.Series(sdata, index = states)
obj4

California        NaN
Ohio          35000.0
Utah           5000.0
Texas         71000.0
dtype: float64

### Since 'California' is not a key in `sdata`, its value appears as `NaN`. We use `isnull` or `notnull` to detect such missing values

In [27]:
pd.isnull(obj4)

California     True
Ohio          False
Utah          False
Texas         False
dtype: bool

In [28]:
pd.notnull(obj4)

California    False
Ohio           True
Utah           True
Texas          True
dtype: bool

In [29]:
# or call the equivalent Series method directly
obj4.isnull()

California     True
Ohio          False
Utah          False
Texas         False
dtype: bool

### Data Alignment features of Series

In [30]:
obj3

Ohio     35000
Texas    71000
Utah      5000
dtype: int64

In [31]:
obj4

California        NaN
Ohio          35000.0
Utah           5000.0
Texas         71000.0
dtype: float64

In [32]:
obj3 + obj4   # values are aligned by index label; the result's index is the sorted union of both indexes

California         NaN
Ohio           70000.0
Texas         142000.0
Utah           10000.0
dtype: float64

### Both the Series and its index have a `name` attribute

In [33]:
obj4.name = 'population'

In [34]:
obj4.index.name = 'State'

In [35]:
obj4

State
California        NaN
Ohio          35000.0
Utah           5000.0
Texas         71000.0
Name: population, dtype: float64

## 2 DataFrame

### A DataFrame is a rectangular table in which each column can hold a different type of data (numbers, strings, boolean values).

### We can think of it as a collection of several Series that share the same index. Internally, the data are stored in one or more two-dimensional blocks.

In [36]:
# Build a DataFrame from a dictionary whose values are equal-length lists;
# each key becomes a column.
data = dict(
    State=['Ohio'] * 3 + ['Nevada'] * 3,
    Year=[2001, 2002, 2003] * 2,
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)

frame = pd.DataFrame(data)
frame

Unnamed: 0,State,Year,pop
0,Ohio,2001,1.5
1,Ohio,2002,1.7
2,Ohio,2003,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


### For a large DataFrame, we usually call the `head` method, which returns the first 5 rows, to get a quick look at its contents (a common first step in data analysis)

In [37]:
frame.head()

Unnamed: 0,State,Year,pop
0,Ohio,2001,1.5
1,Ohio,2002,1.7
2,Ohio,2003,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


### Of course, we can specify the column order

In [38]:
pd.DataFrame(data, columns=['Year', 'State', 'pop'])

Unnamed: 0,Year,State,pop
0,2001,Ohio,1.5
1,2002,Ohio,1.7
2,2003,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


### If we include a column name that does not exist in the data, its values appear as `NaN`

In [39]:
# 'debt' is not a key in `data`, so that column is created with missing values;
# `index` replaces the default integer row labels with 'a'..'f'.
frame2 = pd.DataFrame(data, columns = ['Year', 'State', 'pop', 'debt'],
                      index = ['a', 'b', 'c', 'd', 'e', 'f'])

In [40]:
frame2

Unnamed: 0,Year,State,pop,debt
a,2001,Ohio,1.5,
b,2002,Ohio,1.7,
c,2003,Ohio,3.6,
d,2001,Nevada,2.4,
e,2002,Nevada,2.9,
f,2003,Nevada,3.2,


In [41]:
frame2.columns

Index(['Year', 'State', 'pop', 'debt'], dtype='object')

### Extracting a single column from a DataFrame returns a Series; we can use dictionary-style indexing or attribute access

In [42]:
frame2['State']   # dictionary-style indexing by column name

a      Ohio
b      Ohio
c      Ohio
d    Nevada
e    Nevada
f    Nevada
Name: State, dtype: object

In [43]:
frame2.Year   # attribute-style access to the same kind of column

a    2001
b    2002
c    2003
d    2001
e    2002
f    2003
Name: Year, dtype: int64

### For rows, we use `loc` to select by index label

In [44]:
frame2.loc['b']

Year     2002
State    Ohio
pop       1.7
debt      NaN
Name: b, dtype: object

### We can change column values by assigning values

In [45]:
frame2['debt'] = 16.5
frame2

Unnamed: 0,Year,State,pop,debt
a,2001,Ohio,1.5,16.5
b,2002,Ohio,1.7,16.5
c,2003,Ohio,3.6,16.5
d,2001,Nevada,2.4,16.5
e,2002,Nevada,2.9,16.5
f,2003,Nevada,3.2,16.5


In [46]:
# assign an array produced by a NumPy function; its length (6) matches the number of rows
frame2['debt'] = np.arange(6)
frame2

Unnamed: 0,Year,State,pop,debt
a,2001,Ohio,1.5,0
b,2002,Ohio,1.7,1
c,2003,Ohio,3.6,2
d,2001,Nevada,2.4,3
e,2002,Nevada,2.9,4
f,2003,Nevada,3.2,5


### Note that when assigning a list or an array to a column, its length must match the length of the DataFrame.
### We can also assign a Series to a column; it is aligned to the DataFrame's index, and `NaN` appears at every position for which the Series has no value.

In [47]:
# A Series is aligned to the frame's index on assignment: rows 'b', 'd' and 'e'
# receive values, every other row becomes NaN.
val = pd.Series([-1.2, -1.5, -1.7], index = ['b', 'd', 'e'])

# Bracket assignment is used (as in the cells above) because attribute-style
# assignment creates a plain attribute, not a column, if the column does not exist.
frame2['debt'] = val
frame2

Unnamed: 0,Year,State,pop,debt
a,2001,Ohio,1.5,
b,2002,Ohio,1.7,-1.2
c,2003,Ohio,3.6,
d,2001,Nevada,2.4,-1.5
e,2002,Nevada,2.9,-1.7
f,2003,Nevada,3.2,


In [48]:
# Without an explicit index the Series is labelled 0..5; none of those labels
# match the frame's 'a'..'f', so the whole 'debt' column is overwritten with NaN.
val = pd.Series([-1.2, -1.5, -1.7, 1.6, 1.8, 1.9])

frame2['debt'] = val
frame2

Unnamed: 0,Year,State,pop,debt
a,2001,Ohio,1.5,
b,2002,Ohio,1.7,
c,2003,Ohio,3.6,
d,2001,Nevada,2.4,
e,2002,Nevada,2.9,
f,2003,Nevada,3.2,


### Assigning to a column that does not exist creates a new column

In [49]:
frame2['eastern'] = frame2.State  == 'Ohio'
frame2

Unnamed: 0,Year,State,pop,debt,eastern
a,2001,Ohio,1.5,,True
b,2002,Ohio,1.7,,True
c,2003,Ohio,3.6,,True
d,2001,Nevada,2.4,,False
e,2002,Nevada,2.9,,False
f,2003,Nevada,3.2,,False


### We can delete columns with `del`, just as we delete keys from a dictionary

In [50]:
del frame2['eastern']

In [51]:
frame2.columns

Index(['Year', 'State', 'pop', 'debt'], dtype='object')

### Note that indexing a column returns a view on the DataFrame's data rather than a copy, so in-place changes to that Series are reflected in the original DataFrame

### Another common form of input is a nested dictionary (a dictionary of dictionaries)

In [53]:
# Nested dictionary: the outer keys are state names, the inner keys are years
pop = dict(
    Nevada={2001: 1.2, 2002: 1.5},
    Ohio={2000: 1.0, 2001: 1.8, 2002: 1.4},
)

frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.0
2001,1.2,1.8
2002,1.5,1.4


### Note that the outer keys become the columns and the inner keys become the row index

### Interestingly, a DataFrame can also be transposed

In [55]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,1.2,1.5
Ohio,1.0,1.8,1.4


In [56]:
# A dictionary of Series also works as input.
# .iloc makes the slices explicitly positional: the index labels here are
# integers (years), so plain [] slicing is easy to misread as label-based.
pdata = {'Nevada': frame3['Nevada'].iloc[:-1],
         'Ohio': frame3['Ohio'].iloc[:2] }

pd.DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,1.0
2001,1.2,1.8


### The columns and the index can each have their own name

In [57]:
frame3.index.name = 'Year'
frame3.columns.name = 'States'

frame3

States,Nevada,Ohio
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.0
2001,1.2,1.8
2002,1.5,1.4


### The `values` attribute returns the data as a NumPy array

In [63]:
frame3.values

array([[nan, 1. ],
       [1.2, 1.8],
       [1.5, 1.4]])

In [64]:
type(frame3.values)

numpy.ndarray

### If the columns have different types, the dtype of the resulting array is chosen to accommodate all of them

In [65]:
frame.values

array([['Ohio', 2001, 1.5],
       ['Ohio', 2002, 1.7],
       ['Ohio', 2003, 3.6],
       ['Nevada', 2001, 2.4],
       ['Nevada', 2002, 2.9],
       ['Nevada', 2003, 3.2]], dtype=object)

## 3 Index Objects

In [67]:
# A small labelled Series whose index object is inspected in the following cells
obj = pd.Series(data=range(3), index=list('abc'))
obj

a    0
b    1
c    2
dtype: int64

In [70]:
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [71]:
index[1:]

Index(['b', 'c'], dtype='object')

### Index objects are immutable, which makes them safe to share among data structures

In [72]:
# Item assignment on an Index is expected to fail. Catch the error and show it,
# so the notebook still runs from top to bottom without stopping here.
try:
    index[1] = 'd'
except TypeError as err:
    print('TypeError:', err)

TypeError: Index does not support mutable operations

In [77]:
labels = pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [79]:
obj2 = pd.Series([-1, 1.2, 1.5], index = labels)
obj2

0   -1.0
1    1.2
2    1.5
dtype: float64

In [80]:
obj2.index is labels

True

### Besides being array-like, an Index also behaves like a fixed-size set; unlike a Python set, however, it can contain duplicate labels

In [81]:
frame3

States,Nevada,Ohio
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.0
2001,1.2,1.8
2002,1.5,1.4


In [82]:
frame3.columns

Index(['Nevada', 'Ohio'], dtype='object', name='States')

In [83]:
'Ohio' in frame3.columns

True

In [84]:
# False: the column labels are the state names; the years are integer labels in the row index
'2002' in frame3.columns

False

In [85]:
# An Index may hold duplicate labels
dup_labels = pd.Index(['tea'] * 2 + ['jack'] * 2)
dup_labels

Index(['tea', 'tea', 'jack', 'jack'], dtype='object')