## 5.1 Introduction to pandas Data Structures

### Series

In [16]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# a range can be passed straight to Series; the default 0..n-1 index is generated
obj = pd.Series(range(5))

In [4]:
obj

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [6]:
obj.values

array([0, 1, 2, 3, 4])

In [7]:
obj.index

RangeIndex(start=0, stop=5, step=1)

In [8]:
# with a user-defined index: one label per value

obj = pd.Series(range(3), index=list('abc'))

In [9]:
obj

a    0
b    1
c    2
dtype: int64

In [10]:
obj.index

Index(['a', 'b', 'c'], dtype='object')

In [11]:
obj['a']

0

In [12]:
obj['c']

2

In [13]:
obj[['c', 'a', 'b']]

c    2
a    0
b    1
dtype: int64

In [14]:
obj[obj>0]

b    1
c    2
dtype: int64

In [15]:
obj * 2

a    0
b    2
c    4
dtype: int64

In [17]:
np.exp(obj)

a    1.000000
b    2.718282
c    7.389056
dtype: float64

In [18]:
'b' in obj

True

In [19]:
'd' in obj

False

In [20]:
# `in` on a Series tests the index labels, not the stored values
1 in obj

False

In [21]:
1 in obj.values

True

In [22]:
# build a plain dict (1 -> 11, 2 -> 12, 3 -> 13) to pass to pandas

dct = dict(zip(range(1, 4), range(11, 14)))

In [23]:
obj2 = pd.Series(dct)

In [24]:
obj2

1    11
2    12
3    13
dtype: int64

In [25]:
# what about an OrderedDict?
from collections import OrderedDict

In [26]:
# same key -> key + 10 mapping as the plain dict, with insertion order kept explicitly
od = OrderedDict((key, key + 10) for key in range(1, 4))

In [27]:
obj_2_1 = pd.Series(od)

In [29]:
obj_2_1

1    11
2    12
3    13
dtype: int64

In [32]:
obj2 = pd.Series(dct, index=[3,2,1,0])

In [33]:
obj2

3    13.0
2    12.0
1    11.0
0     NaN
dtype: float64

In [36]:
# check for null (missing) values

In [34]:
pd.isnull(obj2)

3    False
2    False
1    False
0     True
dtype: bool

In [35]:
pd.notnull(obj2)

3     True
2     True
1     True
0    False
dtype: bool

In [40]:
# arithmetic operations

In [37]:
obj2

3    13.0
2    12.0
1    11.0
0     NaN
dtype: float64

In [38]:
obj_2_1

1    11
2    12
3    13
dtype: int64

In [39]:
obj2 + obj_2_1

0     NaN
1    22.0
2    24.0
3    26.0
dtype: float64

In [41]:
obj2 * obj_2_1

0      NaN
1    121.0
2    144.0
3    169.0
dtype: float64

In [42]:
# assign the name attributes of the Series and of its index

In [43]:
obj2.name = 'zx'

In [44]:
obj2.index.name = 'num'

In [45]:
obj2

num
3    13.0
2    12.0
1    11.0
0     NaN
Name: zx, dtype: float64

In [46]:
# replace the index labels in place (assigning a plain list also drops the index name 'num')

obj2.index = ['a', 'b', 'c', 'd']

In [47]:
obj2

a    13.0
b    12.0
c    11.0
d     NaN
Name: zx, dtype: float64

### DataFrame

In [48]:
# column-oriented input: each key becomes a column, each list holds that column's values
# NOTE(review): 'navada' (last two entries of 'state') looks like a typo for 'nevada' -- confirm;
# correcting it changes every frame displayed from this data, so re-run the notebook afterwards
data = {
    'state': ['ohio', 'ohio', 'ohio', 'nevada', 'navada', 'navada'],
    'year': [2017, 2016, 2015, 2017, 2016, 2015],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 2.2]
}

In [49]:
frame = pd.DataFrame(data)

In [50]:
frame

Unnamed: 0,pop,state,year
0,1.5,ohio,2017
1,1.7,ohio,2016
2,3.6,ohio,2015
3,2.4,nevada,2017
4,2.9,navada,2016
5,2.2,navada,2015


In [51]:
frame.head()  # show only the first five rows

Unnamed: 0,pop,state,year
0,1.5,ohio,2017
1,1.7,ohio,2016
2,3.6,ohio,2015
3,2.4,nevada,2017
4,2.9,navada,2016


In [52]:
# pass `columns` to fix the column order explicitly
frame2 = pd.DataFrame(data, columns=['year', 'pop', 'state'])

In [53]:
frame2

Unnamed: 0,year,pop,state
0,2017,1.5,ohio
1,2016,1.7,ohio
2,2015,3.6,ohio
3,2017,2.4,nevada
4,2016,2.9,navada
5,2015,2.2,navada


In [54]:
# specify index

frame3 = pd.DataFrame(data, index=['a', 'b', 'c', 'd', 'e', 'f'])

In [55]:
frame3

Unnamed: 0,pop,state,year
a,1.5,ohio,2017
b,1.7,ohio,2016
c,3.6,ohio,2015
d,2.4,nevada,2017
e,2.9,navada,2016
f,2.2,navada,2015


In [57]:
frame3.columns

Index(['pop', 'state', 'year'], dtype='object')

In [58]:
frame['year']

0    2017
1    2016
2    2015
3    2017
4    2016
5    2015
Name: year, dtype: int64

In [59]:
frame['state']

0      ohio
1      ohio
2      ohio
3    nevada
4    navada
5    navada
Name: state, dtype: object

In [60]:
frame.loc[1]

pop       1.7
state    ohio
year     2016
Name: 1, dtype: object

In [61]:
# assigning a scalar creates the column and fills every row with that value
frame['debt'] = 10

In [62]:
frame

Unnamed: 0,pop,state,year,debt
0,1.5,ohio,2017,10
1,1.7,ohio,2016,10
2,3.6,ohio,2015,10
3,2.4,nevada,2017,10
4,2.9,navada,2016,10
5,2.2,navada,2015,10


In [63]:
frame['acre'] = [1,2,3,4,5,6]

In [64]:
frame

Unnamed: 0,pop,state,year,debt,acre
0,1.5,ohio,2017,10,1
1,1.7,ohio,2016,10,2
2,3.6,ohio,2015,10,3
3,2.4,nevada,2017,10,4
4,2.9,navada,2016,10,5
5,2.2,navada,2015,10,6


In [66]:
# label -> value mapping; the dict keys become the index
val = pd.Series({1: 1.2, 3: -1.5, 5: -1.7})

In [67]:
# assigning a Series aligns on index labels; rows without a match (0, 2, 4) become NaN
frame['debt'] = val

In [68]:
frame

Unnamed: 0,pop,state,year,debt,acre
0,1.5,ohio,2017,,1
1,1.7,ohio,2016,1.2,2
2,3.6,ohio,2015,,3
3,2.4,nevada,2017,-1.5,4
4,2.9,navada,2016,,5
5,2.2,navada,2015,-1.7,6


In [70]:
# overwrite the existing 'acre' column with a boolean flag: True where 'pop' exceeds 2
frame['acre'] = frame['pop'] > 2

In [71]:
frame

Unnamed: 0,pop,state,year,debt,acre
0,1.5,ohio,2017,,False
1,1.7,ohio,2016,1.2,False
2,3.6,ohio,2015,,True
3,2.4,nevada,2017,-1.5,True
4,2.9,navada,2016,,True
5,2.2,navada,2015,-1.7,True


In [74]:
# delete a column

In [72]:
del frame['debt']

In [73]:
frame

Unnamed: 0,pop,state,year,acre
0,1.5,ohio,2017,False
1,1.7,ohio,2016,False
2,3.6,ohio,2015,True
3,2.4,nevada,2017,True
4,2.9,navada,2016,True
5,2.2,navada,2015,True


In [79]:
# nested dict: outer keys become the columns, inner keys become the row index
pop = dict(
    ohio={2001: 1.5, 2002: 1.6},
    nevada={2001: 2.2, 2002: 2.4, 2003: 2.3},
)

In [80]:
pop

{'nevada': {2001: 2.2, 2002: 2.4, 2003: 2.3}, 'ohio': {2001: 1.5, 2002: 1.6}}

In [81]:
frame2 = pd.DataFrame(pop)

In [82]:
frame2

Unnamed: 0,nevada,ohio
2001,2.2,1.5
2002,2.4,1.6
2003,2.3,


In [83]:
# transpose the dataframe

frame2.T

Unnamed: 0,2001,2002,2003
nevada,2.2,2.4,2.3
ohio,1.5,1.6,


In [84]:
pd.DataFrame(pop, index=[2001, 2003, 2002])

Unnamed: 0,nevada,ohio
2001,2.2,1.5
2003,2.3,
2002,2.4,1.6


In [85]:
# specify index name and column name

frame2.index.name ='year'
frame2.columns.name = 'state'

In [86]:
frame2

state,nevada,ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.2,1.5
2002,2.4,1.6
2003,2.3,


In [88]:
# get the underlying data as a 2-D NumPy array

In [87]:
frame2.values

array([[ 2.2,  1.5],
       [ 2.4,  1.6],
       [ 2.3,  nan]])

In [90]:
frame

Unnamed: 0,pop,state,year,acre
0,1.5,ohio,2017,False
1,1.7,ohio,2016,False
2,3.6,ohio,2015,True
3,2.4,nevada,2017,True
4,2.9,navada,2016,True
5,2.2,navada,2015,True


In [89]:
frame.values

array([[1.5, 'ohio', 2017, False],
       [1.7, 'ohio', 2016, False],
       [3.6, 'ohio', 2015, True],
       [2.4, 'nevada', 2017, True],
       [2.9, 'navada', 2016, True],
       [2.2, 'navada', 2015, True]], dtype=object)

### Index Objects

In [91]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])

In [92]:
obj

a    0
b    1
c    2
dtype: int64

In [93]:
index = obj.index

In [94]:
index

Index(['a', 'b', 'c'], dtype='object')

In [95]:
index[1:]

Index(['b', 'c'], dtype='object')

In [97]:
# Index objects are immutable, so their values cannot be modified in place.
# The following assignment would raise a TypeError:
# index[1] = 'd'

In [98]:
labels = pd.Index(np.arange(3))

In [99]:
labels

Int64Index([0, 1, 2], dtype='int64')

In [101]:
obj2 = pd.Series([1.5, -2, 3], index=labels)

In [102]:
obj2

0    1.5
1   -2.0
2    3.0
dtype: float64

In [103]:
# redisplay frame2 before inspecting its row and column Index objects

frame2

state,nevada,ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.2,1.5
2002,2.4,1.6
2003,2.3,


In [104]:
frame2.columns

Index(['nevada', 'ohio'], dtype='object', name='state')

In [105]:
'ohio' in frame2.columns

True

In [106]:
2001 in frame2.index

True

In [107]:
# an Index may contain duplicate labels

dup_labels = pd.Index(list('aba'))


In [108]:
obj

a    0
b    1
c    2
dtype: int64

In [109]:
pd.DataFrame(obj, index=dup_labels)

Unnamed: 0,0
a,0
b,1
a,0
