In [1]:
import pandas as pd 
import numpy as np

## The Pandas Series Object

In [2]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [3]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [4]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
data[1]

0.5

In [6]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [7]:
data = pd.Series([0.25, 0.5, 0.75, 1.0],
 index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [8]:
data['b']

0.5

In [9]:
data = pd.Series([0.25, 0.5, 0.75, 1.0],
 index=[2, 5, 3, 7])
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [10]:
data.sort_index(inplace=True)

In [11]:
data

2    0.25
3    0.75
5    0.50
7    1.00
dtype: float64

In [12]:
data[5]

0.5

In [13]:
population_dict = {'California': 38332521,
 'Texas': 26448193,
 'New York': 19651127,
 'Florida': 19552860,
 'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [14]:
population['California']

38332521

In [15]:
population['California':'Florida']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
dtype: int64

## Constructing Series objects

In [16]:
pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [17]:
pd.Series(5,index=[2,3,4,5])

2    5
3    5
4    5
5    5
dtype: int64

In [18]:
pd.Series({2:'a',1:'b',3:'c'})

2    a
1    b
3    c
dtype: object

In [19]:
pd.Series({2:'a',1:'b',3:'c'}, index=[2,3])

2    a
3    c
dtype: object

## From a one-dimensional NumPy array.

In [20]:
pd.Series(np.arange(1,5),index=range(0,4))

0    1
1    2
2    3
3    4
dtype: int32

## The Pandas DataFrame Object

In [21]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
 'Florida': 170312, 'Illinois': 149995}

In [22]:
area=pd.Series(area_dict)

In [23]:
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [24]:
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

## From a dictionary of Series objects. 

In [25]:
states=pd.DataFrame({'population':population,'area':area})

In [26]:
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [27]:
state=pd.DataFrame({'population':population,'area':area},columns=['population'])

In [28]:
state

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [29]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [30]:
states.columns

Index(['population', 'area'], dtype='object')

In [31]:
states.values

array([[38332521,   423967],
       [26448193,   695662],
       [19651127,   141297],
       [19552860,   170312],
       [12882135,   149995]], dtype=int64)

In [32]:
print(states['area'])
print(states['population'])

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64


In [33]:
### for single series object

In [34]:
pd.DataFrame([population,area],columns=['population','area'])

Unnamed: 0,population,area
0,,
1,,


In [35]:
pd.DataFrame(population)

Unnamed: 0,0
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [36]:
data=[{"a":i,'b':3*i} for i in range(1,4)]

In [37]:
pd.DataFrame(data)

Unnamed: 0,a,b
0,1,3
1,2,6
2,3,9


In [38]:
pd.DataFrame(data,index=['a','b','c'],columns=["a"])

Unnamed: 0,a
a,1
b,2
c,3


In [39]:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


## From a one-dimensional NumPy array. 

In [40]:
a=pd.DataFrame(np.arange(1,4),columns=['a'])
a.index.name="1ddf"
print(a)
type(a)

      a
1ddf   
0     1
1     2
2     3


pandas.core.frame.DataFrame

## From a two-dimensional NumPy array. 

In [41]:
pd.DataFrame(np.random.rand(3, 2),
 columns=['foo', 'bar'],
 index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.028836,0.63517
b,0.108035,0.195962
c,0.627022,0.07425


## From a NumPy structured array.

In [42]:
A=np.zeros(3,dtype=[("A",'int64'),('B','float64')])
a=pd.DataFrame(A)
a

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


## The Pandas Index Object


In [43]:
ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [44]:
ind[1]

3

In [45]:
ind[2:4]

Int64Index([5, 7], dtype='int64')

In [46]:
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [47]:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [48]:
ind[1] = 0

TypeError: Index does not support mutable operations

## Index as ordered set

In [49]:
indA = pd.Index([1, 3, 5, 7, 9])

indB = pd.Index([2, 3, 5, 7, 11])

In [50]:
indA|indB #union

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [51]:
indA & indB # intersection

Int64Index([3, 5, 7], dtype='int64')

In [52]:
indA^indB # symmeteric Difference

Int64Index([1, 2, 9, 11], dtype='int64')

## Data Indexing and Selection

### Data Selection in Series


In [54]:
data = pd.Series([0.25, 0.5, 0.75, 1.0],
index=['a', "b", 'c', 'd'])
data


a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [55]:
data["a"]

0.25

In [56]:
data[1] # if there any index will be 1 it give expicitly

0.5

In [57]:

"a" in data

True

In [58]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [59]:
data.keys

<bound method Series.keys of a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64>

In [60]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [61]:
data['a':'e']

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [62]:
data[1:4]

b    0.50
c    0.75
d    1.00
dtype: float64

In [63]:
data[(data>0.3) & (data<0.8)]

b    0.50
c    0.75
dtype: float64

In [64]:
data.loc[(data>0.3) & (data<0.8)]

b    0.50
c    0.75
dtype: float64

In [65]:
data[["a",'d']]

a    0.25
d    1.00
dtype: float64

In [66]:
data.loc[["a",'d']]

a    0.25
d    1.00
dtype: float64

In [67]:
data.loc['a']

0.25

In [68]:
data.loc['a':'d']

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [69]:
data.loc[1:2] # loc work on index and give the value of alast index also

TypeError: cannot do slice indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [1] of <class 'int'>

In [70]:
data.iloc[1:2]

b    0.5
dtype: float64

In [71]:
data.iloc[3]

1.0

In [72]:
data.iloc[[1,3]]

b    0.5
d    1.0
dtype: float64

In [73]:
data.iloc[(data>0.3)]

ValueError: iLocation based boolean indexing cannot use an indexable as a mask

###  DataFrame as a dictionary

In [74]:
area = pd.Series({'California': 423967, 'Texas': 695662,
 'New York': 141297, 'Florida': 170312,
 'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
 'New York': 19651127, 'Florida': 19552860,
 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})

data=data.sort_index()


In [75]:
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [76]:
data['California':]

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [77]:
data[data.area > 170000]

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Texas,695662,26448193


In [78]:
# cant perform fancy indexing without loc

In [79]:
data.loc['Texas']

area      695662
pop     26448193
Name: Texas, dtype: int64

In [80]:
data.loc['Texas','area']

695662

In [81]:
data.loc['Florida':]

Unnamed: 0,area,pop
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [82]:
data.loc['Florida':,'area']

Florida     170312
Illinois    149995
New York    141297
Texas       695662
Name: area, dtype: int64

In [83]:
data.loc['Florida','area':]

area      170312
pop     19552860
Name: Florida, dtype: int64

In [84]:
data.loc['Florida':,'area':]

Unnamed: 0,area,pop
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [85]:
data.iloc[1]

area      170312
pop     19552860
Name: Florida, dtype: int64

In [86]:
data.iloc[1,1]

19552860

In [87]:
data.iloc[1:]

Unnamed: 0,area,pop
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [88]:
data.iloc[1:,1]

Florida     19552860
Illinois    12882135
New York    19651127
Texas       26448193
Name: pop, dtype: int64

In [89]:
data.iloc[1,:]

area      170312
pop     19552860
Name: Florida, dtype: int64

In [90]:
data.iloc[:2,:2]

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860


In [91]:
data.loc[data.area > 170000]

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Texas,695662,26448193


In [92]:
data.loc[['California','Florida','Texas'], ['pop']]

Unnamed: 0,pop
California,38332521
Florida,19552860
Texas,26448193


##  Operating on Data in Pandas

### Ufuncs: Index Preservation

In [93]:
import pandas as pd
import numpy as np

In [94]:
ser = pd.Series(np.random.randint(0, 10, 4))
ser

0    4
1    6
2    2
3    6
dtype: int32

In [95]:
df = pd.DataFrame(np.random.randint(0, 10, (3, 4)),
columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,5,7,8,2
1,7,3,9,1
2,5,1,7,8


In [96]:
np.exp(ser)

0     54.598150
1    403.428793
2      7.389056
3    403.428793
dtype: float64

In [97]:
np.sin(df*np.e)

Unnamed: 0,A,B,C,D
0,0.854734,0.177472,0.242453,-0.749046
1,0.177472,0.955081,-0.619578,0.410781
2,0.854734,0.410781,0.177472,0.242453


### UFuncs: Index Alignment

In [98]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
 'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
 'New York': 19651127}, name='population')


In [99]:
population/area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [100]:
area.index|population.index # union

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [101]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B


0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [102]:
A.add(B,fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

### Index alignment in DataFrame

In [103]:
A = pd.DataFrame(np.random.randint(0, 20, (2, 2)),
 columns=list('AB'))
A

Unnamed: 0,A,B
0,7,6
1,16,4


In [104]:
B = pd.DataFrame(np.random.randint(0, 10, (3, 3)),
 columns=list('BAC'))
B

Unnamed: 0,B,A,C
0,7,7,2
1,2,4,2
2,2,4,0


In [105]:
A+B

Unnamed: 0,A,B,C
0,14.0,13.0,
1,20.0,6.0,
2,,,


In [106]:
A.add(B)

Unnamed: 0,A,B,C
0,14.0,13.0,
1,20.0,6.0,
2,,,


In [107]:
A.multiply(B)

Unnamed: 0,A,B,C
0,49.0,42.0,
1,64.0,8.0,
2,,,


In [108]:
A*B

Unnamed: 0,A,B,C
0,49.0,42.0,
1,64.0,8.0,
2,,,


In [109]:
fill=A.stack().mean()
A.multiply(B,fill_value=fill)

Unnamed: 0,A,B,C
0,49.0,42.0,16.5
1,64.0,8.0,16.5
2,33.0,16.5,0.0


In [110]:
B.multiply(A,fill_value=fill)

Unnamed: 0,A,B,C
0,49.0,42.0,16.5
1,64.0,8.0,16.5
2,33.0,16.5,0.0


In [111]:
A.multiply(B)

Unnamed: 0,A,B,C
0,49.0,42.0,
1,64.0,8.0,
2,,,


In [112]:
#   + add()
#   - sub(), subtract()
#   * mul(), multiply()
#   / truediv(), div(), divide()
#   // floordiv()
#   % mod()
#   ** pow()

## Ufuncs: Operations Between DataFrame and Series

In [113]:
A =np.random.randint(10, size=(3, 4))
A


array([[7, 2, 2, 5],
       [1, 4, 2, 2],
       [7, 2, 9, 7]])

In [114]:
A - A[0]

array([[ 0,  0,  0,  0],
       [-6,  2,  0, -3],
       [ 0,  0,  7,  2]])

In [115]:
df = pd.DataFrame(A, columns=list('QRST'))
print(df)
df - df.iloc[0]

   Q  R  S  T
0  7  2  2  5
1  1  4  2  2
2  7  2  9  7


Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-6,2,0,-3
2,0,0,7,2


In [116]:
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
0,5,0,0,3
1,-3,0,-2,-2
2,5,0,7,5


In [117]:
halfrow = df.iloc[0, ::2]
halfrow


Q    7
S    2
Name: 0, dtype: int32

In [118]:
df-halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,-6.0,,0.0,
2,0.0,,7.0,


## Handling Missing Data

### Operating on Null Values

####  isnull() ->Generate a Boolean mask indicating missing values
####  notnull() -> Opposite of isnull()
#### dropna() -> Return a filtered version of the data
#### fillna() -> Return a copy of the data with missing values filled or imputed


In [119]:
data = pd.Series([1, np.nan, 'hello', None])
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [120]:
data[data.isnull()]

1     NaN
3    None
dtype: object

In [121]:
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [122]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [123]:
data.dropna()

0        1
2    hello
dtype: object

In [124]:
df = pd.DataFrame([[1, np.nan, 2],
 [2, 3, 5],
 [np.nan, 4, 6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [125]:
df.isnull()

Unnamed: 0,0,1,2
0,False,True,False
1,False,False,False
2,True,False,False


In [126]:
df[df.isnull()]

Unnamed: 0,0,1,2
0,,,
1,,,
2,,,


In [127]:
df[df.notnull()]

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [128]:
data.dropna()

0        1
2    hello
dtype: object

In [129]:
df = pd.DataFrame([[1, np.nan, 2],[2, 3, 5],[np.nan, 4, 6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [130]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [131]:
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [132]:
df[3] = np.nan
df


Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [133]:
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [134]:
df.dropna(axis='columns', how='any')

Unnamed: 0,2
0,2
1,5
2,6


In [135]:
df.dropna(axis='rows', thresh=3)


Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [136]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [137]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [138]:
data.fillna(method='ffill')

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [139]:
data.fillna(method='bfill')

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [140]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [141]:
df.fillna(method='ffill', axis=0)

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,2.0,4.0,6,


In [142]:
df.fillna(method='ffill', axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


In [143]:
df.fillna(method='bfill', axis=1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,2.0,
1,2.0,3.0,5.0,
2,4.0,4.0,6.0,


In [144]:
df.fillna(method='bfill', axis=0)

Unnamed: 0,0,1,2,3
0,1.0,3.0,2,
1,2.0,3.0,5,
2,,4.0,6,


## Hierarchical Indexing

In [145]:
index = [('California', 2000), ('California', 2010),
 ('New York', 2000), ('New York', 2010),
 ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
 18976457, 19378102,
 20851820, 25145561]
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [146]:
pop[('California', 2010):('Texas', 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [147]:
index=pd.MultiIndex.from_tuples(index)

In [148]:
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [149]:
pop=pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [150]:
pop[:, (2000)]

California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [151]:
pop[pop > 22000000]

California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [152]:
pop.loc['California':'New York',2000]


California  2000    33871648
New York    2000    18976457
dtype: int64

In [153]:
pop.loc[:,2000]

California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [154]:
pop[['California', 'Texas']]

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

### MultiIndex as extra dimension

In [155]:
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [156]:
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [157]:
pop_df = pd.DataFrame({'total': pop,
 'under18': [9267089, 9284094,
 4687374, 4318033,
 5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [158]:
pop_df.unstack()

Unnamed: 0_level_0,total,total,under18,under18
Unnamed: 0_level_1,2000,2010,2000,2010
California,33871648,37253956,9267089,9284094
New York,18976457,19378102,4687374,4318033
Texas,20851820,25145561,5906301,6879014


In [159]:
f_u18 = pop_df['under18'] / pop_df['total']
print(f_u18)
f_u18.unstack()


California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64


Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


## Methods of MultiIndex Creation

In [160]:
df = pd.DataFrame(np.random.rand(4, 2),index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],columns=['data1', 'data2'])
df


Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.200221,0.021821
a,2,0.136787,0.526751
b,1,0.549634,0.823635
b,2,0.645127,0.813493


In [161]:
data = {('California', 2000): 33871648,
 ('California', 2010): 37253956,
 ('Texas', 2000): 20851820,
 ('Texas', 2010): 25145561,
 ('New York', 2000): 18976457,
 ('New York', 2010): 19378102}
pd.Series(data)


California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

In [162]:
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [163]:
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [164]:
 pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [165]:
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
 codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [166]:
pd.MultiIndex.from_frame(pd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'],['NJ', 'Temp'], ['NJ', 'Precip']]))

MultiIndex([('HI',   'Temp'),
            ('HI', 'Precip'),
            ('NJ',   'Temp'),
            ('NJ', 'Precip')],
           names=[0, 1])

### MultiIndex level names

In [167]:
pop.index.names = ['state', 'year']
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

### MultiIndex for columns

In [168]:
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
 names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
 names=['subject', 'type'])
# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,40.0,37.0,45.0,38.7,25.0,37.4
2013,2,12.0,36.7,42.0,35.5,49.0,37.6
2014,1,14.0,36.9,28.0,36.6,54.0,37.3
2014,2,28.0,38.3,55.0,36.6,33.0,36.5


In [169]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,45.0,38.7
2013,2,42.0,35.5
2014,1,28.0,36.6
2014,2,55.0,36.6


## Indexing and Slicing a MultiIndex

### Multiply indexed Series

In [170]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [171]:
 pop['California']

year
2000    33871648
2010    37253956
dtype: int64

In [172]:
 pop['California', 2000]

33871648

In [173]:
pop['California':'New York']

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [174]:
pop[:,2000]

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

In [175]:
pop[pop > 22000000]

state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [176]:
pop[['California', 'Texas']]

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

In [177]:
pop.loc['California',2000]

33871648

In [178]:
pop.loc['California':'New York',2000]

state       year
California  2000    33871648
New York    2000    18976457
dtype: int64

In [179]:
pop.loc[pop > 22000000,2000]

state       year
California  2000    33871648
dtype: int64

In [180]:
pop.loc[['California', 'Texas'],2000]

state       year
California  2000    33871648
Texas       2000    20851820
dtype: int64

In [181]:
pop.iloc[1]

37253956

In [182]:
pop.iloc[1:4]

state       year
California  2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [183]:
pop.iloc[[1,2]]

state       year
California  2010    37253956
New York    2000    18976457
dtype: int64

### Multiply indexed DataFrames

In [184]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,40.0,37.0,45.0,38.7,25.0,37.4
2013,2,12.0,36.7,42.0,35.5,49.0,37.6
2014,1,14.0,36.9,28.0,36.6,54.0,37.3
2014,2,28.0,38.3,55.0,36.6,33.0,36.5


In [185]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,45.0,38.7
2013,2,42.0,35.5
2014,1,28.0,36.6
2014,2,55.0,36.6


In [186]:
health_data['Guido', 'HR']

year  visit
2013  1        45.0
      2        42.0
2014  1        28.0
      2        55.0
Name: (Guido, HR), dtype: float64

In [187]:
health_data.iloc[:2, :2]


Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,40.0,37.0
2013,2,12.0,36.7


In [188]:
health_data.loc[:, ('Bob', 'HR')]

year  visit
2013  1        40.0
      2        12.0
2014  1        14.0
      2        28.0
Name: (Bob, HR), dtype: float64

In [189]:
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,40.0,45.0,25.0
2014,1,14.0,28.0,54.0


## Rearranging Multi-Indices

### Sorted and unsorted indices


In [190]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data


char  int
a     1      0.835341
      2      0.117137
c     1      0.594734
      2      0.474564
b     1      0.435327
      2      0.746242
dtype: float64

In [191]:
data = data.sort_index()
data


char  int
a     1      0.835341
      2      0.117137
b     1      0.435327
      2      0.746242
c     1      0.594734
      2      0.474564
dtype: float64

In [192]:
data['a':'b']

char  int
a     1      0.835341
      2      0.117137
b     1      0.435327
      2      0.746242
dtype: float64

### Stacking and unstacking indices

In [193]:
pop.unstack(level=0)

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [194]:
pop.unstack(level=1)

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [195]:
pop.unstack().stack()

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [196]:
pop.unstack(level=0).stack()

year  state     
2000  California    33871648
      New York      18976457
      Texas         20851820
2010  California    37253956
      New York      19378102
      Texas         25145561
dtype: int64

### Index setting and resetting

In [197]:
pop_flat = pop.reset_index(name='population')
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [198]:
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


## Data Aggregations on Multi-Indices

In [199]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,40.0,37.0,45.0,38.7,25.0,37.4
2013,2,12.0,36.7,42.0,35.5,49.0,37.6
2014,1,14.0,36.9,28.0,36.6,54.0,37.3
2014,2,28.0,38.3,55.0,36.6,33.0,36.5


In [200]:
data_mean = health_data.mean(level='year')
data_mean


subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,26.0,36.85,43.5,37.1,37.0,37.5
2014,21.0,37.6,41.5,36.6,43.5,36.9


In [201]:
data_mean = health_data.mean(level='subject',axis=1 )
data_mean


Unnamed: 0_level_0,subject,Bob,Guido,Sue
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,1,38.5,41.85,31.2
2013,2,24.35,38.75,43.3
2014,1,25.45,32.3,45.65
2014,2,33.15,45.8,34.75


In [202]:
data_mean = health_data.mean(level='year')
data_mean.mean(axis=1, level='type')

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,35.5,37.15
2014,35.333333,37.033333


## Combining Datasets: Concat and Append

In [203]:
data1=pd.DataFrame({"A":['A0',"A1"],"B":['B0','B1'],'C':['C0',"C1"]})
data2=pd.DataFrame({"C":['C0',"C1"],"D":['D0','D1']})
data1


Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1


In [204]:
data2

Unnamed: 0,C,D
0,C0,D0
1,C1,D1


In [205]:
print(data1); print(data2)
merge=pd.concat([data1, data2] ,axis=0,ignore_index=True)

    A   B   C
0  A0  B0  C0
1  A1  B1  C1
    C   D
0  C0  D0
1  C1  D1


In [206]:
merge

Unnamed: 0,A,B,C,D
0,A0,B0,C0,
1,A1,B1,C1,
2,,,C0,D0
3,,,C1,D1


In [207]:
merge=pd.concat([data1, data2] ,axis=0,ignore_index=True,join='inner')

In [208]:
merge


Unnamed: 0,C
0,C0
1,C1
2,C0
3,C1


In [209]:
<h>hello</h>

SyntaxError: invalid syntax (<ipython-input-209-21ac6348b3db>, line 1)