# DataFrame: change index, multi-index, index-hierarchy

In [2]:
import numpy as np
import pandas as pd

# return random numbers drawn from the "standard NORMAL distribution" (centered around 0)
from numpy.random import randn

# Seed NumPy's legacy global generator so the random values are identical on every run.
np.random.seed(101)
rnd_20 = randn(5, 4)  # 5 rows x 4 columns = 20 draws

# Wrap the array in a DataFrame labelled r1..r5 (rows) and c1..c4 (columns).
fd = pd.DataFrame(
    rnd_20,
    index=[f"r{row}" for row in range(1, 6)],
    columns=[f"c{col}" for col in range(1, 5)],
)

## ------------    changing index    ------------
### reset_index

In [2]:
# show the frame as built: the row labels r1..r5 are its index
fd

Unnamed: 0,c1,c2,c3,c4
r1,2.70685,0.628133,0.907969,0.503826
r2,0.651118,-0.319318,-0.848077,0.605965
r3,-2.018168,0.740122,0.528813,-0.589001
r4,0.188695,-0.758872,-0.933237,0.955057
r5,0.190794,1.978757,2.605967,0.683509


In [3]:
# reset the index to the default integer index
fd.reset_index()
# notice: the old index labels "r1", "r2", "r3", "r4", "r5" moved into a column named "index"
# and the actual index is now the default 0, 1, 2, 3, 4

Unnamed: 0,index,c1,c2,c3,c4
0,r1,2.70685,0.628133,0.907969,0.503826
1,r2,0.651118,-0.319318,-0.848077,0.605965
2,r3,-2.018168,0.740122,0.528813,-0.589001
3,r4,0.188695,-0.758872,-0.933237,0.955057
4,r5,0.190794,1.978757,2.605967,0.683509


In [4]:
# Note: reset_index() does not happen "in place" -- it returned a new DataFrame,
    # so fd itself still has its original r1..r5 index
fd
# to keep the change, use
# fd.reset_index(inplace=True)

Unnamed: 0,c1,c2,c3,c4
r1,2.70685,0.628133,0.907969,0.503826
r2,0.651118,-0.319318,-0.848077,0.605965
r3,-2.018168,0.740122,0.528813,-0.589001
r4,0.188695,-0.758872,-0.933237,0.955057
r5,0.190794,1.978757,2.605967,0.683509


### set_index

In [1]:
# Handy trick to create a list of strings:
    # call split() on a single string,
    # so there is no need to type the ',' or the quotes
newind = "CA NY WY OR CO".split()   # no argument: split on whitespace
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [3]:
# insert "newind" into our DataFrame as a new column named "States";
    # notice the list length must match the number of rows (5 here)
fd['States'] = newind
fd

Unnamed: 0,c1,c2,c3,c4,States
r1,2.70685,0.628133,0.907969,0.503826,CA
r2,0.651118,-0.319318,-0.848077,0.605965,NY
r3,-2.018168,0.740122,0.528813,-0.589001,WY
r4,0.188695,-0.758872,-0.933237,0.955057,OR
r5,0.190794,1.978757,2.605967,0.683509,CO


#### column as index: setting a column as index
    # instead of resetting we want the column "States" to be the index of our DataFrame
    # use set_index() instead of reset_index()


In [4]:
fd.set_index("States")  # the "States" column replaces the old index
# note: to keep this change we need to apply 'inplace' (or reassign the result)
# unlike reset_index, the old index (r1..r5) is not kept as a column -- its information is lost

Unnamed: 0_level_0,c1,c2,c3,c4
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


___

## --------    multi-index    --------

In [1]:
# multi-index DataFrame
import numpy as np
import pandas as pd

# Two parallel lists: the outer (group) label and the inner (number) label for each row.
outside = "G1 G1 G1 G2 G2 G2".split()
inside = [1, 2, 3] * 2

In [2]:
# pair the i-th outer label with the i-th inner label -> a list of (group, num) tuples
hier_index = [(group, num) for group, num in zip(outside, inside)]
hier_index

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [3]:
# creating a "Multi-index"
# "pd.MultiIndex.from_tuples" creates a multi-index from a "list of tuples"
# note: hier_index is rebound here from the list of tuples to the MultiIndex object
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [4]:
# creating a DataFrame using the Multi-Index above
df = pd.DataFrame(np.random.randn(6, 2), index=hier_index, columns=['A', 'B'])
df
# np.random.randn(6, 2) creates a 6x2 matrix of 12 random numbers
# NOTE(review): no seed is set in this cell, so the values presumably change on every run
# you'll notice the 2 levels of the index
    # 1st level ['G1', 'G2']
    # 2nd level [1, 2, 3] for each of G1 and G2
    # so the following is called a "Multi-level index" or "Index-Hierarchy"

Unnamed: 0,Unnamed: 1,A,B
G1,1,1.56466,0.109311
G1,2,0.602715,-0.517064
G1,3,0.955557,-0.475207
G2,1,-0.50663,0.614177
G2,2,0.544845,0.74083
G2,3,0.212075,1.434821


### Accessing data

In [6]:
# working with a multi-index DataFrame:
    # accessing data from a multi-level indexed DataFrame
    # we'll use row selection first:
    # start from the outermost index level and proceed deeper

# Grab a label from the outermost index level; we get a sub-DataFrame
df.loc['G1']

Unnamed: 0,A,B
1,1.56466,0.109311
2,0.602715,-0.517064
3,0.955557,-0.475207


In [17]:
# then we use the next level of the index
df.loc['G1'].loc[1]     # returns the row labeled 1 within 'G1' as a Series
# NOTE(review): the saved output of this cell does not match the G1 / 1 row of the frame
# displayed earlier -- presumably it comes from a different (unseeded) run; re-run to refresh

A    1.904176
B   -0.513356
Name: 1, dtype: float64

In [5]:
# accessing a single element, using both row-index levels and the column label:
# a single .loc call takes the row key as an (outer, inner) tuple plus the column label.
# This selects the same value as the step-by-step chain df.loc['G2'].loc[2]['B']
# (row -> row -> column), but in one lookup instead of three chained ones.
df.loc[('G2', 2), 'B']

0.7408299983432391

### naming multi-level index

In [7]:
df.index.names  # the levels have no names yet (not displayed: only a cell's last expression is shown)
df.index.names  = ["Groups", "Num"]  # outer level -> "Groups", inner level -> "Num"
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,1.56466,0.109311
G1,2,0.602715,-0.517064
G1,3,0.955557,-0.475207
G2,1,-0.50663,0.614177
G2,2,0.544845,0.74083
G2,3,0.212075,1.434821


### ------    cross-section (xs)    ------

In [8]:
# a cross-section returns selected row(s) or column(s) of a multi-indexed DataFrame
# here: everything under 'G1'
df.loc['G1']    # using 'loc'

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.56466,0.109311
2,0.602715,-0.517064
3,0.955557,-0.475207


In [9]:
df.xs('G1')     # same result using 'xs'; notice xs is a method, so '()' is used instead of '[]'

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.56466,0.109311
2,0.602715,-0.517064
3,0.955557,-0.475207


#### xs can skip levels to get deep into multi-level-index
    # 'xs' is mostly used in data-visualization projects, to grab stacked-indexed data
    # let's grab the rows with index Num=1 from both the 'G1' and 'G2' groups
    # that is a little bit tricky using 'loc'

In [10]:
df.xs(1, level="Num")
# the above gets every row whose label is 1 in the index level named "Num"

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,1.56466,0.109311
G2,-0.50663,0.614177


___

## ------------    missing data    ------------
    There are methods that can deal with missing data
    
    # pandas will automatically fill in missing data with the "NaN" value. 
    
    # we can do either:
        # drop the NaN 
        # fill the NaN 

In [1]:
import numpy as np
import pandas as pd

# Build a DataFrame from a dict, just as a "Series" can be built from a dict:
#   - the keys 'A', 'B', 'C' become the columns
#   - each value is the list of entries for that column
#   - np.nan marks a missing value
dCt = {
    'A': [1, 2, np.nan],
    'B': [5, np.nan, np.nan],
    'C': [1, 2, 3],
}

# Passing the dictionary as the data argument of pd.DataFrame() creates the frame.
df2 = pd.DataFrame(data=dCt)
# notice: only the 1st row and the 3rd column have no missing values
df2

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


### ----  dropna()  ----

In [3]:
# dropna() drops every row (or column, depending on axis) that contains 1 or more NaN values
# axis = 0, row-wise, is the default
df2.dropna()    # drops rows

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [4]:
# drop columns instead: axis=1 removes every column that contains a NaN
df2.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


### threshold for non-Na values

In [5]:
# dropna can also take a "threshold" of non-NaN values, via the parameter "thresh"
    # "thresh = 2" keeps all rows / columns that have "at least 2" non-NaN values
df2.dropna(thresh=2)    # keep rows that have "at least 2" non-NaN values

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [7]:
df2.dropna(thresh=3)    # keep rows that have "at least 3" non-NaN values

Unnamed: 0,A,B,C
0,1.0,5.0,1



### ----  fillna()  ----
    # fills the NaN with a given value


In [2]:
# the following fills every NaN with a string
df2.fillna("howdy")

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,howdy,2
2,howdy,howdy,3


In [4]:
df2.fillna(value = "filled data")   # same kind of call, passing the fill value by keyword

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,filled data,2
2,filled data,filled data,3


In [6]:
# the following fills every NaN with a given number
df2.fillna(9.45)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,9.45,2
2,9.45,9.45,3


#### following fills with the mean of the column
However, a more appropriate statistical method than the "mean" can be used to fill the missing values.

In [5]:
# select a column, then call "fillna" with the mean of that column
# (here the mean comes from the non-NaN entries: (1 + 2) / 2 = 1.5)
df2["A"].fillna(value = df2["A"].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64