### A Revision to Pandas DataFrames


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Row and column labels are named up front so the constructor call reads cleanly.
column_labels = ['Country', 'Capital', 'Area', 'Population']
row_labels = ['BR', 'RU', 'IN', 'CH', 'SA']

# One inner list per country, in the same order as column_labels.
# NOTE(review): 'New Dehli' is a misspelling of 'New Delhi'; it is left as-is
# here because the rendered outputs further down reproduce the same string.
# Presumably Area and Population are in millions (km^2 / people) - the
# notebook does not state the units, so confirm before relying on them.
nested_list = [
    ["Brasil", 'Brasilia', 8.516, 200.40],
    ['Russia', 'Moscow', 17.100, 143.50],
    ['India', 'New Dehli', 3.286, 1252.00],
    ['China', 'Beijing', 9.597, 1357.00],
    ['South Africa', 'Pretoria', 1.221, 52.98],
]

# Build the frame: rows come from nested_list, labelled by the two-letter codes.
df = pd.DataFrame(nested_list, index=row_labels, columns=column_labels)

df

Unnamed: 0,Country,Capital,Area,Population
BR,Brasil,Brasilia,8.516,200.4
RU,Russia,Moscow,17.1,143.5
IN,India,New Dehli,3.286,1252.0
CH,China,Beijing,9.597,1357.0
SA,South Africa,Pretoria,1.221,52.98


In [3]:
# Two parallel label lists: position i of each list forms one (outer, inner) pair.
outside = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
inside = [1, 2, 3, 1, 2, 3]

# Build the two-level index directly from the parallel lists; this yields the
# same (outer, inner) pairs as zipping them into tuples first. No level names
# are given, so both levels are unnamed.
hier_index = pd.MultiIndex.from_arrays([outside, inside])

In [4]:
# NOTE(review): this rebinds `df`, replacing the countries frame built earlier
# in the notebook. Later cells that index df['Country'] only work if that
# earlier cell is re-run after this section; giving this frame its own name
# would keep the notebook runnable from top to bottom.
# No random seed is set in the visible cells, so the values change on every run.
# 6 x 2 frame of standard-normal draws, indexed by the two-level hier_index.
df = pd.DataFrame(np.random.randn(6,2),index=hier_index,columns=['A','B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-2.143664,-0.838227
G1,2,-0.825386,-0.530053
G1,3,-0.241616,1.233855
G2,1,-0.920968,1.112945
G2,2,0.068487,0.583979
G2,3,1.507705,-0.256063


In [5]:
# Names of the index levels; both are None because the MultiIndex was built
# without any level names.
df.index.names

FrozenList([None, None])

In [7]:
# Name the outer and inner index levels. Assigning to .names updates the
# existing index in place rather than producing a new frame.
df.index.names = ['Group','Num']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-2.143664,-0.838227
G1,2,-0.825386,-0.530053
G1,3,-0.241616,1.233855
G2,1,-0.920968,1.112945
G2,2,0.068487,0.583979
G2,3,1.507705,-0.256063


In [8]:
# Cross-section on the outermost level: returns the rows under 'G1' with that
# level dropped, leaving 'Num' as the index.
df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-2.143664,-0.838227
2,-0.825386,-0.530053
3,-0.241616,1.233855


In [11]:
# A single column label in single brackets returns that column as a Series.
type(df['Country'])

pandas.core.series.Series

In [7]:
# Passing a list of labels (double brackets) returns a DataFrame, even when
# the list holds only one column.
type(df[['Country']])

pandas.core.frame.DataFrame

In [8]:
# Select several columns at once; the result keeps the order given in the list.
df[['Country', 'Population']]

Unnamed: 0,Country,Population
BR,Brasil,200.4
RU,Russia,143.5
IN,India,1252.0
CH,China,1357.0
SA,South Africa,52.98


In [15]:
# An integer slice inside [] selects rows by position (stop is exclusive),
# so this returns only the first row.
df[0:1]

Unnamed: 0,Country,Capital,Area,Population
BR,Brasil,Brasilia,8.516,200.4


In [17]:
# Same positional row slice: position 3 only, i.e. the fourth row.
df[3:4]

Unnamed: 0,Country,Capital,Area,Population
CH,China,Beijing,9.597,1357.0


#### Selecting data with loc and iloc

In [18]:
# .loc looks rows up by index label; a single label returns that row as a
# Series named after the label.
df.loc['IN']

Country           India
Capital       New Dehli
Area              3.286
Population         1252
Name: IN, dtype: object

In [32]:
# A list of labels returns a DataFrame, with rows in the order the labels
# are listed (not the order they appear in df).
df.loc[['IN', 'RU']]

Unnamed: 0,Country,Capital,Area,Population
IN,India,New Dehli,3.286,1252.0
RU,Russia,Moscow,17.1,143.5


In [24]:
# .iloc selects by integer position; wrapping the position in a list keeps
# the result a one-row DataFrame.
df.iloc[[4]]

Unnamed: 0,Country,Capital,Area,Population
SA,South Africa,Pretoria,1.221,52.98


In [27]:
# Row labels first, column labels second: .loc returns the labelled sub-table.
df.loc[['BR', 'RU'], ['Capital', 'Population']]

Unnamed: 0,Capital,Population
BR,Brasilia,200.4
RU,Moscow,143.5


In [36]:
# Comparing a column with a scalar is element-wise: the result is a boolean
# Series aligned to df's index, usable as a row mask.
is_large = df['Population'] > 200
is_large

BR     True
RU    False
IN     True
CH     True
SA    False
Name: Population, dtype: bool

In [37]:
# Boolean-mask indexing: keep only the rows where is_large is True.
df[is_large]

Unnamed: 0,Country,Capital,Area,Population
BR,Brasil,Brasilia,8.516,200.4
IN,India,New Dehli,3.286,1252.0
CH,China,Beijing,9.597,1357.0


#### Pandas logical operators

In [38]:
# Python's `and` asks each operand for a single True/False, but a comparison
# on a column produces a whole boolean Series, so pandas raises ValueError.
# The error is the point of this cell, so it is caught and printed: the
# message is still shown, and "Restart & Run All" can continue past it.
# `is_huge` is never bound on this path, so it is not displayed here.
try:
    is_huge = df['Population'] > 200 and df['Area'] > 9
except ValueError as err:
    print("ValueError:", err)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [39]:
# Build one boolean mask per condition, then combine them element-wise.
# `&` on two boolean Series gives the same result as np.logical_and; unlike
# `and`, it never asks a Series for a single truth value.
population_mask = df['Population'] > 200
area_mask = df['Area'] > 9
is_huge = population_mask & area_mask
is_huge

BR    False
RU    False
IN    False
CH     True
SA    False
dtype: bool

In [40]:
# Boolean-mask indexing: keep only the rows where is_huge is True.
df[is_huge]

Unnamed: 0,Country,Capital,Area,Population
CH,China,Beijing,9.597,1357.0


In [48]:
# Name each condition, combine them element-wise, then filter the rows.
below_population_cap = df['Population'] < 1000
below_area_cap = df['Area'] < 15
df[np.logical_and(below_population_cap, below_area_cap)]

Unnamed: 0,Country,Capital,Area,Population
BR,Brasil,Brasilia,8.516,200.4
SA,South Africa,Pretoria,1.221,52.98
