In [1]:
import os

import pandas as pd

In [2]:
# Location of the parks CSV. Set the PARKS_CSV environment variable to point at your own
# copy of the file; the fallback is the path on the original author's machine.
parks_csv_path = os.environ.get('PARKS_CSV', '/home/saumitra/Desktop/kaggle/parks.csv')
# Use the 'Park Code' column as the row index so rows can be looked up by park code.
df = pd.read_csv(parks_csv_path, index_col='Park Code')

In [3]:
# Preview the first five rows to check the load and the 'Park Code' index.
df.head()

Unnamed: 0_level_0,Park Name,State,Acres,Latitude,Longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ACAD,Acadia National Park,ME,47390,44.35,-68.21
ARCH,Arches National Park,UT,76519,38.68,-109.57
BADL,Badlands National Park,SD,242756,43.75,-102.5
BIBE,Big Bend National Park,TX,801163,29.25,-103.25
BISC,Biscayne National Park,FL,172924,25.65,-80.08


**Indexing rows**

In [4]:
# The simplest way to access a row is to pass its integer position to the .iloc indexer.
# Positions are zero-based, just like list indexes, so 2 selects the third row.
df.iloc[2]

Park Name    Badlands National Park
State                            SD
Acres                        242756
Latitude                      43.75
Longitude                    -102.5
Name: BADL, dtype: object

In [5]:
# The other main approach is to pass a label from the dataframe's index to the .loc indexer.
# 'BADL' is one of the 'Park Code' values that were made the index when the CSV was read.
df.loc['BADL']

Park Name    Badlands National Park
State                            SD
Acres                        242756
Latitude                      43.75
Longitude                    -102.5
Name: BADL, dtype: object

In [6]:
# Indexing multiple rows: pass a list of index labels to .loc.
# The rows come back in the order the labels are listed, not in file order.
df.loc[['BADL', 'ARCH', 'ACAD']]

Unnamed: 0_level_0,Park Name,State,Acres,Latitude,Longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BADL,Badlands National Park,SD,242756,43.75,-102.5
ARCH,Arches National Park,UT,76519,38.68,-109.57
ACAD,Acadia National Park,ME,47390,44.35,-68.21


In [7]:
# Positional counterpart: a list of integer positions passed to .iloc.
# Positions 2, 1, 0 select the third, second and first rows, in that order.
df.iloc[[2,1,0]]

Unnamed: 0_level_0,Park Name,State,Acres,Latitude,Longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BADL,Badlands National Park,SD,242756,43.75,-102.5
ARCH,Arches National Park,UT,76519,38.68,-109.57
ACAD,Acadia National Park,ME,47390,44.35,-68.21


In [8]:
# A slice of positions selects a contiguous run of rows. As with list slicing the end
# is exclusive, so 4:10 returns the six rows at positions 4 through 9.
df.iloc[4:10]

Unnamed: 0_level_0,Park Name,State,Acres,Latitude,Longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BISC,Biscayne National Park,FL,172924,25.65,-80.08
BLCA,Black Canyon of the Gunnison National Park,CO,32950,38.57,-107.72
BRCA,Bryce Canyon National Park,UT,35835,37.57,-112.18
CANY,Canyonlands National Park,UT,337598,38.2,-109.93
CARE,Capitol Reef National Park,UT,241904,38.2,-111.17
CAVE,Carlsbad Caverns National Park,NM,46766,32.17,-104.44


**Indexing columns**

In [9]:
# Square brackets with a column name return that column as a Series,
# which keeps the dataframe's 'Park Code' index.
df['State'].head(3)

Park Code
ACAD    ME
ARCH    UT
BADL    SD
Name: State, dtype: object

You can also access a single column as if it were an attribute of the dataframe, but only if the name has no 
spaces, uses only basic characters, and doesn't share a name with a dataframe method. So, df.State works:

In [10]:
# Attribute-style access: returns the same Series as df['State'].
df.State.head(3)

Park Code
ACAD    ME
ARCH    UT
BADL    SD
Name: State, dtype: object

In [11]:
# ...but a name containing a space, such as Park Code, fails: Python cannot parse the
# space inside an attribute name, so this cell is a deliberate SyntaxError.
# NOTE(review): 'Park Code' was made the index by read_csv(index_col=...), so it is not a
# column here; a spaced column name such as 'Park Name' would show the same failure.
df.Park Code.head(3)

SyntaxError: invalid syntax (<ipython-input-11-50f813f3f9f8>, line 2)

It's a good practice to clean your column names to prevent this sort of error. I'll use a very short cleaning
function here since the names don't have any odd characters. By convention, the names should also be converted 
to lower case. Pandas is case sensitive, so future calls to all of the columns will need to be updated.

In [12]:
# Normalise the column labels to snake_case: lower-case each one and swap spaces for
# underscores. The index name ('Park Code') is not part of df.columns and is left as is.
df.columns = df.columns.map(lambda label: label.lower().replace(' ', '_'))
# Show the cleaned labels.
df.columns

Index(['park_name', 'state', 'acres', 'latitude', 'longitude'], dtype='object')

In [13]:
# Indexing columns and rows together: pick the columns by name, then take the rows at
# positions 3 through 6 with .iloc so that the positional slice is explicit.
df[['state','acres']].iloc[3:7]

Unnamed: 0_level_0,state,acres
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1
BIBE,TX,801163
BISC,FL,172924
BLCA,CO,32950
BRCA,UT,35835


**Indexing scalar values**

In [14]:
# Passing a one-element list to .iloc returns a one-row Series that still carries
# the index label.
df.state.iloc[[2]]

Park Code
BADL    SD
Name: state, dtype: object

In [15]:
# Passing a bare integer to .iloc returns the scalar value stored at that position.
df.state.iloc[2]

'SD'

**Subsetting**

In [16]:
# Comparing a column with a value yields a boolean Series: one True/False per row.
(df.state=='UT').head(5)

Park Code
ACAD    False
ARCH     True
BADL    False
BIBE    False
BISC    False
Name: state, dtype: bool

In [17]:
# Use the boolean Series as a row mask: only the rows where state equals 'UT' are kept.
df.loc[df['state'] == 'UT']

Unnamed: 0_level_0,park_name,state,acres,latitude,longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ARCH,Arches National Park,UT,76519,38.68,-109.57
BRCA,Bryce Canyon National Park,UT,35835,37.57,-112.18
CANY,Canyonlands National Park,UT,337598,38.2,-109.93
CARE,Capitol Reef National Park,UT,241904,38.2,-111.17
ZION,Zion National Park,UT,146598,37.3,-113.05


Some of the logical operators are different:

1. ~ replaces not
2. | replaces or
3. & replaces and
When you combine multiple conditions, each one must be wrapped in its own parentheses, because `|` and `&` bind more tightly than comparison operators such as `>` and `==`. For example:

In [18]:
# Combine two row conditions with | (element-wise "or"). Each comparison sits in its own
# parentheses so that it is evaluated before | is applied.
df.loc[(df['latitude'] > 30) | (df['acres'] > 10 ** 6)].head()

Unnamed: 0_level_0,park_name,state,acres,latitude,longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ACAD,Acadia National Park,ME,47390,44.35,-68.21
ARCH,Arches National Park,UT,76519,38.68,-109.57
BADL,Badlands National Park,SD,242756,43.75,-102.5
BLCA,Black Canyon of the Gunnison National Park,CO,32950,38.57,-107.72
BRCA,Bryce Canyon National Park,UT,35835,37.57,-112.18


In [19]:
# Row filters can be arbitrary expressions, including lambdas: split each park name into
# words and keep the rows whose name has exactly three of them.
df.loc[df['park_name'].str.split().map(lambda words: len(words) == 3)].head(3)

Unnamed: 0_level_0,park_name,state,acres,latitude,longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ACAD,Acadia National Park,ME,47390,44.35,-68.21
ARCH,Arches National Park,UT,76519,38.68,-109.57
BADL,Badlands National Park,SD,242756,43.75,-102.5


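#Key Companion Methods: isin and isnull
#These methods make some very common tasks much easier and faster. isin tests whether each value appears in a given
#list, and isnull marks missing values; both return a boolean mask. Suppose we want to find all parks on the West Coast.
#isin makes that simple: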

In [20]:
# Keep the parks whose state is any of the three West Coast states.
df.loc[df['state'].isin(['WA', 'OR', 'CA'])]

Unnamed: 0_level_0,park_name,state,acres,latitude,longitude
Park Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CHIS,Channel Islands National Park,CA,249561,34.01,-119.42
CRLA,Crater Lake National Park,OR,183224,42.94,-122.1
JOTR,Joshua Tree National Park,CA,789745,33.79,-115.9
LAVO,Lassen Volcanic National Park,CA,106372,40.49,-121.51
MORA,Mount Rainier National Park,WA,235625,46.85,-121.75
NOCA,North Cascades National Park,WA,504781,48.7,-121.2
OLYM,Olympic National Park,WA,922651,47.97,-123.5
PINN,Pinnacles National Park,CA,26606,36.48,-121.16
REDW,Redwood National Park,CA,112512,41.3,-124.0
SEKI,Sequoia and Kings Canyon National Parks,CA,865952,36.43,-118.68


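#.at and .iat: like .loc and .iloc, but much faster in exchange for only accessing a single scalar value (one row and one column at a time).
#.get: like .loc on a single column/Series, but returns a default value (None unless you supply one) instead of raising a KeyError when the key doesn't exist in the index.
#.take: like .iloc with a list of integer positions; its axis argument chooses whether the positions refer to rows or columns.
#.query: filters rows using a boolean expression written as a string, much like a SQL WHERE clause, e.g. df.query("state == 'UT' and acres > 100000").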