# Data Indexing and Selection

## Data Selection in Series - Part 1

How to access the data when Series acts as dictionary?

In [1]:
import pandas as pd

In [2]:
data = pd.Series([10, 20, 30, 40, 50], index = ['A', 'B', 'C', 'D', 'E'])
data

A    10
B    20
C    30
D    40
E    50
dtype: int64

In [3]:
data['A']

10

In [4]:
'A' in data

True

In [5]:
pd.Series.keys?

In [6]:
data.keys()

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [7]:
pd.Series.items?

In [8]:
print(data.items());

<zip object at 0x7f94b19fed00>


In [9]:
print(list(data.items()))

[('A', 10), ('B', 20), ('C', 30), ('D', 40), ('E', 50)]


In [10]:
data['A'] = 45
data

A    45
B    20
C    30
D    40
E    50
dtype: int64

## Data Selection in Series - Part 2

#### How to access the data when Series acts as one-dimensional array?

In [11]:
print(data)
print(data['A':'D'])

A    45
B    20
C    30
D    40
E    50
dtype: int64
A    45
B    20
C    30
D    40
dtype: int64


In [12]:
print(data)
print(data[0:4])

A    45
B    20
C    30
D    40
E    50
dtype: int64
A    45
B    20
C    30
D    40
dtype: int64


In [13]:
data[(data > 20) & (data < 50)]

A    45
C    30
D    40
dtype: int64

In [14]:
data[['A', 'E']]

A    45
E    50
dtype: int64

## Indexers loc and iloc

In [15]:
data

A    45
B    20
C    30
D    40
E    50
dtype: int64

In [16]:
data[1:3]

B    20
C    30
dtype: int64

In [17]:
data['B':'D']

B    20
C    30
D    40
dtype: int64

In [18]:
idata = pd.Series(['a', 'b', 'c', 'd', 'e', 'f'], index=[1, 3, 4, 5, 6, 7])
idata

1    a
3    b
4    c
5    d
6    e
7    f
dtype: object

In [19]:
idata[1]

'a'

In [20]:
idata[1:4]

3    b
4    c
5    d
dtype: object

In [21]:
pd.Series.loc?

In [22]:
idata.loc[1]

'a'

In [23]:
print(idata)
idata.loc[1:4]

1    a
3    b
4    c
5    d
6    e
7    f
dtype: object


1    a
3    b
4    c
dtype: object

In [24]:
idata.loc[1:6]

1    a
3    b
4    c
5    d
6    e
dtype: object

In [25]:
print(idata)
print(idata[3:6])        # implicit index without loc : also final index is excluded
print(idata.loc[3:6])    # explicit index with loc : also final index is included

1    a
3    b
4    c
5    d
6    e
7    f
dtype: object
5    d
6    e
7    f
dtype: object
3    b
4    c
5    d
6    e
dtype: object


In [26]:
pd.Series.iloc?

In [27]:
print(idata)
print(idata.iloc[1])     # implicit
print()
print(idata[1])          # this is explicit index which is equivalent to idata.loc[1]
print()
print(idata.loc[1])      # explicit

1    a
3    b
4    c
5    d
6    e
7    f
dtype: object
b

a

a


In [28]:
idata.iloc[1:4]

3    b
4    c
5    d
dtype: object

In [29]:
print(idata)
print(idata[3:6])         # implicit index without loc : also final index is excluded
print(idata.iloc[3:6])    # iloc works like implicit index so iloc is meant for implicit index preservation

1    a
3    b
4    c
5    d
6    e
7    f
dtype: object
5    d
6    e
7    f
dtype: object
5    d
6    e
7    f
dtype: object


In [30]:
# Without loc
# Make a Note: --> # explicit index when indexing
#              --> # implicit index when slicing
# With loc
# --> # The 'loc' attribute allows indexing and slicing that always references the explicit index.
# With iloc
# --> # The 'iloc' attribute on the other hand allows indexing and slicing that always references implicit Python-style index

## Data Selection in DataFrame

#### DataFrame as a dictionary

In [31]:
import pandas as pd

In [32]:
states_capitals = {'Karnataka':'Bangalore', 'Andrapradesh':'Hyderabad', 'Tamilnadu':'Chennai',
                   'Keral':'Thiruvanathapuram', 'Maharastra':'Mumbai'}

In [33]:
states_lang = {'Karnataka':'Kannada', 'Andrapradesh':'Telugu', 'Tamilnadu':'Tamil', 'Kerala':'Malayalam',
              'Maharastra':'hindi', 'Panjab':'Panjabi'}

In [34]:
data = pd.DataFrame({'capitals': states_capitals, 'language': states_lang})
data

Unnamed: 0,capitals,language
Karnataka,Bangalore,Kannada
Andrapradesh,Hyderabad,Telugu
Tamilnadu,Chennai,Tamil
Keral,Thiruvanathapuram,
Maharastra,Mumbai,hindi
Kerala,,Malayalam
Panjab,,Panjabi


In [35]:
data['capitals']

Karnataka               Bangalore
Andrapradesh            Hyderabad
Tamilnadu                 Chennai
Keral           Thiruvanathapuram
Maharastra                 Mumbai
Kerala                        NaN
Panjab                        NaN
Name: capitals, dtype: object

In [36]:
data.capitals

Karnataka               Bangalore
Andrapradesh            Hyderabad
Tamilnadu                 Chennai
Keral           Thiruvanathapuram
Maharastra                 Mumbai
Kerala                        NaN
Panjab                        NaN
Name: capitals, dtype: object

In [37]:
data['capitals'] is data.capitals

True

In [38]:
s_sub = pd.Series({'Pruthvi': 'Kannada', 'Pranam': 'Hindi', 'Pratham':'English', 'Pravera':'Maths', 'Prabu':'Science'})
total_m = pd.Series({'Pruthvi': 60, 'Pranam': 60, 'Pratham':60, 'Pravera':60, 'Prabu':60})
minf_m = pd.Series({'Pruthvi': 30, 'Pranam': 30, 'Pratham':30, 'Pravera':30, 'Prabu':30})
obt_m = pd.Series({'Pruthvi': 25, 'Pranam': 35, 'Pratham':40, 'Pravera':60, 'Prabu':55})

students_d = pd.DataFrame({'Sub' : s_sub, 'T_m' : total_m, 'Min_m' : minf_m, 'O_m' : obt_m})
students_d

Unnamed: 0,Sub,T_m,Min_m,O_m
Pruthvi,Kannada,60,30,25
Pranam,Hindi,60,30,35
Pratham,English,60,30,40
Pravera,Maths,60,30,60
Prabu,Science,60,30,55


In [39]:
students_d['score'] = students_d['O_m'] / students_d['T_m']
students_d

Unnamed: 0,Sub,T_m,Min_m,O_m,score
Pruthvi,Kannada,60,30,25,0.416667
Pranam,Hindi,60,30,35,0.583333
Pratham,English,60,30,40,0.666667
Pravera,Maths,60,30,60,1.0
Prabu,Science,60,30,55,0.916667


In [40]:
students_d['mp_score'] = students_d['Min_m'] / students_d['T_m']
students_d

Unnamed: 0,Sub,T_m,Min_m,O_m,score,mp_score
Pruthvi,Kannada,60,30,25,0.416667,0.5
Pranam,Hindi,60,30,35,0.583333,0.5
Pratham,English,60,30,40,0.666667,0.5
Pravera,Maths,60,30,60,1.0,0.5
Prabu,Science,60,30,55,0.916667,0.5


#### DataFrame as two-dimensional array

In [41]:
print(students_d)
students_d.values

             Sub  T_m  Min_m  O_m     score  mp_score
Pruthvi  Kannada   60     30   25  0.416667       0.5
Pranam     Hindi   60     30   35  0.583333       0.5
Pratham  English   60     30   40  0.666667       0.5
Pravera    Maths   60     30   60  1.000000       0.5
Prabu    Science   60     30   55  0.916667       0.5


array([['Kannada', 60, 30, 25, 0.4166666666666667, 0.5],
       ['Hindi', 60, 30, 35, 0.5833333333333334, 0.5],
       ['English', 60, 30, 40, 0.6666666666666666, 0.5],
       ['Maths', 60, 30, 60, 1.0, 0.5],
       ['Science', 60, 30, 55, 0.9166666666666666, 0.5]], dtype=object)

In [42]:
pd.DataFrame.T?

In [43]:
students_d.T

Unnamed: 0,Pruthvi,Pranam,Pratham,Pravera,Prabu
Sub,Kannada,Hindi,English,Maths,Science
T_m,60,60,60,60,60
Min_m,30,30,30,30,30
O_m,25,35,40,60,55
score,0.416667,0.583333,0.666667,1.0,0.916667
mp_score,0.5,0.5,0.5,0.5,0.5


In [44]:
students_d.values[0]

array(['Kannada', 60, 30, 25, 0.4166666666666667, 0.5], dtype=object)

In [45]:
students_d.values[1]

array(['Hindi', 60, 30, 35, 0.5833333333333334, 0.5], dtype=object)

In [46]:
students_d['score']

Pruthvi    0.416667
Pranam     0.583333
Pratham    0.666667
Pravera    1.000000
Prabu      0.916667
Name: score, dtype: float64

In [47]:
students_d['T_m']

Pruthvi    60
Pranam     60
Pratham    60
Pravera    60
Prabu      60
Name: T_m, dtype: int64

In [48]:
students_d

Unnamed: 0,Sub,T_m,Min_m,O_m,score,mp_score
Pruthvi,Kannada,60,30,25,0.416667,0.5
Pranam,Hindi,60,30,35,0.583333,0.5
Pratham,English,60,30,40,0.666667,0.5
Pravera,Maths,60,30,60,1.0,0.5
Prabu,Science,60,30,55,0.916667,0.5


In [49]:
students_d[1:3]

Unnamed: 0,Sub,T_m,Min_m,O_m,score,mp_score
Pranam,Hindi,60,30,35,0.583333,0.5
Pratham,English,60,30,40,0.666667,0.5


In [50]:
students_d['Pranam':'Pravera']

Unnamed: 0,Sub,T_m,Min_m,O_m,score,mp_score
Pranam,Hindi,60,30,35,0.583333,0.5
Pratham,English,60,30,40,0.666667,0.5
Pravera,Maths,60,30,60,1.0,0.5


#### Accessing DF Objects Using loc, iloc, and ix

In [51]:
 pd.DataFrame.loc?

In [52]:
pd.DataFrame.iloc?

In [53]:
 pd.DataFrame.ix?

Object `pd.DataFrame.ix` not found.


In [54]:
print(students_d)

             Sub  T_m  Min_m  O_m     score  mp_score
Pruthvi  Kannada   60     30   25  0.416667       0.5
Pranam     Hindi   60     30   35  0.583333       0.5
Pratham  English   60     30   40  0.666667       0.5
Pravera    Maths   60     30   60  1.000000       0.5
Prabu    Science   60     30   55  0.916667       0.5


In [55]:
students_d.iloc[0:3, 0:2]

Unnamed: 0,Sub,T_m
Pruthvi,Kannada,60
Pranam,Hindi,60
Pratham,English,60


In [56]:
print(students_d)
students_d.iloc[2:5, 2:]

             Sub  T_m  Min_m  O_m     score  mp_score
Pruthvi  Kannada   60     30   25  0.416667       0.5
Pranam     Hindi   60     30   35  0.583333       0.5
Pratham  English   60     30   40  0.666667       0.5
Pravera    Maths   60     30   60  1.000000       0.5
Prabu    Science   60     30   55  0.916667       0.5


Unnamed: 0,Min_m,O_m,score,mp_score
Pratham,30,40,0.666667,0.5
Pravera,30,60,1.0,0.5
Prabu,30,55,0.916667,0.5


In [57]:
print(students_d)
students_d.loc[:, :]

             Sub  T_m  Min_m  O_m     score  mp_score
Pruthvi  Kannada   60     30   25  0.416667       0.5
Pranam     Hindi   60     30   35  0.583333       0.5
Pratham  English   60     30   40  0.666667       0.5
Pravera    Maths   60     30   60  1.000000       0.5
Prabu    Science   60     30   55  0.916667       0.5


Unnamed: 0,Sub,T_m,Min_m,O_m,score,mp_score
Pruthvi,Kannada,60,30,25,0.416667,0.5
Pranam,Hindi,60,30,35,0.583333,0.5
Pratham,English,60,30,40,0.666667,0.5
Pravera,Maths,60,30,60,1.0,0.5
Prabu,Science,60,30,55,0.916667,0.5


In [58]:
print(students_d)
students_d.loc[:'Pravera', 'O_m':]

             Sub  T_m  Min_m  O_m     score  mp_score
Pruthvi  Kannada   60     30   25  0.416667       0.5
Pranam     Hindi   60     30   35  0.583333       0.5
Pratham  English   60     30   40  0.666667       0.5
Pravera    Maths   60     30   60  1.000000       0.5
Prabu    Science   60     30   55  0.916667       0.5


Unnamed: 0,O_m,score,mp_score
Pruthvi,25,0.416667,0.5
Pranam,35,0.583333,0.5
Pratham,40,0.666667,0.5
Pravera,60,1.0,0.5


In [59]:
print(students_d)
students_d.loc['Pranam':'Pravera', 'T_m':'score']

             Sub  T_m  Min_m  O_m     score  mp_score
Pruthvi  Kannada   60     30   25  0.416667       0.5
Pranam     Hindi   60     30   35  0.583333       0.5
Pratham  English   60     30   40  0.666667       0.5
Pravera    Maths   60     30   60  1.000000       0.5
Prabu    Science   60     30   55  0.916667       0.5


Unnamed: 0,T_m,Min_m,O_m,score
Pranam,60,30,35,0.583333
Pratham,60,30,40,0.666667
Pravera,60,30,60,1.0


In [60]:
students_d.ix[:3, 'T_m':'score']

AttributeError: ignored

In [None]:
students_d.loc[students_d['O_m'] > 30, ['Min_m', 'O_m']]

In [None]:
print(students_d)
students_d.iloc[0, 3] = 30

In [None]:
students_d['score'] = students_d['O_m'] / students_d['T_m']
print(students_d)