In [1]:
import numpy as np
import pandas as pd
df = pd.read_csv('./Data/weather.csv').head()
df

Unnamed: 0,MONTH,DAY,TIME,TEMP,PRESSURE
0,1,1,1,6.8,10207
1,1,1,2,5.8,10214
2,1,1,3,5.7,10220
3,1,1,4,6.0,10225
4,1,1,5,4.5,10230


In [2]:
df['TEMP']

0    6.8
1    5.8
2    5.7
3    6.0
4    4.5
Name: TEMP, dtype: float64

In [3]:
df['TEMP'][1]

5.8

In [4]:
dft = df.T
dft

Unnamed: 0,0,1,2,3,4
MONTH,1.0,1.0,1.0,1.0,1.0
DAY,1.0,1.0,1.0,1.0,1.0
TIME,1.0,2.0,3.0,4.0,5.0
TEMP,6.8,5.8,5.7,6.0,4.5
PRESSURE,10207.0,10214.0,10220.0,10225.0,10230.0


In [5]:
dft[2]['TIME']

3.0

In [6]:
# Select column 2, then take its third entry *by position*.
# The column's index holds string labels (MONTH, DAY, TIME, ...), so positional
# access goes through .iloc; using a bare integer key on a label-indexed Series
# is deprecated in recent pandas releases.
dft[2].iloc[2]

3.0

In [7]:
t = pd.DataFrame([['John'],['Bob'], ['Anne']], index=[4,3,4])
t

Unnamed: 0,0
4,John
3,Bob
4,Anne


In [8]:
t[0][4]

4    John
4    Anne
Name: 0, dtype: object

In [9]:
type(t[0][4])

pandas.core.series.Series

In [10]:
df['TIME'][[3,1,4]]

3    4
1    2
4    5
Name: TIME, dtype: int64

In [11]:
# Slices only operate on rows
df[2:4]

Unnamed: 0,MONTH,DAY,TIME,TEMP,PRESSURE
2,1,1,3,5.7,10220
3,1,1,4,6.0,10225


In [12]:
df[2:4]['TIME']

2    3
3    4
Name: TIME, dtype: int64

In [28]:
df1 = df[2:4]['TIME'][2]

df1

3

In [29]:
type(df1)

numpy.int64

In [30]:
dft

Unnamed: 0,0,1,2,3,4
MONTH,1.0,1.0,1.0,1.0,1.0
DAY,1.0,1.0,1.0,1.0,1.0
TIME,1.0,2.0,3.0,4.0,5.0
TEMP,6.8,5.8,5.7,6.0,4.5
PRESSURE,10207.0,10214.0,10220.0,10225.0,10230.0


In [31]:
dft[:2]

Unnamed: 0,0,1,2,3,4
MONTH,1.0,1.0,1.0,1.0,1.0
DAY,1.0,1.0,1.0,1.0,1.0


In [32]:
dft['TIME':'PRESSURE']

Unnamed: 0,0,1,2,3,4
TIME,1.0,2.0,3.0,4.0,5.0
TEMP,6.8,5.8,5.7,6.0,4.5
PRESSURE,10207.0,10214.0,10220.0,10225.0,10230.0


<h1>Using loc and iloc</h1>

In [33]:
# Build a small example table: one row per country (the index), with its
# capital and two numeric columns.
# No percentage is given for San Marino, so the missing value is written
# explicitly as NaN instead of relying on pandas to pad a short row.
capitals = pd.DataFrame(
    [
        ["Ngerulmud", 391, 1.87],
        ["Vatican City", 826, 100],
        ["Yaren", 1100, 10.91],
        ["Funafuti", 4492, 45.48],
        ["City of San Marino", 4493, np.nan],
    ],
    index=["Palau", "Vatican City", "Nauru", "Tuvalu", "San Marino"],
    columns=["Capital", "Population", "Percentage"],
)

In [34]:
capitals

Unnamed: 0,Capital,Population,Percentage
Palau,Ngerulmud,391,1.87
Vatican City,Vatican City,826,100.0
Nauru,Yaren,1100,10.91
Tuvalu,Funafuti,4492,45.48
San Marino,City of San Marino,4493,


In [35]:
# loc allows row based indexing, as opposed to column based indexing,
# which we have seen in the previous clips

# before, unless using slices, the column had to be specified first, e.g.:
capitals['Capital']['Nauru']

'Yaren'

In [36]:
# using loc, can specify row based indexing:
capitals.loc['Nauru']

Capital       Yaren
Population     1100
Percentage    10.91
Name: Nauru, dtype: object

In [37]:
# Now can specify the index of both the row and the column in the format:
# dataframe.loc[<row identifier>, <col identifier>]
capitals.loc['Nauru', 'Population']

1100

In [38]:
# The old way of doing it took a chained operation retrieving first the column
# of the dataset, then the row of the dataset
capitals['Population']['Nauru']

1100

In [39]:
# So the new way only takes one operation; this is why the use of loc is
# usually preferred over chained indexing

In [40]:
# **Both** arguments of loc also support lists and slices
# Note: slices are not a list.
capitals.loc['Palau':'Nauru', ['Population', 'Percentage']]

Unnamed: 0,Population,Percentage
Palau,391,1.87
Vatican City,826,100.0
Nauru,1100,10.91


In [41]:
# Can also use a list as the first argument, and a slice as the second:
capitals.loc[['Palau', 'Vatican City', 'Nauru'], 'Population':'Percentage']

Unnamed: 0,Population,Percentage
Palau,391,1.87
Vatican City,826,100.0
Nauru,1100,10.91


In [42]:
# Can leave the column specifier out completely:
capitals.loc[['Palau', 'Vatican City', 'Nauru']]

Unnamed: 0,Capital,Population,Percentage
Palau,Ngerulmud,391,1.87
Vatican City,Vatican City,826,100.0
Nauru,Yaren,1100,10.91


In [43]:
# loc only indexes by label, not by position.

# To retrieve items by position, use iloc; this uses integers as the
# row specifier. The following will return rows 4 and 1:
capitals.iloc[[4,1]]

Unnamed: 0,Capital,Population,Percentage
San Marino,City of San Marino,4493,
Vatican City,Vatican City,826,100.0


In [44]:
# Can add a second argument to specify columns, but note that iloc is
# strictly location based:
capitals.iloc[[4,1], 1:]

Unnamed: 0,Population,Percentage
San Marino,4493,
Vatican City,826,100.0


In [45]:
# iloc allows you to select a column by position:
# Here, can select all rows, and column in position 2:
capitals.iloc[:,2]

Palau             1.87
Vatican City    100.00
Nauru            10.91
Tuvalu           45.48
San Marino         NaN
Name: Percentage, dtype: float64

<h1>Boolean Filtering</h1>

In [46]:
# Before considering loc and iloc, it was shown that by passing a list to the
# index operator of our dataframe, we can select columns from our data:
capitals[['Capital', 'Population']]

Unnamed: 0,Capital,Population
Palau,Ngerulmud,391
Vatican City,Vatican City,826
Nauru,Yaren,1100
Tuvalu,Funafuti,4492
San Marino,City of San Marino,4493


In [47]:
# But look what happens if a list of booleans is passed to the index operator.
# The list needs to have as many entries as there are rows in the data.
# This then selects only the **rows** that are flagged as true:
capitals[[True, True, False, True, False]]

Unnamed: 0,Capital,Population,Percentage
Palau,Ngerulmud,391,1.87
Vatican City,Vatican City,826,100.0
Tuvalu,Funafuti,4492,45.48


In [48]:
# This is useful because we can generate large lists of booleans easily to
# select subsets of rows of data:
# Start with this; select the 'Percentage' column, which returns a series:
capitals['Percentage']

Palau             1.87
Vatican City    100.00
Nauru            10.91
Tuvalu           45.48
San Marino         NaN
Name: Percentage, dtype: float64

In [49]:
type(capitals['Percentage'])

pandas.core.series.Series

In [51]:
capitals['Percentage'] > 25

Palau           False
Vatican City     True
Nauru           False
Tuvalu           True
San Marino      False
Name: Percentage, dtype: bool

In [52]:
type(capitals['Percentage'] > 25)

pandas.core.series.Series

In [53]:
# Because this expression returns a Series, it can be used in an indexing
# operation:

capitals[capitals['Percentage'] > 25]

Unnamed: 0,Capital,Population,Percentage
Vatican City,Vatican City,826,100.0
Tuvalu,Funafuti,4492,45.48


In [54]:
# New dataframe example:
grades = pd.DataFrame([[6, 4], [7, 8], [6, 7], [6, 5], [5, 2]], 
                       index = ['Mary', 'John', 'Ann', 'Pete', 'Laura'],
                       columns = ['test_1', 'test_2'])
grades

Unnamed: 0,test_1,test_2
Mary,6,4
John,7,8
Ann,6,7
Pete,6,5
Laura,5,2


In [55]:
# Select students whose score didn't improve after the first test.
# Two columns can be compared against each other:

grades['test_2'] <= grades['test_1']

Mary      True
John     False
Ann      False
Pete      True
Laura     True
dtype: bool

In [56]:
type(grades['test_2'] <= grades['test_1'])

pandas.core.series.Series

In [57]:
# The boolean series returned can be used in an indexing operation,
# to return in this case a dataframe of the students that did not improve:

grades[grades['test_2'] <= grades['test_1']]

Unnamed: 0,test_1,test_2
Mary,6,4
Pete,6,5
Laura,5,2


In [58]:
type(grades[grades['test_2'] <= grades['test_1']])

pandas.core.frame.DataFrame

In [59]:
# Can be used with loc and iloc.
# Example, view only the tests that have an average over 5.5.
# To do this, need to create a list of booleans that have the same length as
# the number of columns in my data.

In [60]:
# First, calculate the averages, which returns a series:
grades.mean()

test_1    6.0
test_2    5.2
dtype: float64

In [61]:
type(grades.mean())

pandas.core.series.Series

In [62]:
# Now compare this to 5.5 to create a boolean series:
grades.mean() > 5.5

test_1     True
test_2    False
dtype: bool

In [63]:
type(grades.mean() > 5.5)

pandas.core.series.Series

In [64]:
# This can then be used in loc, like this:
grades.loc[:, grades.mean() > 5.5]

# This can be interpreted as: 'Show me all of the rows with columns that have
# a mean > 5.5':

Unnamed: 0,test_1
Mary,6
John,7
Ann,6
Pete,6
Laura,5


<h1>Assigning values</h1>

In [65]:
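# Indexing expressions such as loc and iloc can be used as the target of an
# assignment, which writes the new values into the original DataFrame.
# (Indexing results are not guaranteed to be views, so assign through a single
# loc/iloc call rather than through chained indexing.)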

In [66]:
# E.g. select the grades for Laura and John for test 2:
grades.loc[['Laura', 'John'], 'test_2']

Laura    2
John     8
Name: test_2, dtype: int64

In [67]:
# Now their scores can both be incremented by 1 in the following way:
grades.loc[['Laura', 'John'], 'test_2'] += 1

In [68]:
grades

Unnamed: 0,test_1,test_2
Mary,6,4
John,7,9
Ann,6,7
Pete,6,5
Laura,5,3


In [69]:
# It is also possible to upgrade an entire column at a time:
grades['test_1'] += .5
grades

Unnamed: 0,test_1,test_2
Mary,6.5,4
John,7.5,9
Ann,6.5,7
Pete,6.5,5
Laura,5.5,3


In [70]:
# Or for all columns (test scores) of a specific student:
grades.loc['Mary'] += 2
grades

Unnamed: 0,test_1,test_2
Mary,8.5,6.0
John,7.5,9.0
Ann,6.5,7.0
Pete,6.5,5.0
Laura,5.5,3.0


In [71]:
# These above operations all operate on the original data in the DataFrame.

In [72]:
# It is also possible to update with multiple values, by passing in a list of
# multiple values:

grades.loc['Pete'] = [7,8]
grades

Unnamed: 0,test_1,test_2
Mary,8.5,6.0
John,7.5,9.0
Ann,6.5,7.0
Pete,7.0,8.0
Laura,5.5,3.0


In [73]:
# Convert in place grades to 'Fail' or 'Pass':
failing = grades < 6
failing

Unnamed: 0,test_1,test_2
Mary,False,False
John,False,False
Ann,False,False
Pete,False,False
Laura,True,True


In [74]:
passing = grades >= 6
passing

Unnamed: 0,test_1,test_2
Mary,True,True
John,True,True
Ann,True,True
Pete,True,True
Laura,False,False


In [75]:
type(passing)

pandas.core.frame.DataFrame

In [76]:
type(failing)

pandas.core.frame.DataFrame

In [77]:
grades[failing]

Unnamed: 0,test_1,test_2
Mary,,
John,,
Ann,,
Pete,,
Laura,5.5,3.0


In [78]:
grades[passing]

Unnamed: 0,test_1,test_2
Mary,8.5,6.0
John,7.5,9.0
Ann,6.5,7.0
Pete,7.0,8.0
Laura,,


In [79]:
grades[failing] = "Fail"
grades

Unnamed: 0,test_1,test_2
Mary,8.5,6
John,7.5,9
Ann,6.5,7
Pete,7,8
Laura,Fail,Fail


In [80]:
grades[passing] = "Pass"
grades

Unnamed: 0,test_1,test_2
Mary,Pass,Pass
John,Pass,Pass
Ann,Pass,Pass
Pete,Pass,Pass
Laura,Fail,Fail


In [81]:
# Recreate the original data:
# New dataframe example:
grades = pd.DataFrame([[6, 4], [7, 8], [6, 7], [6, 5], [5, 2]], 
                       index = ['Mary', 'John', 'Ann', 'Pete', 'Laura'],
                       columns = ['test_1', 'test_2'])
grades

Unnamed: 0,test_1,test_2
Mary,6,4
John,7,8
Ann,6,7
Pete,6,5
Laura,5,2


In [82]:
# Scenario: take the average for each student, so we want an average by row.
# But the mean method by default, takes averages by column.
# the mean method can average over the rows by setting an additional parameter
# axis=1:
grades.mean(axis=1)

Mary     5.0
John     7.5
Ann      6.5
Pete     5.5
Laura    3.5
dtype: float64

In [83]:
# Now convert to boolean by comparing each value to 6:
grades.mean(axis=1) > 6

Mary     False
John      True
Ann       True
Pete     False
Laura    False
dtype: bool

In [84]:
# This can be added as a new column to the existing DataFrame by adding 
# the name of a new column for an index:
grades['passed'] = grades.mean(axis=1) > 6

In [85]:
grades

Unnamed: 0,test_1,test_2,passed
Mary,6,4,False
John,7,8,True
Ann,6,7,True
Pete,6,5,False
Laura,5,2,False


<h1>Sorting</h1>

In [86]:
# sort_index() - sorts by index:
capitals.sort_index()

Unnamed: 0,Capital,Population,Percentage
Nauru,Yaren,1100,10.91
Palau,Ngerulmud,391,1.87
San Marino,City of San Marino,4493,
Tuvalu,Funafuti,4492,45.48
Vatican City,Vatican City,826,100.0


In [87]:
# By default, sort_index() does not change the original data frame:
capitals

Unnamed: 0,Capital,Population,Percentage
Palau,Ngerulmud,391,1.87
Vatican City,Vatican City,826,100.0
Nauru,Yaren,1100,10.91
Tuvalu,Funafuti,4492,45.48
San Marino,City of San Marino,4493,


In [88]:
# To sort the original data, use the flag inplace=True:
capitals.sort_index(inplace=True)
capitals

Unnamed: 0,Capital,Population,Percentage
Nauru,Yaren,1100,10.91
Palau,Ngerulmud,391,1.87
San Marino,City of San Marino,4493,
Tuvalu,Funafuti,4492,45.48
Vatican City,Vatican City,826,100.0


In [89]:
# You can do a reverse sort using the flag ascending=False:
capitals.sort_index(inplace=True, ascending=False)
capitals

Unnamed: 0,Capital,Population,Percentage
Vatican City,Vatican City,826,100.0
Tuvalu,Funafuti,4492,45.48
San Marino,City of San Marino,4493,
Palau,Ngerulmud,391,1.87
Nauru,Yaren,1100,10.91


In [90]:
#Can also sort by the column index using the argument axis = 1:
capitals.sort_index(inplace=True, axis=1)
capitals

Unnamed: 0,Capital,Percentage,Population
Vatican City,Vatican City,100.0,826
Tuvalu,Funafuti,45.48,4492
San Marino,City of San Marino,,4493
Palau,Ngerulmud,1.87,391
Nauru,Yaren,10.91,1100


In [92]:
# Use sort_values() to sort by values, specifying which column to sort by:
capitals.sort_values(by='Percentage', inplace=True, ascending=True)
capitals

Unnamed: 0,Capital,Percentage,Population
Palau,Ngerulmud,1.87,391
Nauru,Yaren,10.91,1100
Tuvalu,Funafuti,45.48,4492
Vatican City,Vatican City,100.0,826
San Marino,City of San Marino,,4493


In [98]:
# Can sort by a list of columns:
grades.sort_values(by=['test_1', 'test_2'],inplace=True, ascending=False)
grades

Unnamed: 0,test_1,test_2,passed
John,7,8,True
Ann,6,7,True
Pete,6,5,False
Mary,6,4,False
Laura,5,2,False
