In [None]:
### Author: WeilainMuchen

In [None]:
### Explicit indexes

In [None]:
## Setting & removing indexes
'''
· Look at temperatures.
· Set the index of temperatures to "city", assigning to temperatures_ind.
· Look at temperatures_ind. How is it different from temperatures?
· Reset the index of temperatures_ind, keeping its contents.
· Reset the index of temperatures_ind, dropping its contents.
'''
# Display the starting frame
print(temperatures)

# Promote the "city" column to the row index (returns a new frame)
temperatures_ind = temperatures.set_index(keys="city")

# Display the re-indexed frame for comparison with the one above
print(temperatures_ind)

# Undo the indexing: drop=False turns the index back into a regular column
print(temperatures_ind.reset_index(drop=False))

# Undo the indexing and discard it: drop=True throws the index labels away
print(temperatures_ind.reset_index(drop=True))

In [None]:
## Subsetting with .loc[]
'''
· Create a list of cities to subset on: Moscow and Saint Petersburg. Assign to cities.
· Use [] subsetting to filter temperatures for rows where the city column takes a value in cities.
· Use .loc[] subsetting to filter temperatures_ind for rows where the city is in cities.
'''
# The city names we want to keep
cities = ["Moscow", "Saint Petersburg"]

# Column-based filter: build a Boolean mask on "city", then apply it with []
city_is_wanted = temperatures["city"].isin(cities)
print(temperatures[city_is_wanted])

# Index-based filter: pass the labels straight to .loc[]
# (relies on temperatures_ind being the city-indexed frame built earlier)
print(temperatures_ind.loc[cities])

In [None]:
## Setting multi-level indexes
'''
· Set the index of temperatures to the "country" and "city" columns, and assign this to temperatures_ind.
· Specify two country/city pairs to keep: "Brazil"/"Rio De Janeiro" and "Pakistan"/"Lahore", assigning to rows_to_keep.
· Print and subset temperatures_ind for rows_to_keep using .loc[].
'''
# Two-level index: country is the outer level, city the inner level
temperatures_ind = temperatures.set_index(keys=["country", "city"])

# One (country, city) tuple per row label we want to pull out
rows_to_keep = [
    ("Brazil", "Rio De Janeiro"),
    ("Pakistan", "Lahore"),
]

# A list of tuples passed to .loc[] selects those exact index entries
kept_rows = temperatures_ind.loc[rows_to_keep]
print(kept_rows)

In [None]:
## Sorting by index values
'''
· Sort temperatures_ind by the index values.
· Sort temperatures_ind by the index values at the "city" level.
· Sort temperatures_ind by ascending country then descending city.
'''
# Default sort: every index level, outermost first, ascending
sorted_by_all_levels = temperatures_ind.sort_index()
print(sorted_by_all_levels)

# Sort on the "city" level only
sorted_by_city = temperatures_ind.sort_index(level="city")
print(sorted_by_city)

# Per-level direction: country ascending, city descending
sorted_country_up_city_down = temperatures_ind.sort_index(
    level=["country", "city"],
    ascending=[True, False],
)
print(sorted_country_up_city_down)

In [None]:
### Slicing and subsetting with .loc and .iloc

In [None]:
## Slicing index values
'''
· Sort the index of temperatures_ind.
· Use slicing with .loc[] to get these subsets:
  · from Pakistan to Russia.
  · from Lahore to Moscow. (This will return nonsense.)
  · from Pakistan, Lahore to Russia, Moscow.
'''
# Label slicing is only meaningful on a sorted index, so sort first
temperatures_srt = temperatures_ind.sort_index()

# Slice on the outer level; .loc[] label slices include both end points
first_country, last_country = "Pakistan", "Russia"
print(temperatures_srt.loc[first_country:last_country])

# Deliberately wrong: bare strings are not treated as inner-level (city)
# labels here, so this does not give a Lahore-to-Moscow range
first_city, last_city = "Lahore", "Moscow"
print(temperatures_srt.loc[first_city:last_city])

# Correct inner-level slice: give full (country, city) tuples as the bounds
first_pair = ("Pakistan", "Lahore")
last_pair = ("Russia", "Moscow")
print(temperatures_srt.loc[first_pair:last_pair])

In [None]:
## Slicing in both directions
'''
· Use .loc[] slicing to subset rows from India, Hyderabad to Iraq, Baghdad.
· Use .loc[] slicing to subset columns from date to avg_temp_c.
· Slice in both directions at once from Hyderabad to Baghdad, and date to avg_temp_c.
'''
# Row bounds as (country, city) tuples; column bounds as column labels
row_start, row_stop = ("India", "Hyderabad"), ("Iraq", "Baghdad")
col_start, col_stop = "date", "avg_temp_c"

# Rows only
print(temperatures_srt.loc[row_start:row_stop])

# Columns only: a bare ":" keeps every row
print(temperatures_srt.loc[:, col_start:col_stop])

# Rows and columns in one .loc[] call
print(temperatures_srt.loc[row_start:row_stop, col_start:col_stop])

In [None]:
## Slicing time series
'''
· Use Boolean conditions (not .isin() or .loc[]) to subset for rows in 2010 and 2011, and print the results.
· Note that because the date isn't set as an index, a condition that contains only a year, such as df["date"] == "2009", will check if the date is equal to the first day of the first month of the year (e.g. 2009-01-01), rather than checking whether the date occurs within the given year. We recommend writing out the full date when using Boolean conditions (e.g. 2009-12-31).
· Set the index to the date column and sort it.
· Use .loc[] to subset for rows in 2010 and 2011.
· Use .loc[] to subset for rows from Aug 2010 to Feb 2011.
'''
# Use Boolean conditions to subset temperatures for rows in 2010 and 2011.
# Full dates are spelled out because a bare year would be read as Jan 1st.
on_or_after_start = temperatures["date"] >= "2010-01-01"
on_or_before_end = temperatures["date"] <= "2011-12-31"
temperatures_bool = temperatures[on_or_after_start & on_or_before_end]
print(temperatures_bool)

# Set date as the index and sort it before slicing.
# temperatures is not guaranteed to be ordered by date, and label slicing on
# an unsorted DatetimeIndex is unreliable: pandas 2.x raises KeyError when a
# slice bound (e.g. the end of "2011") is not an exact value in the index.
temperatures_ind = temperatures.set_index("date").sort_index()

# Use .loc[] to subset temperatures_ind for rows in 2010 and 2011
# (partial date strings expand to the whole year; both ends are included)
print(temperatures_ind.loc["2010":"2011"])

# Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc["2010-08":"2011-02"])

In [None]:
## Subsetting by row/column number
'''
Use .iloc[] on temperatures to take subsets.
· Get the 23rd row, 2nd column (index positions 22 and 1).
· Get the first 5 rows (index positions 0 to 5).
· Get all rows, columns 3 and 4 (index positions 2 to 4).
· Get the first 5 rows, columns 3 and 4.
'''
# Positional slices are half-open: the stop position itself is excluded
first_five_rows = slice(None, 5)   # same as :5  -> positions 0..4
third_fourth_cols = slice(2, 4)    # same as 2:4 -> positions 2..3

# Single cell: 23rd row, 2nd column (positions are zero-based)
print(temperatures.iloc[22, 1])

# First five rows, every column
print(temperatures.iloc[first_five_rows])

# Every row, 3rd and 4th columns
print(temperatures.iloc[:, third_fourth_cols])

# First five rows of the 3rd and 4th columns
print(temperatures.iloc[first_five_rows, third_fourth_cols])

In [None]:
### Working with pivot tables

In [None]:
## Pivot temperature by city and year
'''
· Add a year column to temperatures, from the year component of the date column.
· Make a pivot table of the avg_temp_c column, with country and city as rows, and year as columns. Assign to temp_by_country_city_vs_year, and look at the result.
'''
# Derive the calendar year from "date" and store it as a new "year" column
# (the .dt accessor assumes "date" holds datetime values)
year_of_reading = temperatures["date"].dt.year
temperatures["year"] = year_of_reading

# Rows: (country, city); columns: year; cells: avg_temp_c aggregated with
# pivot_table's default aggregation (the mean)
temp_by_country_city_vs_year = temperatures.pivot_table(
    values="avg_temp_c",
    index=["country", "city"],
    columns="year",
)

# Inspect the pivot table
print(temp_by_country_city_vs_year)

In [None]:
## Subsetting pivot tables
'''
Use .loc[] on temp_by_country_city_vs_year to take subsets.
· From Egypt to India.
· From Egypt, Cairo to India, Delhi.
· From Egypt, Cairo to India, Delhi and 2005 to 2010.
'''
# Each subset is printed explicitly: a notebook cell only auto-displays its
# last expression, so bare expressions would silently drop the first two.

# Subset for Egypt to India (outer index level, both ends included)
print(temp_by_country_city_vs_year.loc["Egypt":"India"])

# Subset for Egypt, Cairo to India, Delhi
print(temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi")])

# Subset in both directions at once.
# The column labels are the integer years produced by .dt.year, so the column
# bounds must be integers too; string bounds such as "2005" do not match an
# integer index and raise TypeError on current pandas.
print(temp_by_country_city_vs_year.loc[("Egypt", "Cairo"):("India", "Delhi"), 2005:2010])

In [None]:
## Calculating on a pivot table
'''
· Calculate the mean temperature for each year, assigning to mean_temp_by_year.
· Filter mean_temp_by_year for the year that had the highest mean temperature.
· Calculate the mean temperature for each city (across columns), assigning to mean_temp_by_city.
· Filter mean_temp_by_city for the city that had the lowest mean temperature.
'''
# Average down the rows: one value per year column
mean_temp_by_year = temp_by_country_city_vs_year.mean(axis="index")

# Keep only the year(s) whose mean equals the overall maximum
highest_yearly_mean = mean_temp_by_year.max()
print(mean_temp_by_year[mean_temp_by_year == highest_yearly_mean])

# Average across the columns: one value per (country, city) row
mean_temp_by_city = temp_by_country_city_vs_year.mean(axis=1)

# Keep only the city/cities whose mean equals the overall minimum
lowest_city_mean = mean_temp_by_city.min()
print(mean_temp_by_city[mean_temp_by_city == lowest_city_mean])