In [1]:
import requests
import pandas as pd 
from secrets_config import api_key # https://home.openweathermap.org/ 

In [33]:
# how to extract data from multiple cities? e.g. canberra, sydney, etc

# 1. create a list of cities (csv)
# 2. read list of cities (csv) 
# 3. request data for each city (json) and push to a list 
# 4. convert list into dataframe 

In [34]:
# read list of cities
df_cities = pd.read_csv("data/australian_capital_cities.csv")
df_cities.head()

Unnamed: 0,city_name
0,canberra
1,sydney
2,darwin
3,brisbane
4,adelaide


In [35]:
# request data for each city (json) and push to a list 
weather_data = []
for city_name in df_cities["city_name"]:
    params = {
        "q": city_name,
        "units": "metric",
        "appid": api_key
    }
    response = requests.get(f"http://api.openweathermap.org/data/2.5/weather", params=params)
    if response.status_code == 200: 
        weather_data.append(response.json())
    else: 
        raise Exception("Extracting weather api data failed. Please check if API limits have been reached.")

In [37]:
# convert list into dataframe 
df_weather_cities = pd.json_normalize(weather_data)
df_weather_cities.head()

Unnamed: 0,weather,base,visibility,dt,timezone,id,name,cod,coord.lon,coord.lat,...,wind.speed,wind.deg,clouds.all,sys.type,sys.id,sys.country,sys.sunrise,sys.sunset,wind.gust,rain.1h
0,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1657723501,36000,2172517,Canberra,200,149.1281,-35.2835,...,1.03,190,7,1,9588,AU,1657746606,1657782482,,
1,"[{'id': 501, 'main': 'Rain', 'description': 'm...",stations,10000,1657723693,36000,2147714,Sydney,200,151.2073,-33.8679,...,10.8,220,75,2,2002865,AU,1657745905,1657782185,15.95,3.16
2,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1657724072,34200,2073124,Darwin,200,130.8418,-12.4611,...,3.6,160,0,1,9574,AU,1657748326,1657789540,,
3,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1657723912,36000,2174003,Brisbane,200,153.0281,-27.4679,...,5.66,250,0,2,2005393,AU,1657744634,1657782582,,
4,"[{'id': 803, 'main': 'Clouds', 'description': ...",stations,10000,1657723210,34200,2078025,Adelaide,200,138.6,-34.9333,...,2.57,140,75,2,2001763,AU,1657749082,1657785061,,


### Transforming data

In [38]:
# set city names to lowercase 
df_weather_cities["city_name"] = df_weather_cities["name"].str.lower()

In [39]:
df_population = pd.read_csv("data/australian_city_population.csv")
df_population.head()

Unnamed: 0,city_name,population
0,canberra,431611
1,sydney,5361466
2,darwin,146982
3,brisbane,2582007
4,adelaide,1378413


In [40]:
df_merged = pd.merge(left=df_weather_cities, right=df_population, on=["city_name"])
df_merged.head()

Unnamed: 0,weather,base,visibility,dt,timezone,id,name,cod,coord.lon,coord.lat,...,clouds.all,sys.type,sys.id,sys.country,sys.sunrise,sys.sunset,wind.gust,rain.1h,city_name,population
0,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1657723501,36000,2172517,Canberra,200,149.1281,-35.2835,...,7,1,9588,AU,1657746606,1657782482,,,canberra,431611
1,"[{'id': 501, 'main': 'Rain', 'description': 'm...",stations,10000,1657723693,36000,2147714,Sydney,200,151.2073,-33.8679,...,75,2,2002865,AU,1657745905,1657782185,15.95,3.16,sydney,5361466
2,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1657724072,34200,2073124,Darwin,200,130.8418,-12.4611,...,0,1,9574,AU,1657748326,1657789540,,,darwin,146982
3,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1657723912,36000,2174003,Brisbane,200,153.0281,-27.4679,...,0,2,2005393,AU,1657744634,1657782582,,,brisbane,2582007
4,"[{'id': 803, 'main': 'Clouds', 'description': ...",stations,10000,1657723210,34200,2078025,Adelaide,200,138.6,-34.9333,...,75,2,2001763,AU,1657749082,1657785061,,,adelaide,1378413


In [41]:
# print out all columns and pick only relevant ones 
df_merged.columns

Index(['weather', 'base', 'visibility', 'dt', 'timezone', 'id', 'name', 'cod',
       'coord.lon', 'coord.lat', 'main.temp', 'main.feels_like',
       'main.temp_min', 'main.temp_max', 'main.pressure', 'main.humidity',
       'wind.speed', 'wind.deg', 'clouds.all', 'sys.type', 'sys.id',
       'sys.country', 'sys.sunrise', 'sys.sunset', 'wind.gust', 'rain.1h',
       'city_name', 'population'],
      dtype='object')

In [42]:
df_selected = df_merged[["dt", "id", "name", "main.temp", "population"]] 
df_selected.head()

Unnamed: 0,dt,id,name,main.temp,population
0,1657723501,2172517,Canberra,1.83,431611
1,1657723693,2147714,Sydney,10.22,5361466
2,1657724072,2073124,Darwin,18.4,146982
3,1657723912,2174003,Brisbane,9.49,2582007
4,1657723210,2078025,Adelaide,8.58,1378413


In [43]:
# convert unix timestamp column to datetime 
df_selected["dt"] = pd.to_datetime(df_selected["dt"], unit="s")
df_selected.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected["dt"] = pd.to_datetime(df_selected["dt"], unit="s")


Unnamed: 0,dt,id,name,main.temp,population
0,2022-07-13 14:45:01,2172517,Canberra,1.83,431611
1,2022-07-13 14:48:13,2147714,Sydney,10.22,5361466
2,2022-07-13 14:54:32,2073124,Darwin,18.4,146982
3,2022-07-13 14:51:52,2174003,Brisbane,9.49,2582007
4,2022-07-13 14:40:10,2078025,Adelaide,8.58,1378413


In [44]:
# rename colum names to more meaningful names
df_selected = df_selected.rename(columns={
    "dt": "datetime",
    "main.temp": "temperature"
})
df_selected.head()

Unnamed: 0,datetime,id,name,temperature,population
0,2022-07-13 14:45:01,2172517,Canberra,1.83,431611
1,2022-07-13 14:48:13,2147714,Sydney,10.22,5361466
2,2022-07-13 14:54:32,2073124,Darwin,18.4,146982
3,2022-07-13 14:51:52,2174003,Brisbane,9.49,2582007
4,2022-07-13 14:40:10,2078025,Adelaide,8.58,1378413


### Aggregations and group bys 

In [45]:
# get average temperature of all cities 
df_selected["temperature"].mean()

12.29125

In [46]:
# get total population of all cities 
df_selected["population"].sum()

17376986

In [27]:
# what if we concat two snapshots of data? 

# first store df_selected in df_selected2
df_selected2 = df_selected
# then re-run the steps above to get a fresh df_selected, then only run the line below onwards 

In [48]:
df_concat = pd.concat([df_selected, df_selected2])
df_concat

Unnamed: 0,datetime,id,name,temperature,population
0,2022-07-13 14:45:01,2172517,Canberra,1.83,431611
1,2022-07-13 14:48:13,2147714,Sydney,10.22,5361466
2,2022-07-13 14:54:32,2073124,Darwin,18.4,146982
3,2022-07-13 14:51:52,2174003,Brisbane,9.49,2582007
4,2022-07-13 14:40:10,2078025,Adelaide,8.58,1378413
5,2022-07-13 14:42:05,2163355,Hobart,3.31,238375
6,2022-07-13 14:50:09,4163971,Melbourne,31.51,5096298
7,2022-07-13 14:50:05,2063523,Perth,14.99,2141834
0,2022-07-10 14:30:05,2172517,Canberra,1.37,431611
1,2022-07-10 14:29:55,2147714,Sydney,10.67,5361466


In [54]:
# get the average temperature for each city 
df_concat.groupby(["name"]).agg({
    "temperature":"mean"
}).reset_index()

Unnamed: 0,name,temperature
0,Adelaide,8.975
1,Brisbane,9.56
2,Canberra,1.6
3,Darwin,18.185
4,Hobart,5.08
5,Melbourne,30.875
6,Perth,13.19
7,Sydney,10.445
