In [1]:
import requests
import pandas as pd 
from secrets_config import api_key # https://home.openweathermap.org/ 

In [9]:
# how to extract data from multiple cities? e.g. canberra, sydney, etc

# 1. create a list of cities (csv)
# 2. read list of cities (csv) 
# 3. request data for each city (json) and push to a list 
# 4. convert list into dataframe 

In [2]:
# read list of cities
df_cities = pd.read_csv("data/australian_capital_cities.csv")
df_cities.head()

Unnamed: 0,city_name
0,canberra
1,sydney
2,darwin
3,brisbane
4,adelaide


In [3]:
# request data for each city (json) and push to a list 
weather_data = []
for city_name in df_cities["city_name"]:
    params = {
        "q": city_name,
        "units": "metric",
        "appid": api_key
    }
    response = requests.get(f"http://api.openweathermap.org/data/2.5/weather", params=params)
    if response.status_code == 200: 
        weather_data.append(response.json())
    else: 
        raise Exception("Extracting weather api data failed. Please check if API limits have been reached.")

In [5]:
# convert list into dataframe 
df_weather_cities = pd.json_normalize(weather_data)
df_weather_cities.head()

Unnamed: 0,weather,base,visibility,dt,timezone,id,name,cod,coord.lon,coord.lat,...,main.humidity,wind.speed,wind.deg,clouds.all,sys.type,sys.id,sys.country,sys.sunrise,sys.sunset,rain.1h
0,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1657722339,36000,2172517,Canberra,200,149.1281,-35.2835,...,88,0.0,0,8,1,9588,AU,1657746606,1657782482,
1,"[{'id': 501, 'main': 'Rain', 'description': 'm...",stations,10000,1657722281,36000,2147714,Sydney,200,151.2073,-33.8679,...,76,10.29,210,100,2,2002865,AU,1657745905,1657782185,2.05
2,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1657722597,34200,2073124,Darwin,200,130.8418,-12.4611,...,36,3.09,160,0,1,9574,AU,1657661928,1657703125,
3,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1657722443,36000,2174003,Brisbane,200,153.0281,-27.4679,...,67,4.63,250,0,2,2005393,AU,1657744634,1657782582,
4,"[{'id': 803, 'main': 'Clouds', 'description': ...",stations,10000,1657722245,34200,2078025,Adelaide,200,138.6,-34.9333,...,82,2.57,140,75,2,2001763,AU,1657662704,1657698625,


### Transforming data

In [6]:
# set city names to lowercase 
df_weather_cities["city_name"] = df_weather_cities["name"].str.lower()

In [7]:
df_population = pd.read_csv("data/australian_city_population.csv")
df_population.head()

Unnamed: 0,city_name,population
0,canberra,431611
1,sydney,5361466
2,darwin,146982
3,brisbane,2582007
4,adelaide,1378413


In [8]:
df_merged = pd.merge(left=df_weather_cities, right=df_population, on=["city_name"])
df_merged.head()

Unnamed: 0,weather,base,visibility,dt,timezone,id,name,cod,coord.lon,coord.lat,...,wind.deg,clouds.all,sys.type,sys.id,sys.country,sys.sunrise,sys.sunset,rain.1h,city_name,population
0,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1657722339,36000,2172517,Canberra,200,149.1281,-35.2835,...,0,8,1,9588,AU,1657746606,1657782482,,canberra,431611
1,"[{'id': 501, 'main': 'Rain', 'description': 'm...",stations,10000,1657722281,36000,2147714,Sydney,200,151.2073,-33.8679,...,210,100,2,2002865,AU,1657745905,1657782185,2.05,sydney,5361466
2,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1657722597,34200,2073124,Darwin,200,130.8418,-12.4611,...,160,0,1,9574,AU,1657661928,1657703125,,darwin,146982
3,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1657722443,36000,2174003,Brisbane,200,153.0281,-27.4679,...,250,0,2,2005393,AU,1657744634,1657782582,,brisbane,2582007
4,"[{'id': 803, 'main': 'Clouds', 'description': ...",stations,10000,1657722245,34200,2078025,Adelaide,200,138.6,-34.9333,...,140,75,2,2001763,AU,1657662704,1657698625,,adelaide,1378413


In [9]:
# print out all columns and pick only relevant ones 
df_merged.columns

Index(['weather', 'base', 'visibility', 'dt', 'timezone', 'id', 'name', 'cod',
       'coord.lon', 'coord.lat', 'main.temp', 'main.feels_like',
       'main.temp_min', 'main.temp_max', 'main.pressure', 'main.humidity',
       'wind.speed', 'wind.deg', 'clouds.all', 'sys.type', 'sys.id',
       'sys.country', 'sys.sunrise', 'sys.sunset', 'rain.1h', 'city_name',
       'population'],
      dtype='object')

In [10]:
df_selected = df_merged[["dt", "id", "name", "main.temp", "population"]] 
df_selected.head()

Unnamed: 0,dt,id,name,main.temp,population
0,1657722339,2172517,Canberra,1.08,431611
1,1657722281,2147714,Sydney,10.35,5361466
2,1657722597,2073124,Darwin,19.12,146982
3,1657722443,2174003,Brisbane,10.01,2582007
4,1657722245,2078025,Adelaide,9.09,1378413


In [11]:
df_selected.dtypes

dt              int64
id              int64
name           object
main.temp     float64
population      int64
dtype: object

In [12]:
df_selected["unique_id"] = df_selected["dt"].astype(str) + df_selected["id"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected["unique_id"] = df_selected["dt"].astype(str) + df_selected["id"].astype(str)


In [13]:
# convert unix timestamp column to datetime 
df_selected["dt"] = pd.to_datetime(df_selected["dt"], unit="s")
df_selected.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected["dt"] = pd.to_datetime(df_selected["dt"], unit="s")


Unnamed: 0,dt,id,name,main.temp,population,unique_id
0,2022-07-13 14:25:39,2172517,Canberra,1.08,431611,16577223392172517
1,2022-07-13 14:24:41,2147714,Sydney,10.35,5361466,16577222812147714
2,2022-07-13 14:29:57,2073124,Darwin,19.12,146982,16577225972073124
3,2022-07-13 14:27:23,2174003,Brisbane,10.01,2582007,16577224432174003
4,2022-07-13 14:24:05,2078025,Adelaide,9.09,1378413,16577222452078025


In [14]:
# rename colum names to more meaningful names
df_selected = df_selected.rename(columns={
    "dt": "datetime",
    "main.temp": "temperature"
})
df_selected.head()

Unnamed: 0,datetime,id,name,temperature,population,unique_id
0,2022-07-13 14:25:39,2172517,Canberra,1.08,431611,16577223392172517
1,2022-07-13 14:24:41,2147714,Sydney,10.35,5361466,16577222812147714
2,2022-07-13 14:29:57,2073124,Darwin,19.12,146982,16577225972073124
3,2022-07-13 14:27:23,2174003,Brisbane,10.01,2582007,16577224432174003
4,2022-07-13 14:24:05,2078025,Adelaide,9.09,1378413,16577222452078025


In [15]:
df_selected = df_selected.set_index(["unique_id"])

### Load data to file (parquet)

Overwrite

Upsert

Append new files

### Load data to SQL 

Insert

Upsert