In [2]:
import os
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
import requests
import sqlalchemy
from pytz import timezone
# from Keys import MySQL_pass
# from Keys import OW_API_key

## Read the "cities" table from the database into the notebook

1. The OpenWeather API expects the name of the city in the request URL. We loop over the "City_name" column of the "cities" table from our database and insert each name (one at a time) into the URL.

2. Since the weather dataframe (which will contain the data we get from the OpenWeather API) is related to the "cities" table in the database, we need to look up the corresponding city id in the cities table and store it in the weather dataframe, where it becomes a foreign key. This is done inside the for loop described above.

In [5]:
# Connection settings for the local MySQL instance.
schema = "gans_local" # The name of your database
host = "127.0.0.1"
user = "root"
# The password is read from the MYSQL_PASSWORD environment variable so the
# secret is never saved in the notebook (export it before starting Jupyter).
# If the variable is unset an empty password is used and MySQL rejects the
# login when the first query runs.
# NOTE(review): the password that used to be hard-coded here should be
# treated as leaked and changed.
password = os.environ.get("MYSQL_PASSWORD", "")
port = 3306

# quote_plus escapes characters such as "@", ":" or "/" that would otherwise
# be parsed as part of the URL structure.
connection_string = f'mysql+pymysql://{user}:{quote_plus(password)}@{host}:{port}/{schema}'

In [7]:
# Load the complete "cities" table into a DataFrame; its "City_name" and
# "City_id" columns are used further down when collecting the forecasts.
cities_df = pd.read_sql(sql="cities", con=connection_string)
cities_df

Unnamed: 0,City_id,City_name,Country
0,1,Berlin,Germany
1,2,Hamburg,Germany
2,3,Munich,Germany


## Create a weather dataframe with info for all cities in the database

In [10]:
# Timezone used to stamp the moment the forecast was retrieved.
berlin_timezone = timezone('Europe/Berlin')
# The OpenWeather key is read from the OPENWEATHER_API_KEY environment
# variable so the secret is never saved in the notebook. If the variable is
# unset the key is empty and the API rejects the request.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and regenerated.
API_key = os.environ.get("OPENWEATHER_API_KEY", "")
# Collects one dict per forecast entry across all cities.
weather_items = []

In [12]:
# Walk through the "City_name" column of "cities_df", printing one name per line

for city_name in cities_df["City_name"]:
    print(city_name)

Berlin
Hamburg
Munich


In [14]:
# Fetch the "City_id" stored for one city name in the cities table.
# This id is what links a weather row back to its city.
is_hamburg = cities_df["City_name"] == "Hamburg"
cities_df.loc[is_hamburg, "City_id"].values[0]

2

In [None]:
# Slightly different way to extract the "City_id"
#cities_df.loc[cities_df["City_name"] == "Hamburg", "City_id"].to_list()[0]

2

In [16]:
# Start from an empty list so that re-running this cell cannot append the
# same forecasts a second time.
weather_items = []

# Iterate ids and names together: the id becomes the foreign key to the
# "cities" table, the name is only needed to query the API. Pairing them
# row by row also keeps the ids correct if a city name appears twice.
for city_id, city in zip(cities_df["City_id"], cities_df["City_name"]):
    # Passing the query through `params` lets requests URL-encode city names
    # that contain spaces or non-ASCII characters.
    response = requests.get(
        "https://api.openweathermap.org/data/2.5/forecast",
        params={"q": city, "appid": API_key, "units": "metric"},
        timeout=10,  # do not let a stalled connection hang the notebook
    )
    # Fail with the HTTP error (bad key, unknown city, ...) instead of a
    # KeyError on the missing "list" entry below.
    response.raise_for_status()
    forecast = response.json()

    # One retrieval timestamp per city, in Berlin local time.
    retrieval_time = datetime.now(berlin_timezone).strftime("%Y-%m-%d %H:%M:%S")

    for item in forecast["list"]:
        weather_item = {
            "city_id": city_id,
            "forecast_time": item.get("dt_txt", None),
            "outlook": item["weather"][0].get("description", None),
            "temperature": item["main"].get("temp", None),
            "feels_like": item["main"].get("feels_like", None),
            # The "rain" entry is absent when no rain is forecast, so default to 0.
            "rain_in_last_3h": item.get("rain", {}).get("3h", 0),
            "wind_speed": item["wind"].get("speed", None),
            "rain_prob": item.get("pop", None),
            "data_retrieved_at": retrieval_time
        }

        weather_items.append(weather_item)

In [18]:
# Turn the collected forecast dicts into a table: one row per dict,
# one column per key
weather_df = pd.DataFrame(data=weather_items)

In [20]:
# Display the assembled forecast table to check the result visually
weather_df

Unnamed: 0,city_id,forecast_time,outlook,temperature,feels_like,rain_in_last_3h,wind_speed,rain_prob,data_retrieved_at
0,1,2024-06-10 09:00:00,clear sky,15.98,15.07,0.00,3.61,0.0,2024-06-10 09:36:21
1,1,2024-06-10 12:00:00,few clouds,17.08,16.12,0.00,3.54,0.0,2024-06-10 09:36:21
2,1,2024-06-10 15:00:00,broken clouds,16.97,16.00,0.00,1.63,0.0,2024-06-10 09:36:21
3,1,2024-06-10 18:00:00,overcast clouds,15.75,14.97,0.00,2.02,0.0,2024-06-10 09:36:21
4,1,2024-06-10 21:00:00,light rain,13.80,13.32,1.04,0.35,1.0,2024-06-10 09:36:21
...,...,...,...,...,...,...,...,...,...
115,3,2024-06-14 18:00:00,light rain,17.65,17.53,0.23,0.72,0.2,2024-06-10 09:36:21
116,3,2024-06-14 21:00:00,overcast clouds,15.62,15.40,0.00,2.33,0.0,2024-06-10 09:36:21
117,3,2024-06-15 00:00:00,overcast clouds,14.17,13.68,0.00,1.97,0.0,2024-06-10 09:36:21
118,3,2024-06-15 03:00:00,overcast clouds,12.99,12.46,0.00,2.13,0.0,2024-06-10 09:36:21


In [22]:
# Inspect column dtypes and non-null counts of the freshly built table
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   city_id            120 non-null    int64  
 1   forecast_time      120 non-null    object 
 2   outlook            120 non-null    object 
 3   temperature        120 non-null    float64
 4   feels_like         120 non-null    float64
 5   rain_in_last_3h    120 non-null    float64
 6   wind_speed         120 non-null    float64
 7   rain_prob          120 non-null    float64
 8   data_retrieved_at  120 non-null    object 
dtypes: float64(5), int64(1), object(3)
memory usage: 8.6+ KB


In [24]:
# Parse both timestamp columns from text into proper datetime values
for timestamp_column in ("forecast_time", "data_retrieved_at"):
    weather_df[timestamp_column] = pd.to_datetime(weather_df[timestamp_column])

In [26]:
# Re-check the dtypes: the two timestamp columns should now be datetime64
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   city_id            120 non-null    int64         
 1   forecast_time      120 non-null    datetime64[ns]
 2   outlook            120 non-null    object        
 3   temperature        120 non-null    float64       
 4   feels_like         120 non-null    float64       
 5   rain_in_last_3h    120 non-null    float64       
 6   wind_speed         120 non-null    float64       
 7   rain_prob          120 non-null    float64       
 8   data_retrieved_at  120 non-null    datetime64[ns]
dtypes: datetime64[ns](2), float64(5), int64(1), object(1)
memory usage: 8.6+ KB


## Create the "weather" table in the database

```sql
-- One row per city and forecast time slot, as produced by the notebook.
CREATE TABLE weather (
    weather_id        INT AUTO_INCREMENT,   -- surrogate key, generated by MySQL
    city_id           INT NOT NULL,         -- links the row to its city
    forecast_time     DATETIME,
    outlook           VARCHAR(255),
    temperature       FLOAT,
    feels_like        FLOAT,
    rain_in_last_3h   FLOAT,
    wind_speed        FLOAT,
    rain_prob         FLOAT,
    data_retrieved_at DATETIME,             -- when the forecast was downloaded
    PRIMARY KEY (weather_id),
    FOREIGN KEY (city_id) REFERENCES cities(city_id)
);
```

## Push the "weather_df" to the empty "weather" table in the database

In [28]:

# Append the forecast rows to the existing "weather" table; the DataFrame
# index is not written as a column.
weather_df.to_sql(
    'weather',
    con=connection_string,
    if_exists='append',
    index=False,
)

120

# Making our function work with our SQL cities data
This section delves into a more advanced aspect of the project, which may be beyond the point many of you got to. However, it serves as an illustrative example of how to structure your work effectively. It demonstrates how we can compartmentalise various aspects of our data processing pipeline into distinct functions, enhancing code readability and facilitating debugging. It's a good idea to start writing your pipeline in a similar manner ready for the cloud.
> This part will only work locally, not on colab. You must also create your weather table in SQL first.

In [30]:
import pandas as pd
import requests
from pytz import timezone
from datetime import datetime
# from Keys import MySQL_pass
# from Keys import OW_API_key

def retreiving_and_sending_data():
  """Run the whole pipeline once: read cities, fetch forecasts, store them.

  Returns:
    str: The fixed confirmation message "Data has been updated".
  """
  db_url = connection()
  # Cities come from the database; their forecasts come from the weather API.
  forecasts = get_weather_data(get_cities_data(db_url))
  send_weather_data(forecasts, db_url)
  return "Data has been updated"

def connection():
  """Build the SQLAlchemy URL for the local ``gans_local`` MySQL database.

  The password is read from the ``MYSQL_PASSWORD`` environment variable so
  the secret is never stored in the source. If the variable is unset an
  empty password is used and MySQL rejects the login on first use.

  Returns:
    str: A ``mysql+pymysql://`` connection URL.
  """
  # Imported locally so the function also works when only this cell is run.
  import os
  from urllib.parse import quote_plus

  schema = "gans_local" # The name of your database
  host = "127.0.0.1"
  user = "root"
  # NOTE(review): the password that used to be hard-coded here should be
  # treated as leaked and changed.
  password = os.environ.get("MYSQL_PASSWORD", "")
  port = 3306
  # quote_plus escapes characters such as "@", ":" or "/" that would
  # otherwise be parsed as part of the URL structure.
  return f'mysql+pymysql://{user}:{quote_plus(password)}@{host}:{port}/{schema}'

def get_cities_data(connection_string):
  """Read the complete ``cities`` table from the database.

  Args:
    connection_string: Database URL passed to pandas as ``con``.

  Returns:
    Whatever ``pd.read_sql`` returns for the ``cities`` table.
  """
  cities_table = pd.read_sql("cities", con=connection_string)
  return cities_table

def get_weather_data(cities_df):
  """Download the OpenWeather forecast for every city in ``cities_df``.

  The API key is read from the ``OPENWEATHER_API_KEY`` environment variable
  so the secret is never stored in the source.

  Args:
    cities_df: DataFrame with a ``City_id`` and a ``City_name`` column.

  Returns:
    pandas.DataFrame: One row per city and forecast entry, with the columns
    city_id, forecast_time, outlook, temperature, feels_like,
    rain_in_last_3h, wind_speed, rain_prob and data_retrieved_at. Both time
    columns are converted to datetimes. The frame is empty, but still has
    all columns, when ``cities_df`` has no rows.

  Raises:
    requests.HTTPError: If the API answers with an error status
      (for example an invalid key or an unknown city).
  """
  # Imported locally so the function also works when only this cell is run.
  import os

  berlin_timezone = timezone('Europe/Berlin')
  # NOTE(review): the key that used to be hard-coded here should be treated
  # as leaked and regenerated.
  API_key = os.environ.get("OPENWEATHER_API_KEY", "")
  columns = [
    "city_id",
    "forecast_time",
    "outlook",
    "temperature",
    "feels_like",
    "rain_in_last_3h",
    "wind_speed",
    "rain_prob",
    "data_retrieved_at",
  ]
  weather_items = []

  # Iterate ids and names together: the id becomes the foreign key to the
  # cities table, the name is only needed to query the API. Pairing them row
  # by row also keeps the ids correct if a city name appears twice.
  for city_id, city in zip(cities_df["City_id"], cities_df["City_name"]):
    # Passing the query through `params` lets requests URL-encode city names
    # that contain spaces or non-ASCII characters.
    response = requests.get(
      "https://api.openweathermap.org/data/2.5/forecast",
      params={"q": city, "appid": API_key, "units": "metric"},
      timeout=10,  # do not let a stalled connection hang the pipeline
    )
    # Fail with the HTTP error instead of a KeyError on the missing "list".
    response.raise_for_status()
    forecast = response.json()

    # One retrieval timestamp per city, in Berlin local time.
    retrieval_time = datetime.now(berlin_timezone).strftime("%Y-%m-%d %H:%M:%S")

    for item in forecast["list"]:
      weather_items.append({
        "city_id": city_id,
        "forecast_time": item.get("dt_txt", None),
        "outlook": item["weather"][0].get("description", None),
        "temperature": item["main"].get("temp", None),
        "feels_like": item["main"].get("feels_like", None),
        # The "rain" entry is absent when no rain is forecast, so default to 0.
        "rain_in_last_3h": item.get("rain", {}).get("3h", 0),
        "wind_speed": item["wind"].get("speed", None),
        "rain_prob": item.get("pop", None),
        "data_retrieved_at": retrieval_time
      })

  # Naming the columns keeps the frame well-formed even when no rows were
  # collected, so the datetime conversion below cannot hit a missing column.
  weather_df = pd.DataFrame(weather_items, columns=columns)
  weather_df["forecast_time"] = pd.to_datetime(weather_df["forecast_time"])
  weather_df["data_retrieved_at"] = pd.to_datetime(weather_df["data_retrieved_at"])

  return weather_df

def send_weather_data(weather_df, connection_string):
  """Append the forecast rows to the ``weather`` table.

  Args:
    weather_df: Frame whose ``to_sql`` method is used to write the rows.
    connection_string: Database URL passed on as ``con``.
  """
  write_options = {
    "con": connection_string,
    "if_exists": "append",  # add to the rows stored by earlier runs
    "index": False,         # do not write the DataFrame index as a column
  }
  weather_df.to_sql('weather', **write_options)

In [32]:
# Run the pipeline once. Every call appends a new batch of forecast rows
# to the "weather" table.
retreiving_and_sending_data()

'Data has been updated'

In [36]:
# Read the "weather" table back to confirm the rows arrived. This uses the
# notebook-level connection_string, not the one built inside connection().
pd.read_sql(sql="weather", con=connection_string)

Unnamed: 0,weather_id,city_id,forecast_time,outlook,temperature,feels_like,rain_in_last_3h,wind_speed,rain_prob,data_retrieved_at
0,1,1,2024-06-10 09:00:00,clear sky,15.98,15.07,0.00,3.61,0.0,2024-06-10 09:36:21
1,2,1,2024-06-10 12:00:00,few clouds,17.08,16.12,0.00,3.54,0.0,2024-06-10 09:36:21
2,3,1,2024-06-10 15:00:00,broken clouds,16.97,16.00,0.00,1.63,0.0,2024-06-10 09:36:21
3,4,1,2024-06-10 18:00:00,overcast clouds,15.75,14.97,0.00,2.02,0.0,2024-06-10 09:36:21
4,5,1,2024-06-10 21:00:00,light rain,13.80,13.32,1.04,0.35,1.0,2024-06-10 09:36:21
...,...,...,...,...,...,...,...,...,...,...
235,236,3,2024-06-14 18:00:00,light rain,17.65,17.53,0.23,0.72,0.2,2024-06-10 09:37:44
236,237,3,2024-06-14 21:00:00,overcast clouds,15.62,15.40,0.00,2.33,0.0,2024-06-10 09:37:44
237,238,3,2024-06-15 00:00:00,overcast clouds,14.17,13.68,0.00,1.97,0.0,2024-06-10 09:37:44
238,239,3,2024-06-15 03:00:00,overcast clouds,12.99,12.46,0.00,2.13,0.0,2024-06-10 09:37:44


In [4]:
pip install functions-framework

Collecting functions-framework
  Downloading functions_framework-3.7.0-py3-none-any.whl.metadata (16 kB)
Collecting cloudevents<2.0.0,>=1.2.0 (from functions-framework)
  Downloading cloudevents-1.10.1-py3-none-any.whl.metadata (6.9 kB)
Collecting deprecation<3.0,>=2.0 (from cloudevents<2.0.0,>=1.2.0->functions-framework)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Downloading functions_framework-3.7.0-py3-none-any.whl (35 kB)
Downloading cloudevents-1.10.1-py3-none-any.whl (55 kB)
   ---------------------------------------- 0.0/55.0 kB ? eta -:--:--
   ---------------------------------------- 55.0/55.0 kB 3.0 MB/s eta 0:00:00
Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: deprecation, cloudevents, functions-framework
Successfully installed cloudevents-1.10.1 deprecation-2.1.0 functions-framework-3.7.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
functions-framework --target=weather_update --debug

SyntaxError: cannot assign to expression here. Maybe you meant '==' instead of '='? (2802470956.py, line 1)