In [17]:
import json
import os
import time
from datetime import datetime

import pandas as pd
import requests

In [20]:
# Prefer the OPENWEATHER_API_KEY environment variable when it is set.
# NOTE(review): the literal fallback is a credential committed to the notebook;
# rotate it and drop the fallback once the environment variable is configured.
api_key = os.environ.get("OPENWEATHER_API_KEY", "70c930c88aa878d0bbca83d431eb101f")


def get_raw_weather_data(api_key, cities, output_dir="./raw_files", timeout=10):
    """Fetch the current weather of each city and save the raw responses as JSON.

    One GET request per city is sent to the OpenWeatherMap ``/data/2.5/weather``
    endpoint with ``units="metric"``. The decoded responses are collected in a
    dict keyed by the city string as passed in, and written to a file named
    after the current local time (``YYYY-MM-DD HH:MM.json``) in ``output_dir``.

    Args:
        api_key: Value sent as the ``appid`` query parameter.
        cities: Iterable of city names, each sent as the ``q`` query parameter.
        output_dir: Folder receiving the snapshot; created when missing.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        None. The path of the written file is printed.
    """
    url = "https://api.openweathermap.org/data/2.5/weather"

    weather_data = {}
    for city in cities:
        params = {"q": city, "appid": api_key, "units": "metric"}
        # An explicit timeout keeps a stalled connection from blocking forever.
        response = requests.get(url, params=params, timeout=timeout)
        if not response.ok:
            # A non-OK response is not a weather observation; the cells that
            # read these files index ['main']['temp'] on every stored entry.
            print("Skipping", city, "- HTTP status", response.status_code)
            continue
        weather_data[city] = response.json()

    # Create the target folder on first use instead of failing inside open().
    os.makedirs(output_dir, exist_ok=True)
    # NOTE(review): minute resolution means two calls within the same minute
    # write to the same file name.
    filename = datetime.now().strftime("%Y-%m-%d %H:%M") + ".json"
    filepath = os.path.join(output_dir, filename)
    with open(filepath, "w") as f:
        json.dump(weather_data, f)
    print("Weather data saved to", filepath)

In [22]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Build an estimator with default settings and display the name of its class.
model = HistGradientBoostingRegressor()
type(model).__name__

'HistGradientBoostingRegressor'

In [23]:
# Scores keyed by model name; the entry with the largest value wins.
scores = {'GBM': .05, 'Linear Regression': -0.2, 'Decision Tree': -1}

# Compare (name, score) pairs on the score and keep only the name.
winner = max(scores.items(), key=lambda entry: entry[1])[0]

In [24]:
# Display which key of `scores` was selected as the winner.
winner

'GBM'

In [21]:
# Take a single snapshot for the three cities below.
cities = ['paris', 'brussels', 'amsterdam']
get_raw_weather_data(api_key=api_key, cities=cities)


Weather data saved to ./raw_files/2023-07-19 18:46:19.json


In [8]:
# Load every raw snapshot in turn; after the loop `data_temp` holds the content
# of the last file visited, which the next cell inspects.
# `parent_folder` and `files` are defined here so the cell also runs on a fresh
# kernel (they were otherwise only assigned in a later cell).
parent_folder = './raw_files'
files = sorted(os.listdir(parent_folder), reverse=True)

dfs = []
for f in files:
    with open(os.path.join(parent_folder, f), 'r') as file:
        data_temp = json.load(file)

In [9]:
# Inspect the top-level keys of the most recently loaded snapshot.
data_temp.keys()

dict_keys(['Paris', 'Brussels', 'Amsterdam'])

In [10]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# Demo of TimeSeriesSplit on 20 random samples with 3 splits and test_size left
# as None: print the train and test indices of every fold.
X = np.random.randn(20, 2)
y = np.random.randint(0, 2, 20)
tscv = TimeSeriesSplit(n_splits=3, test_size=None)
for i, (train_index, test_index) in enumerate(tscv.split(X)):
    print(
        f"Fold {i}:",
        f"  Train: index={train_index}",
        f"  Test:  index={test_index}",
        sep="\n",
    )

Fold 0:
  Train: index=[0 1 2 3 4]
  Test:  index=[5 6 7 8 9]
Fold 1:
  Train: index=[0 1 2 3 4 5 6 7 8 9]
  Test:  index=[10 11 12 13 14]
Fold 2:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
  Test:  index=[15 16 17 18 19]


In [11]:
parent_folder = './raw_files'
# Reverse-sorted listing of the raw folder.
files = os.listdir(parent_folder)
files.sort(reverse=True)

# One record per city entry of every snapshot; the 'date' value is the part of
# the file name before its first dot.
dfs = []
for file_name in files:
    snapshot_path = os.path.join(parent_folder, file_name)
    with open(snapshot_path, 'r') as file:
        data = json.load(file)
    dfs.extend(
        {
            'temperature': city_data['main']['temp'],
            'city': city_data['name'],
            'pressure': city_data['main']['pressure'],
            'date': file_name.split('.')[0]
        }
        for city_data in data.values()
    )

df = pd.DataFrame(dfs)
print(df.head(10))
# Nothing is written to disk here; the CSV export lives in transform_data_into_csv.

   temperature       city  pressure              date
0        24.35      Paris      1018  2023-07-19 14:52
1        21.59   Brussels      1015  2023-07-19 14:52
2        20.85  Amsterdam      1013  2023-07-19 14:52
3        24.40      Paris      1018  2023-07-19 14:51
4        21.46   Brussels      1015  2023-07-19 14:51
5        20.85  Amsterdam      1013  2023-07-19 14:51
6        24.40      Paris      1018  2023-07-19 14:50
7        21.46   Brussels      1015  2023-07-19 14:50
8        20.87  Amsterdam      1013  2023-07-19 14:50
9        24.35      Paris      1018  2023-07-19 14:49


In [12]:
parent_folder = './raw_files'
files = sorted(os.listdir(parent_folder), reverse=True)

# Same flattening as the previous cell, written with per-entry lookups.
# NOTE(review): the pressure column is called 'pression' here while the sibling
# cells use 'pressure' - confirm which name is intended.
dfs = []
for f in files:
    raw_path = os.path.join(parent_folder, f)
    with open(raw_path, 'r') as file:
        data_temp = json.load(file)
    for data_city in data_temp.values():
        row = {
            'temperature': data_city['main']['temp'],
            'city': data_city['name'],
            'pression': data_city['main']['pressure'],
            'date': f.split('.')[0]
        }
        dfs.append(row)

df = pd.DataFrame(dfs)
# The leading '\n' argument prints a blank line before the table.
print('\n', df.head(10))


    temperature       city  pression              date
0        24.35      Paris      1018  2023-07-19 14:52
1        21.59   Brussels      1015  2023-07-19 14:52
2        20.85  Amsterdam      1013  2023-07-19 14:52
3        24.40      Paris      1018  2023-07-19 14:51
4        21.46   Brussels      1015  2023-07-19 14:51
5        20.85  Amsterdam      1013  2023-07-19 14:51
6        24.40      Paris      1018  2023-07-19 14:50
7        21.46   Brussels      1015  2023-07-19 14:50
8        20.87  Amsterdam      1013  2023-07-19 14:50
9        24.35      Paris      1018  2023-07-19 14:49


In [14]:
def transform_data_into_csv(n_files=None, filename='data.csv',
                            parent_folder='./raw_files',
                            output_folder='./clean_data'):
    """Flatten raw weather snapshots into a single CSV file.

    Every ``*.json`` file in ``parent_folder`` is loaded as a mapping of city
    entries. Each entry that provides ``name``, ``main.temp`` and
    ``main.pressure`` becomes one row; the ``date`` column is the snapshot's
    file name without its extension. The first ten rows are printed and the
    full table is written to ``output_folder/filename`` without the index.

    Args:
        n_files: Keep only this many snapshots from the top of the
            reverse-sorted file list. A falsy value (``None`` or 0) keeps all.
        filename: Name of the CSV file to write.
        parent_folder: Folder holding the raw JSON snapshots.
        output_folder: Folder receiving the CSV; created when missing.

    Returns:
        None.
    """
    # Only snapshot files are considered, so stray entries in the raw folder
    # (other file types, hidden files) no longer abort the export.
    files = sorted(
        (name for name in os.listdir(parent_folder) if name.endswith('.json')),
        reverse=True,
    )
    if n_files:
        files = files[:n_files]

    columns = ['temperature', 'city', 'pressure', 'date']
    dfs = []
    for file_name in files:
        with open(os.path.join(parent_folder, file_name), 'r') as file:
            data = json.load(file)
        date = os.path.splitext(file_name)[0]
        for city_data in data.values():
            try:
                dfs.append({
                    'temperature': city_data['main']['temp'],
                    'city': city_data['name'],
                    'pressure': city_data['main']['pressure'],
                    'date': date
                })
            except (KeyError, TypeError):
                # Entries without the expected fields (for example a stored
                # error response) are reported and left out.
                print('Skipping malformed entry in', file_name)

    # Passing the columns explicitly keeps the CSV header even with no rows.
    df = pd.DataFrame(dfs, columns=columns)
    print(df.head(10))

    os.makedirs(output_folder, exist_ok=True)
    df.to_csv(os.path.join(output_folder, filename), index=False)
    
# Export every available snapshot (n_files=None applies no limit) to data.csv.
transform_data_into_csv(n_files=None, filename='data.csv')

   temperature       city  pressure              date
0        24.35      Paris      1018  2023-07-19 14:52
1        21.59   Brussels      1015  2023-07-19 14:52
2        20.85  Amsterdam      1013  2023-07-19 14:52
3        24.40      Paris      1018  2023-07-19 14:51
4        21.46   Brussels      1015  2023-07-19 14:51
5        20.85  Amsterdam      1013  2023-07-19 14:51
6        24.40      Paris      1018  2023-07-19 14:50
7        21.46   Brussels      1015  2023-07-19 14:50
8        20.87  Amsterdam      1013  2023-07-19 14:50
9        24.35      Paris      1018  2023-07-19 14:49


In [None]:
# Collect 180 snapshots, pausing 60 seconds after each one.
# The call passes the shared `api_key` and the `cities` list defined in earlier
# cells; the function requires both arguments.
for i in range(180):
    get_raw_weather_data(api_key, cities)
    time.sleep(60)