In [43]:
import pandas as pd
import numpy as np

# Real average temperatures (approximate data) for each city, keyed by season.
# Every tuple is ordered as (winter, spring, summer, autumn).
seasonal_temperatures = {
    city: dict(zip(("winter", "spring", "summer", "autumn"), season_means))
    for city, season_means in [
        ("New York", (0, 10, 25, 15)),
        ("London", (5, 11, 18, 12)),
        ("Paris", (4, 12, 20, 13)),
        ("Tokyo", (6, 15, 27, 18)),
        ("Moscow", (-10, 5, 18, 8)),
        ("Sydney", (12, 18, 25, 20)),
        ("Berlin", (0, 10, 20, 11)),
        ("Beijing", (-2, 13, 27, 16)),
        ("Rio de Janeiro", (20, 25, 30, 25)),
        ("Dubai", (20, 30, 40, 30)),
        ("Los Angeles", (15, 18, 25, 20)),
        ("Singapore", (27, 28, 28, 27)),
        ("Mumbai", (25, 30, 35, 30)),
        ("Cairo", (15, 25, 35, 25)),
        ("Mexico City", (12, 18, 20, 15)),
    ]
}

# Mapping from calendar month number to season name
month_to_season = {
    month: season
    for season, months in [
        ("winter", (12, 1, 2)),
        ("spring", (3, 4, 5)),
        ("summer", (6, 7, 8)),
        ("autumn", (9, 10, 11)),
    ]
    for month in months
}

# Synthetic temperature data generation
def generate_realistic_temperature_data(cities, num_years=10, seed=None):
    """Generate synthetic daily temperatures for the given cities.

    For every city and every day, the temperature is drawn from a normal
    distribution centred on the city's seasonal mean (looked up in
    ``seasonal_temperatures`` via ``month_to_season``) with a standard
    deviation of 5.

    Parameters
    ----------
    cities : iterable of str
        City names; each must be a key of ``seasonal_temperatures``.
    num_years : int, default 10
        The series covers ``365 * num_years`` consecutive days starting at
        2010-01-01. Leap days are not compensated for, so the last date falls
        slightly short of a whole number of calendar years.
    seed : int or None, default None
        Seed for the random generator. Pass an integer to get reproducible
        data; ``None`` draws fresh entropy on every call.

    Returns
    -------
    pandas.DataFrame
        Columns ``city``, ``timestamp``, ``temperature``, ``season``; rows are
        ordered city by city, chronologically within a city. An empty frame
        with these columns is returned when there are no cities or no days.

    Raises
    ------
    KeyError
        If a city is not present in ``seasonal_temperatures``.
    """
    columns = ["city", "timestamp", "temperature", "season"]
    dates = pd.date_range(start="2010-01-01", periods=365 * num_years, freq="D")
    cities = list(cities)

    # Nothing to generate: return a well-formed empty frame instead of failing
    # later on a missing "timestamp" column.
    if not cities or len(dates) == 0:
        return pd.DataFrame(columns=columns)

    rng = np.random.default_rng(seed)
    # Season label for every date, computed once and shared by all cities
    seasons = dates.month.map(month_to_season)

    frames = []
    for city in cities:
        # Seasonal mean for each day of this city, used as the distribution centre
        daily_means = seasons.map(seasonal_temperatures[city]).to_numpy(dtype=float)
        frames.append(
            pd.DataFrame(
                {
                    "city": city,
                    "timestamp": dates,
                    # Add a random deviation around the seasonal mean
                    "temperature": rng.normal(loc=daily_means, scale=5),
                    "season": seasons.to_numpy(),
                }
            )
        )

    return pd.concat(frames, ignore_index=True)

# Generate the dataset for every configured city and store it as a CSV file
data = generate_realistic_temperature_data(list(seasonal_temperatures))
data.to_csv('temperature_data.csv', index=False)

In [44]:
import os
import json
import requests
from pprint import pp
import time
import asyncio
import aiohttp
import seaborn as sns
import matplotlib.pyplot as plt

from joblib import Parallel, delayed
from ipywidgets import interact, interact_manual

In [45]:
import nest_asyncio
# NOTE(review): presumably applied so asyncio code can be driven from inside the
# notebook's already-running event loop — confirm it is still required.
nest_asyncio.apply()

## Анализ исторических данных

In [46]:
# Load the generated dataset. Timestamps are parsed into datetime64 so the
# column is not left as plain strings, then rows are ordered chronologically
# within each city (the rolling statistics below rely on this order).
df = pd.read_csv("temperature_data.csv", parse_dates=["timestamp"])
df = df.sort_values(by=["city", "timestamp"]).reset_index(drop=True)

In [47]:
# Preview the first rows of the loaded, sorted frame
df.head()

Unnamed: 0,city,timestamp,temperature,season
0,Beijing,2010-01-01,6.086816,winter
1,Beijing,2010-01-02,-3.945024,winter
2,Beijing,2010-01-03,-2.111314,winter
3,Beijing,2010-01-04,1.876513,winter
4,Beijing,2010-01-05,-13.639838,winter


### Скользящее среднее

In [48]:
# 1st: One thread

%time
df["rolling_mean"] = None
for city in df["city"].unique():
    df.loc[df.city == city, "rolling_mean"] = df.loc[df.city == city, "temperature"].rolling(window=30).mean()

df.head()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.91 μs


Unnamed: 0,city,timestamp,temperature,season,rolling_mean
0,Beijing,2010-01-01,6.086816,winter,
1,Beijing,2010-01-02,-3.945024,winter,
2,Beijing,2010-01-03,-2.111314,winter,
3,Beijing,2010-01-04,1.876513,winter,
4,Beijing,2010-01-05,-13.639838,winter,


In [49]:
def add_rolling_mean(x, window=30):
    """Return a copy of ``x`` with a ``rolling_mean`` column added.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing a ``temperature`` column; rows are used in their
        existing order.
    window : int, default 30
        Number of consecutive rows in each rolling window.

    Returns
    -------
    pandas.DataFrame
        New frame (``x`` itself is not modified) whose ``rolling_mean`` column
        holds the rolling mean of ``temperature``. The first ``window - 1``
        rows are NaN because their window is incomplete.
    """
    return x.assign(rolling_mean=x["temperature"].rolling(window=window).mean())

In [50]:
# 2nd: multiprocessing

%time
out = Parallel(n_jobs=4)(delayed(add_rolling_mean)(df.loc[df.city == city]) for city in df["city"].unique())
df = pd.concat(out)
df.head()

CPU times: user 1e+03 ns, sys: 1 μs, total: 2 μs
Wall time: 2.15 μs


Unnamed: 0,city,timestamp,temperature,season,rolling_mean
0,Beijing,2010-01-01,6.086816,winter,
1,Beijing,2010-01-02,-3.945024,winter,
2,Beijing,2010-01-03,-2.111314,winter,
3,Beijing,2010-01-04,1.876513,winter,
4,Beijing,2010-01-05,-13.639838,winter,


**Результат:** Второй вариант оказался медленнее. Скорее всего, это происходит из-за накладных расходов на создание отдельных процессов и передачу им данных: они оказываются дороже, чем сама (очень простая) основная задача.

### Среднее и отклонение для каждого города и сезона

In [51]:
# Mean and standard deviation of the temperature for every (city, season) pair,
# stored in flat columns named temperature_mean / temperature_std.
df_stats = (
    df.groupby(["city", "season"], as_index=False)
    .agg(
        temperature_mean=("temperature", "mean"),
        temperature_std=("temperature", "std"),
    )
)
df_stats.head()

Unnamed: 0,city,season,temperature_mean,temperature_std
0,Beijing,autumn,16.150702,5.083753
1,Beijing,spring,13.278835,5.165594
2,Beijing,summer,26.751644,5.125755
3,Beijing,winter,-2.208042,5.176436
4,Berlin,autumn,11.00042,4.972042


### Поиск аномалий

In [52]:
# Attach the per-(city, season) statistics to every observation.
# Statistics columns left over from a previous run of this cell are dropped
# first; otherwise a re-run would create suffixed duplicates
# (temperature_mean_x / temperature_mean_y) and break the cells below.
stat_columns = [col for col in df_stats.columns if col not in ("city", "season")]
df = (
    df.drop(columns=stat_columns, errors="ignore")
    # many_to_one: each (city, season) key must appear only once in df_stats
    .merge(df_stats, on=["city", "season"], how="left", validate="many_to_one")
)

In [53]:
# Flag observations lying more than two standard deviations away from the mean
# of their (city, season) group: 1 marks an anomaly, 0 a normal value.
lower_bound = df.temperature_mean - 2 * df.temperature_std
upper_bound = df.temperature_mean + 2 * df.temperature_std

condition = (df["temperature"] < lower_bound) | (df["temperature"] > upper_bound)

df["anomaly"] = condition.astype("int64")

In [91]:
# Show only the observations flagged as anomalies
df.loc[df["anomaly"] == 1]

Unnamed: 0,city,timestamp,temperature,season,rolling_mean,temperature_mean,temperature_std,anomaly
4,Beijing,2010-01-05,-13.639838,winter,,-2.208042,5.176436,1
5,Beijing,2010-01-06,-14.848822,winter,,-2.208042,5.176436,1
6,Beijing,2010-01-07,-13.696568,winter,,-2.208042,5.176436,1
69,Beijing,2010-03-11,23.786642,spring,4.067599,13.278835,5.165594,1
92,Beijing,2010-04-03,0.952930,spring,13.173576,13.278835,5.165594,1
...,...,...,...,...,...,...,...,...
54610,Tokyo,2019-08-12,39.330856,summer,27.204592,27.090789,5.102641,1
54653,Tokyo,2019-09-24,5.852194,autumn,20.201651,18.004716,4.870011,1
54668,Tokyo,2019-10-09,8.207802,autumn,17.430110,18.004716,4.870011,1
54692,Tokyo,2019-11-02,4.132312,autumn,17.535162,18.004716,4.870011,1


Видим, что в нашем датасете 2512 аномалий

## OpenWeatherMap API

In [75]:
from dotenv import load_dotenv
# NOTE(review): presumably loads variables (such as the API key read below) from a
# local .env file into the environment — confirm that file is not committed.
load_dotenv()

True

In [82]:
# The API key comes from the environment; it is None when the variable is not set.
API_KEY = os.environ.get("OpenWeatherMapAPIKey")
# Request URL with `city` and `API_KEY` placeholders that are filled in via str.format.
URL_TEMPLATE = "https://api.openweathermap.org/data/2.5/weather?q={city}&units=metric&appid={API_KEY}"

### Текущая температура воздуха

In [83]:
@interact
def show_current_weather(city=df["city"].unique()):
    """Fetch the current weather for ``city`` and pretty-print the JSON payload.

    The default value is the array of cities present in ``df`` so that
    ``@interact`` renders a dropdown; when called directly, pass a single city
    name.

    Parameters
    ----------
    city : str
        City name substituted into ``URL_TEMPLATE``.
    """
    resp = requests.get(
        URL_TEMPLATE.format(city=city, API_KEY=API_KEY),
        # Without a timeout the call could block the notebook indefinitely
        timeout=10,
    )
    pp(resp.json())

interactive(children=(Dropdown(description='city', options=('Beijing', 'Berlin', 'Cairo', 'Dubai', 'London', '…

In [84]:
# Direct call for a single city, bypassing the dropdown widget
show_current_weather("Moscow")

{'coord': {'lon': 37.6156, 'lat': 55.7522},
 'weather': [{'id': 801,
              'main': 'Clouds',
              'description': 'few clouds',
              'icon': '02n'}],
 'base': 'stations',
 'main': {'temp': -2.76,
          'feels_like': -6.84,
          'temp_min': -2.76,
          'temp_max': -2.71,
          'pressure': 1015,
          'humidity': 95,
          'sea_level': 1015,
          'grnd_level': 994},
 'visibility': 10000,
 'wind': {'speed': 3.02, 'deg': 222, 'gust': 9.11},
 'clouds': {'all': 11},
 'dt': 1734803519,
 'sys': {'type': 1,
         'id': 9027,
         'country': 'RU',
         'sunrise': 1734760662,
         'sunset': 1734785862},
 'timezone': 10800,
 'id': 524901,
 'name': 'Moscow',
 'cod': 200}


### Detect anomalies

In [85]:
@interact
def show_current_weather_with_anomalies(city=df["city"].unique()):
    """Print whether the current temperature in ``city`` is anomalous.

    The current temperature is compared with the historical interval
    ``mean ± 2 * std`` taken from ``df_stats`` for the city and for the season
    of the observation date.

    The default value is the array of cities present in ``df`` so that
    ``@interact`` renders a dropdown; when called directly, pass a single city
    name.

    Parameters
    ----------
    city : str
        City name; must be present in ``df_stats``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    ValueError
        If ``df_stats`` has no row for the city and the current season.
    """
    resp = requests.get(
        URL_TEMPLATE.format(city=city, API_KEY=API_KEY),
        # Without a timeout the call could block the notebook indefinitely
        timeout=10,
    )
    # Fail with a clear HTTP error instead of a KeyError on the missing "main" key
    resp.raise_for_status()
    payload = resp.json()
    curr_temp = payload["main"]["temp"]

    # Season of the observation in the city's local time: "dt" is a Unix
    # timestamp and "timezone" the offset in seconds. The month is mapped with
    # the same month_to_season table the historical data was labelled with,
    # instead of always comparing against "winter".
    observed_at = pd.to_datetime(payload["dt"] + payload.get("timezone", 0), unit="s")
    current_season = month_to_season[observed_at.month]

    season_stats = df_stats.loc[
        (df_stats.city == city) & (df_stats.season == current_season),
        ["temperature_mean", "temperature_std"],
    ]
    if season_stats.empty:
        raise ValueError(
            f"No historical statistics for city={city!r}, season={current_season!r}"
        )
    normal_mean, normal_std = season_stats.values[0]

    is_anomaly_detected = "not detected"
    if (curr_temp > normal_mean + 2 * normal_std) or (curr_temp < normal_mean - 2 * normal_std):
        is_anomaly_detected = "detected"

    print(f"City: {city}")
    print(f"Normal temperture bounds: ({normal_mean - 2 * normal_std} 'C, {normal_mean + 2 * normal_std} 'C)")
    print(f"Current temperature: {curr_temp} 'C, Anomaly {is_anomaly_detected}")

interactive(children=(Dropdown(description='city', options=('Beijing', 'Berlin', 'Cairo', 'Dubai', 'London', '…

In [90]:
# Run the anomaly check once for every city present in the dataset
for city in df["city"].unique():
    show_current_weather_with_anomalies(city)

City: Beijing
Normal temperture bounds: (-12.560913883061323 'C, 8.144830438081451 'C)
Current temperature: -3.06 'C, Anomaly not detected
City: Berlin
Normal temperture bounds: (-10.109502329353356 'C, 9.801769250553912 'C)
Current temperature: 6.08 'C, Anomaly not detected
City: Cairo
Normal temperture bounds: (4.883100981851358 'C, 25.31540048343649 'C)
Current temperature: 19.42 'C, Anomaly not detected
City: Dubai
Normal temperture bounds: (10.369670910123304 'C, 29.544855431264168 'C)
Current temperature: 20.96 'C, Anomaly not detected
City: London
Normal temperture bounds: (-5.064007036223989 'C, 14.926387036769876 'C)
Current temperature: 9.84 'C, Anomaly not detected
City: Los Angeles
Normal temperture bounds: (4.8455100400020665 'C, 25.10317590533984 'C)
Current temperature: 17.27 'C, Anomaly not detected
City: Mexico City
Normal temperture bounds: (2.128553904571735 'C, 22.35786562537335 'C)
Current temperature: 16.64 'C, Anomaly not detected
City: Moscow
Normal temperture b

После проверки городов мы не видим аномалий.

### Проверяем синхронные и асинхронные запросы

Достанем погоду для всех городов

In [87]:
def get_weather_synch(city):
    """Fetch the current weather for ``city`` with a blocking HTTP request.

    Parameters
    ----------
    city : str
        City name substituted into ``URL_TEMPLATE``.

    Returns
    -------
    dict
        Decoded JSON payload of the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status; otherwise a failed call would
        silently return an error payload.
    """
    resp = requests.get(
        URL_TEMPLATE.format(city=city, API_KEY=API_KEY),
        # Without a timeout the call could block indefinitely
        timeout=10,
    )
    resp.raise_for_status()
    return resp.json()

# Time sequential requests for all cities. perf_counter is a monotonic,
# high-resolution clock intended for measuring intervals, unlike time.time().
start = time.perf_counter()
for city in df.city.unique():
    get_weather_synch(city)
end = time.perf_counter()
print(f"Sync time: {end - start:.2f} sec.")

Sync time: 3.33 sec.


In [88]:
async def get_weather_asynch(city, session=None):
    """Fetch the current weather for ``city`` without blocking the event loop.

    Parameters
    ----------
    city : str
        City name substituted into ``URL_TEMPLATE``.
    session : aiohttp.ClientSession or None, default None
        Session to send the request with, so several calls can share one
        session. When ``None``, a temporary session with a 10-second total
        timeout is created and closed for this call.

    Returns
    -------
    dict
        Decoded JSON payload of the response.

    Raises
    ------
    aiohttp.ClientResponseError
        If the API answers with an error status.
    """
    url = URL_TEMPLATE.format(
        city=city,
        API_KEY=API_KEY
    )

    async def _fetch(active_session):
        # Send the request and decode the body as JSON
        async with active_session.get(url) as response:
            response.raise_for_status()
            content = await response.text()
            return json.loads(content)

    if session is not None:
        return await _fetch(session)

    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=10)) as own_session:
        return await _fetch(own_session)
            
start = time.time()
await asyncio.gather(*[get_weather_asynch(city) for city in df.city.unique()])
end = time.time()
print(f"Async time: {end - start:.2f} sec.")

Async time: 0.62 sec.


**Async** намного быстрее **Sync**, что в целом ожидаемо