In [4]:
import requests
import pandas as pd

# Get COVID-19 historical data for Europe
url = "https://disease.sh/v3/covid-19/historical/Austria,Belgium,Bulgaria,Croatia,Cyprus,Czechia,Denmark,Estonia,Finland,France,Germany,Greece,Hungary,Ireland,Italy,Latvia,Lithuania,Luxembourg,Malta,Netherlands,Poland,Portugal,Romania,Slovakia,Slovenia,Spain,Sweden?lastdays=all"

response = requests.get(url)
data = response.json()
print(data)

[{'country': 'Austria', 'province': ['mainland'], 'timeline': {'cases': {'1/22/20': 0, '1/23/20': 0, '1/24/20': 0, '1/25/20': 0, '1/26/20': 0, '1/27/20': 0, '1/28/20': 0, '1/29/20': 0, '1/30/20': 0, '1/31/20': 0, '2/1/20': 0, '2/2/20': 0, '2/3/20': 0, '2/4/20': 0, '2/5/20': 0, '2/6/20': 0, '2/7/20': 0, '2/8/20': 0, '2/9/20': 0, '2/10/20': 0, '2/11/20': 0, '2/12/20': 0, '2/13/20': 0, '2/14/20': 0, '2/15/20': 0, '2/16/20': 0, '2/17/20': 0, '2/18/20': 0, '2/19/20': 0, '2/20/20': 0, '2/21/20': 0, '2/22/20': 0, '2/23/20': 0, '2/24/20': 0, '2/25/20': 2, '2/26/20': 1, '2/27/20': 1, '2/28/20': 1, '2/29/20': 3, '3/1/20': 7, '3/2/20': 8, '3/3/20': 12, '3/4/20': 17, '3/5/20': 23, '3/6/20': 37, '3/7/20': 46, '3/8/20': 75, '3/9/20': 98, '3/10/20': 126, '3/11/20': 148, '3/12/20': 203, '3/13/20': 312, '3/14/20': 409, '3/15/20': 591, '3/16/20': 801, '3/17/20': 999, '3/18/20': 1225, '3/19/20': 1571, '3/20/20': 1943, '3/21/20': 2398, '3/22/20': 2909, '3/23/20': 3455, '3/24/20': 4073, '3/25/20': 4760, '3

In [5]:
records = []

for entry in data:
    country = entry['country']
    timeline = entry['timeline']

    for date_str in timeline['cases']:
        records.append({
            'country': country,
            'date': pd.to_datetime(date_str, format='%m/%d/%y'),
            'cases': timeline['cases'][date_str]
        })

df = pd.DataFrame(records)

In [6]:
print(df.head)

<bound method NDFrame.head of        country       date    cases
0      Austria 2020-01-22        0
1      Austria 2020-01-23        0
2      Austria 2020-01-24        0
3      Austria 2020-01-25        0
4      Austria 2020-01-26        0
...        ...        ...      ...
30856   Sweden 2023-03-05  2698535
30857   Sweden 2023-03-06  2698535
30858   Sweden 2023-03-07  2698535
30859   Sweden 2023-03-08  2698535
30860   Sweden 2023-03-09  2699339

[30861 rows x 3 columns]>


In [7]:
print(df.columns.tolist())


['country', 'date', 'cases']


In [9]:
print(df[['country', 'date', 'cases']].head())



   country       date  cases
0  Austria 2020-01-22      0
1  Austria 2020-01-23      0
2  Austria 2020-01-24      0
3  Austria 2020-01-25      0
4  Austria 2020-01-26      0


In [10]:
print(df.isnull().sum())


country    0
date       0
cases      0
dtype: int64


In [11]:
print(df.value_counts())
df['country'].value_counts()


country  date        cases  
Sweden   2023-03-09  2699339    1
Austria  2020-01-22  0          1
         2020-01-23  0          1
         2020-01-24  0          1
         2020-01-25  0          1
                               ..
         2020-02-08  0          1
         2020-02-07  0          1
         2020-02-06  0          1
         2020-02-05  0          1
         2020-02-04  0          1
Name: count, Length: 30861, dtype: int64


country
Austria        1143
Belgium        1143
Bulgaria       1143
Croatia        1143
Cyprus         1143
Czechia        1143
Denmark        1143
Estonia        1143
Finland        1143
France         1143
Germany        1143
Greece         1143
Hungary        1143
Ireland        1143
Italy          1143
Latvia         1143
Lithuania      1143
Luxembourg     1143
Malta          1143
Netherlands    1143
Poland         1143
Portugal       1143
Romania        1143
Slovakia       1143
Slovenia       1143
Spain          1143
Sweden         1143
Name: count, dtype: int64

In [12]:
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30861 entries, 0 to 30860
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   country  30861 non-null  object        
 1   date     30861 non-null  datetime64[ns]
 2   cases    30861 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 723.4+ KB
None


In [13]:
df

Unnamed: 0,country,date,cases
0,Austria,2020-01-22,0
1,Austria,2020-01-23,0
2,Austria,2020-01-24,0
3,Austria,2020-01-25,0
4,Austria,2020-01-26,0
...,...,...,...
30856,Sweden,2023-03-05,2698535
30857,Sweden,2023-03-06,2698535
30858,Sweden,2023-03-07,2698535
30859,Sweden,2023-03-08,2698535


In [None]:
# We need to decide the time scale we are checking the change in air quality - months/year
# Which countries had air quality impacted the most by covid


In [14]:
# Check which countries have the highest change in cases over a period of time to then see how that impacted air quality


# Ensure df is sorted by country and date
df = df.sort_values(['country', 'date'])



In [19]:
# I want the find the country with the biggest change month by month

df['date'] = pd.to_datetime(df['date'])

#  year and month as separate columns
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month


# Get the last reported case count in each month for each country - doing so allows us to see the change in cases month by month by seeing the difference in cases between the one month and the next
monthly_cases = df.groupby(['country', 'year', 'month'])['cases'].last().reset_index() 
monthly_cases['monthly_change'] = monthly_cases.groupby('country')['cases'].diff()

monthly_cases = monthly_cases.sort_values(['country', 'year', 'month'])

# Calculate month-over-month change in cases - comparing month prior to current month

print(monthly_cases.head(12))

    country  year  month   cases  monthly_change
0   Austria  2020      1       0             NaN
1   Austria  2020      2       3             3.0
2   Austria  2020      3    9264          9261.0
3   Austria  2020      4   15402          6138.0
4   Austria  2020      5   16610          1208.0
5   Austria  2020      6   17629          1019.0
6   Austria  2020      7   20873          3244.0
7   Austria  2020      8   27131          6258.0
8   Austria  2020      9   44231         17100.0
9   Austria  2020     10  101467         57236.0
10  Austria  2020     11  278500        177033.0
11  Austria  2020     12  356063         77563.0


In [20]:
# Sort by the largest monthly_change
top_monthly_changes = monthly_cases.sort_values('monthly_change', ascending=False)

print(top_monthly_changes.head(10))


     country  year  month     cases  monthly_change
375   France  2022      1  19266496       9188713.0
416  Germany  2022      3  21104509       6359402.0
415  Germany  2022      2  14745107       4929574.0
570    Italy  2022      1  10983116       4857433.0
999    Spain  2022      1   9961253       3666508.0
376   France  2022      2  22877926       3611430.0
417  Germany  2022      4  24710769       3606260.0
378   France  2022      4  28835895       3032722.0
377   France  2022      3  25803173       2925247.0
381   France  2022      7  33997224       2711907.0


In [21]:
import numpy as np

#  find the sum of absolute monthly changes per country i.e the total of the monthly change columns 
country_volatility = monthly_cases.groupby('country')['monthly_change'].agg(
    sum_abs_change=lambda x: np.sum(np.abs(x))
).reset_index().sort_values(by='sum_abs_change', ascending=False)

print(country_volatility.head(10))


        country  sum_abs_change
9        France      39866713.0
10      Germany      38249055.0
14        Italy      25603508.0
25        Spain      13770429.0
19  Netherlands       8712835.0
20       Poland       6444960.0
0       Austria       5961143.0
21     Portugal       5570473.0
11       Greece       5548487.0
1       Belgium       4739365.0


In [22]:
# Based on the above, we should focus on France year 2022, as it had the biggest changes in COVID cases month by month. Particulary from January to April 

# Now I need a dataset to see the regions/departments where COVID cases changed the most within France to check the air pollution level change

