# Big Data Task 2
<b>Authors</b>
<ul>
    <li>Piotr Janiszek</li>
    <li>Mateusz Dominiak</li>
</ul>

<h1>1. Charts & Analysis</h1>

Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from adjustText import adjust_text



Data Loading

In [None]:
# Load the combined dataset and list the available columns.
df = pd.read_csv('data/combined.csv')
print(df.columns)

# Case, death and vaccination counters used to detect countries without data.
cols = [
    'new_confirmed',
    'new_deceased',
    'new_vaccine_doses_administered',
    'cumulative_confirmed',
    'cumulative_deceased',
    'cumulative_vaccine_doses_administered'
]

# Per-country total of every counter column.
group_sum = df.groupby('iso_3166_1_alpha_3')[cols].sum()

print(f"Rows before removing countries without data: {len(df)}")

# Countries whose total is zero in every counter column.
countries_to_remove = group_sum[(group_sum == 0).all(axis=1)].index

# Drop all rows that belong to those countries.
df = df[~df['iso_3166_1_alpha_3'].isin(countries_to_remove)]

print(f"Rows after removing countries without data: {len(df)}")


1.1. the number of new cases

1.2. the number of new deaths

1.3. the number of new vaccinations

1.4. population

1.5 GDP

1.6.1 To do

1.6.2 To do

1.6.3 To do

1.6.4 To do

<h1>2. Outliers</h1>

In [None]:
# 1. Remove outliers in 'new_confirmed' with the Z-score method.
# nan_policy='omit' computes the mean/std from the non-missing values only.
# With the default ('propagate') a single NaN turns every Z-score into NaN,
# and the mask below would then reject every row.
z_scores = stats.zscore(df['new_confirmed'], nan_policy='omit')

# Cut-off expressed in standard deviations; tune it to the data if needed.
threshold = 3

# Keep rows with |Z-score| < threshold. A missing value yields a NaN score,
# which compares as False, so such rows are dropped as well.
mask_new_confirmed = np.abs(z_scores) < threshold

# Rows that are not outliers with respect to 'new_confirmed'.
df_no_outliers_confirmed = df[mask_new_confirmed].copy()


In [None]:
# 2. Remove outliers in 'new_deceased' with the IQR method.
# Row count entering this step, captured before `df` is reassigned below.
rows_before = len(df_no_outliers_confirmed)

Q1 = df_no_outliers_confirmed['new_deceased'].quantile(0.25)
Q3 = df_no_outliers_confirmed['new_deceased'].quantile(0.75)
IQR = Q3 - Q1

# 1.5 is the multiplier used here for the IQR fences; it can be adjusted.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"Lower bound: {lower_bound}"
      f"\nUpper bound: {upper_bound}")

# Keep only rows whose 'new_deceased' lies inside [lower_bound, upper_bound].
mask_new_deceased = (
        (df_no_outliers_confirmed['new_deceased'] >= lower_bound) &
        (df_no_outliers_confirmed['new_deceased'] <= upper_bound)
)

df = df_no_outliers_confirmed[mask_new_deceased].copy()

# Previously the "before" message was printed after the filter had already
# been applied; report both counts so the effect of the step is visible.
print(f"Number of rows before outliers removal: {rows_before}")
print(f"Number of rows after outliers removal: {len(df)}")


<h1>3. Statistical calculations</h1>

3.1. the average number of new illnesses, deaths and vaccinations for at least 10 selected countries in a selected month

In [None]:
# Parse 'date' so the month filter below compares timestamps.
df['date'] = pd.to_datetime(df['date'])

# Show every row when the result tables are displayed.
pd.set_option('display.max_rows', None)

# Rows from the selected month: April 2021, both end dates included.
in_april = df['date'].between('2021-04-01', '2021-04-30')
df_april = df[in_april]

# Mean of the daily new cases, deaths and vaccine doses.
aggregations = dict.fromkeys(
    ['new_confirmed', 'new_deceased', 'new_vaccine_doses_administered'],
    'mean',
)

# Per-country means for April 2021.
group_april = df_april.groupby('iso_3166_1_alpha_3').agg(aggregations)
display(group_april)

# Per-country means over the entire time interval.
group_total = df.groupby('iso_3166_1_alpha_3').agg(aggregations)
display(group_total)


In [None]:
df['date'] = pd.to_datetime(df['date'])
print(df['Continent'].unique())

# --- Chart 1: bar chart - total number of new cases per continent ---

# Sum the daily new cases within each continent.
cases_by_continent = df.groupby('Continent')['new_confirmed'].sum()
continent_totals = cases_by_continent.reset_index()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=continent_totals,
    x='Continent',
    y='new_confirmed',
    palette='viridis',
    ax=ax,
)
ax.set_title('Łączna liczba nowych przypadków według kontynentów')
ax.set_xlabel('Kontynent')
ax.set_ylabel('Suma nowych przypadków')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:

# --- Chart 2: box plot - distribution of new cases for each continent ---

fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='Continent', y='new_confirmed', palette='Set2', ax=ax)
# Logarithmic y axis.
plt.yscale('log')
ax.set_title('Rozkład nowych przypadków według kontynentów (skala logarytmiczna)')
ax.set_xlabel('Kontynent')
ax.set_ylabel('Nowe przypadki (logarytmiczna skala)')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:

# --- Chart 3: line chart - mean number of new cases over time, per continent ---

# Mean of 'new_confirmed' for every (date, continent) pair.
mean_by_day = df.groupby(['date', 'Continent'])['new_confirmed'].mean()
time_trend = mean_by_day.reset_index()

fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(
    data=time_trend,
    x='date',
    y='new_confirmed',
    hue='Continent',
    palette='tab10',
    ax=ax,
)
ax.set_title('Średnia liczba nowych przypadków na przestrzeni czasu wg kontynentu')
ax.set_xlabel('Data')
ax.set_ylabel('Średnia liczba nowych przypadków')
plt.legend(title='Kontynent')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Highest 'new_confirmed' value per country, plus the first 'Continent'
# value found for that country.
df_max = (
    df.groupby('iso_3166_1_alpha_3')
    .agg({'new_confirmed': 'max', 'Continent': 'first'})
    .reset_index()
)

# Chart setup.
plt.figure(figsize=(30, 16))
scatter_ax = sns.scatterplot(
    data=df_max,
    x='iso_3166_1_alpha_3',
    y='new_confirmed',
    hue='Continent',
    palette='Set1',
    s=100,
)

plt.title('Maksymalne wartości nowych zachorowań dla każdego państwa')
plt.xlabel('Państwo (kod ISO)')
plt.ylabel('Maksymalna liczba nowych zachorowań')
plt.xticks(rotation=45)

# One text label (the ISO code) at every point.
texts = [
    plt.text(iso_code, peak_cases, iso_code, fontsize=9)
    for iso_code, peak_cases in zip(df_max['iso_3166_1_alpha_3'], df_max['new_confirmed'])
]

# adjust_text repositions the labels (presumably to reduce overlap) and
# draws an arrow from each label to its point.
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='#EBB7EF'))

# Hide the x tick labels; the per-point labels carry the country codes.
scatter_ax.set_xticklabels([])

plt.tight_layout()
plt.show()


3.2. the average change in the number of illnesses, deaths and vaccinations for at least 10 selected countries in the selected month

In [None]:
df['date'] = pd.to_datetime(df['date'])

# Sort by country and date so diff() compares consecutive rows of a country.
df_sorted = df.sort_values(['iso_3166_1_alpha_3', 'date'])

# Change column -> source column it is derived from.
change_sources = {
    'change_confirmed': 'new_confirmed',
    'change_deceased': 'new_deceased',
    'change_vaccine_doses': 'new_vaccine_doses_administered',
}

# Row-to-row change within each country. Rows removed earlier (outliers)
# are not present, so a "change" may span more than one calendar day.
for change_col, source_col in change_sources.items():
    df_sorted[change_col] = df_sorted.groupby('iso_3166_1_alpha_3')[source_col].diff()

# Drop only rows where no change could be computed at all (e.g. the first
# row of each country). Requiring all three changes to be present would also
# discard a row's case/death changes whenever only its vaccine value is
# missing; 'mean' already skips NaN per column, so partial rows are safe.
df_changes = df_sorted.dropna(subset=list(change_sources), how='all')

# Rows from April 2021, both end dates included.
df_april_changes = df_changes[(df_changes['date'] >= '2021-04-01') & (df_changes['date'] <= '2021-04-30')]

# Mean of every change column.
aggregations_changes = dict.fromkeys(change_sources, 'mean')

# Show every row when the result tables are displayed.
pd.set_option('display.max_rows', None)

# Per-country average changes for April 2021.
group_april_changes = df_april_changes.groupby('iso_3166_1_alpha_3').agg(aggregations_changes)
display(group_april_changes)

# Per-country average changes over the entire time interval.
group_total_changes = df_changes.groupby('iso_3166_1_alpha_3').agg(aggregations_changes)
display(group_total_changes)


3.3. median number of new illnesses, deaths and vaccinations for at least 10 selected countries in a selected month

In [None]:
# Parse 'date' so the month filter below compares timestamps.
df['date'] = pd.to_datetime(df['date'])

# Show every row when the result tables are displayed.
pd.set_option('display.max_rows', None)

# Rows from the selected month: April 2021, both end dates included.
in_april = df['date'].between('2021-04-01', '2021-04-30')
df_april = df[in_april]

# Median of the daily new cases, deaths and vaccine doses.
aggregations = dict.fromkeys(
    ['new_confirmed', 'new_deceased', 'new_vaccine_doses_administered'],
    'median',
)

# Per-country medians for April 2021.
group_april = df_april.groupby('iso_3166_1_alpha_3').agg(aggregations)
display(group_april)

# Per-country medians over the entire time interval.
group_total = df.groupby('iso_3166_1_alpha_3').agg(aggregations)
display(group_total)

3.4. the standard deviation of the number of new cases, deaths and vaccinations for at least 10 selected countries in the selected month

In [None]:
# Parse 'date' so the month filter below compares timestamps.
df['date'] = pd.to_datetime(df['date'])

# Show every row when the result tables are displayed.
pd.set_option('display.max_rows', None)

# Rows from the selected month: April 2021, both end dates included.
in_april = df['date'].between('2021-04-01', '2021-04-30')
df_april = df[in_april]

# Standard deviation of the daily new cases, deaths and vaccine doses.
aggregations = dict.fromkeys(
    ['new_confirmed', 'new_deceased', 'new_vaccine_doses_administered'],
    'std',
)

# Per-country standard deviations for April 2021.
group_april = df_april.groupby('iso_3166_1_alpha_3').agg(aggregations)
display(group_april)

# Per-country standard deviations over the entire time interval.
group_total = df.groupby('iso_3166_1_alpha_3').agg(aggregations)
display(group_total)

3.5. minimum, average and maximum number of new illnesses, vaccinations and deaths, and GDP of at least 10 selected countries

In [None]:
# Parse 'date' so the month filter below compares timestamps.
df['date'] = pd.to_datetime(df['date'])

# Show every row when the result tables are displayed.
pd.set_option('display.max_rows', None)

# Rows from the selected month: April 2021, both end dates included.
in_april = df['date'].between('2021-04-01', '2021-04-30')
df_april = df[in_april]

# Minimum, mean and maximum of new cases, deaths, vaccine doses and GDP.
summary_funcs = ['min', 'mean', 'max']
aggregations = {
    column: list(summary_funcs)
    for column in ('new_confirmed', 'new_deceased', 'new_vaccine_doses_administered', 'GDP')
}

# Per-country min / mean / max for April 2021.
group_april = df_april.groupby('iso_3166_1_alpha_3').agg(aggregations)
display(group_april)

# Per-country min / mean / max over the entire time interval.
group_total = df.groupby('iso_3166_1_alpha_3').agg(aggregations)
display(group_total)

3.6. minimum, average, and maximum number of new illnesses, vaccinations, and deaths, and the population of at least 10 selected countries

In [None]:
# Parse 'date' so the month filter below compares timestamps.
df['date'] = pd.to_datetime(df['date'])

# Show every row when the result tables are displayed.
pd.set_option('display.max_rows', None)

# Rows from the selected month: April 2021, both end dates included.
in_april = df['date'].between('2021-04-01', '2021-04-30')
df_april = df[in_april]

# Minimum, mean and maximum of new cases, deaths, vaccine doses and population.
summary_funcs = ['min', 'mean', 'max']
aggregations = {
    column: list(summary_funcs)
    for column in ('new_confirmed', 'new_deceased', 'new_vaccine_doses_administered', 'Population')
}

# Per-country min / mean / max for April 2021.
group_april = df_april.groupby('iso_3166_1_alpha_3').agg(aggregations)
display(group_april)

# Per-country min / mean / max over the entire time interval.
group_total = df.groupby('iso_3166_1_alpha_3').agg(aggregations)
display(group_total)

<h1>4. Data normalization</h1>

4.1. the number of illnesses per the number of people you specify (e.g., per 1,000 people)

In [None]:
# Confirmed cases per 1,000 inhabitants (daily and cumulative).
for source_col in ("new_confirmed", "cumulative_confirmed"):
    df[f"{source_col}_normalized"] = df[source_col] / df["Population"] * 1000


4.2. the number of vaccinations per the number of people you specify (e.g., per 1,000 people)

In [None]:
# Vaccine doses administered per 1,000 inhabitants (daily and cumulative).
for source_col in ("new_vaccine_doses_administered", "cumulative_vaccine_doses_administered"):
    df[f"{source_col}_normalized"] = df[source_col] / df["Population"] * 1000

4.3. the number of deaths per the number of people you specify (e.g., per 1,000 people)

In [None]:
# Deaths per 1,000 inhabitants (daily and cumulative).
for source_col in ("new_deceased", "cumulative_deceased"):
    df[f"{source_col}_normalized"] = df[source_col] / df["Population"] * 1000


4.4.1 GDP

In [None]:
# GDP divided by population and scaled by 1000, i.e. GDP per 1,000
# inhabitants (not per capita).
df["GDP_normalized"] = df["GDP"].div(df["Population"]).mul(1000)


4.4.2 the number of tests per 1000 people

In [None]:
# Tests per 1,000 inhabitants (daily and cumulative).
for source_col in ("new_tested", "cumulative_tested"):
    df[f"{source_col}_normalized"] = df[source_col] / df["Population"] * 1000

4.4.3 to do

4.4.4 to do

In [None]:


# Sanity check after normalization: first rows, then the full column list.
for preview in (df.head(), df.columns):
    print(preview)


4.5 Charts

In [None]:
# Normalized columns; the chart cells below read this list.
normalized_cols = [
    'new_confirmed_normalized',
    'cumulative_confirmed_normalized',

    'new_deceased_normalized',
    'cumulative_deceased_normalized',

    'new_vaccine_doses_administered_normalized',
    'cumulative_vaccine_doses_administered_normalized',

    'new_tested_normalized',
    'cumulative_tested_normalized',

    'GDP_normalized',
]

# Plot style settings.
sns.set(style='whitegrid', context='talk')

# Derive the grid from the number of columns instead of hard-coding 5 x 2,
# so extending normalized_cols cannot push the subplot index past the grid.
hist_ncols = 2
hist_nrows = int(np.ceil(len(normalized_cols) / hist_ncols))

# Histogram with a KDE curve for every column; 8 inches of height per row
# (40 for the current five rows).
plt.figure(figsize=(15, 8 * hist_nrows))
for i, col in enumerate(normalized_cols, 1):
    plt.subplot(hist_nrows, hist_ncols, i)
    sns.histplot(df[col], kde=True, bins=30, color='skyblue')
    plt.title(f'Rozkład: {col}')
    plt.xlabel('Wartość')
    plt.ylabel('Liczba obserwacji')

plt.tight_layout()
plt.show()


In [None]:
# Box plot of every normalized column, one subplot per column.

grid_cols = 2
grid_rows = int(np.ceil(len(normalized_cols) / grid_cols))

fig, axs = plt.subplots(
    nrows=grid_rows,
    ncols=grid_cols,
    figsize=(grid_cols * 12, grid_rows * 14),
)
axs = np.asarray(axs).ravel()

for position, ax in enumerate(axs):
    # Axes beyond the last column stay empty and are hidden.
    if position >= len(normalized_cols):
        ax.set_visible(False)
        continue
    col = normalized_cols[position]
    sns.boxplot(y=df[col], ax=ax)
    ax.set(title=f"Rozkład: {col}", xlabel="", ylabel="Wartość znormalizowana")
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()



In [None]:
# Violin plot of every normalized column, one subplot per column.
grid_cols = 2
grid_rows = int(np.ceil(len(normalized_cols) / grid_cols))

fig, axs = plt.subplots(
    nrows=grid_rows,
    ncols=grid_cols,
    figsize=(grid_cols * 12, grid_rows * 14),
)
axs = np.asarray(axs).ravel()

for position, ax in enumerate(axs):
    # Hide empty subplots when the grid has more cells than columns.
    if position >= len(normalized_cols):
        ax.set_visible(False)
        continue
    col = normalized_cols[position]
    sns.violinplot(y=df[col], ax=ax)
    ax.set(title=f"Rozkład: {col}", xlabel="", ylabel="Wartość znormalizowana")
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Filled kernel density estimate of every normalized column.
grid_cols = 2
grid_rows = int(np.ceil(len(normalized_cols) / grid_cols))

fig, axs = plt.subplots(
    nrows=grid_rows,
    ncols=grid_cols,
    figsize=(grid_cols * 12, grid_rows * 14),
)
axs = np.asarray(axs).ravel()

for position, ax in enumerate(axs):
    # Hide empty subplots when the grid has more cells than columns.
    if position >= len(normalized_cols):
        ax.set_visible(False)
        continue
    col = normalized_cols[position]
    sns.kdeplot(data=df[col], fill=True, ax=ax)
    ax.set(
        title=f"Rozkład gęstości: {col}",
        xlabel="Wartość znormalizowana",
        ylabel="Gęstość",
    )
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Scatter plots of normalized cases (x) against normalized deaths (y):
# cumulative values first, then daily values.
scatter_pairs = [
    ('cumulative_confirmed_normalized', 'cumulative_deceased_normalized'),
    ('new_confirmed_normalized', 'new_deceased_normalized'),
]

for x_col, y_col in scatter_pairs:
    plt.figure(figsize=(10, 8))
    sns.scatterplot(data=df, x=x_col, y=y_col)
    plt.title(f"Wykres punktowy: {x_col} vs {y_col}")
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()


In [None]:
# Boxen plot of every normalized column, one subplot per column.
grid_cols = 2
grid_rows = int(np.ceil(len(normalized_cols) / grid_cols))

fig, axs = plt.subplots(
    nrows=grid_rows,
    ncols=grid_cols,
    figsize=(grid_cols * 12, grid_rows * 14),
)
axs = np.asarray(axs).ravel()

for position, ax in enumerate(axs):
    # Hide the unused axes left over when the number of columns is odd.
    if position >= len(normalized_cols):
        ax.set_visible(False)
        continue
    col = normalized_cols[position]
    sns.boxenplot(y=df[col], ax=ax)
    ax.set(
        title=f"Rozkład (boxenplot): {col}",
        xlabel="",
        ylabel="Wartość znormalizowana",
    )
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


<h1>5. Data correlation</h1>

In [None]:
"""
df['date'] = pd.to_datetime(df['date'])
df_sorted = df.sort_values('date')

last_measurements = df_sorted.groupby('iso_3166_1_alpha_3').last()
"""

columns = ['new_confirmed', 'new_deceased', 'new_vaccine_doses_administered']

# Calculation of the correlation matrix
correlation_matrix = df[columns].corr()

new_names = {
    'new_confirmed': 'New Confirmations',
    'new_deceased': 'New Deaths',
    'new_vaccine_doses_administered': 'New Vaccine Doses',
}

# Changing index and column labels in the correlation matrix
correlation_matrix = correlation_matrix.rename(index=new_names, columns=new_names)

sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()


In [None]:
# Source column -> label shown on the heatmap axes. The key order is also
# the row/column order of the correlation matrix.
new_names = {
    'new_confirmed': 'New Confirmations',
    'new_deceased': 'New Deaths',
    'new_vaccine_doses_administered': 'New Vaccine Doses',
    'cumulative_confirmed': 'Cumulative Confirmations',
    'cumulative_deceased': 'Cumulative Deaths',
    'cumulative_vaccine_doses_administered': 'Cumulative Vaccine Doses Administered'
}
columns = list(new_names)

# Correlation of the daily and cumulative counters, relabelled for display.
correlation_matrix = (
    df[columns]
    .corr()
    .rename(index=new_names, columns=new_names)
)

sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()


In [None]:
# Normalized daily counters against the economic indicators.
columns = ['new_confirmed_normalized', 'new_deceased_normalized', 'new_vaccine_doses_administered_normalized', 'GDP_normalized', 'Unemployment, total (% of total labor force)', 'median_salary', 'average_salary', 'lowest_salary', 'highest_salary']

# Calculation of the correlation matrix
correlation_matrix = df[columns].corr()

# Human-readable axis labels. GDP_normalized is GDP / Population * 1000, so
# it is labeled "per 1k" (matching the full-matrix heatmap), not "per capita".
new_names = {
    'new_confirmed_normalized': 'New Confirmations (per 1k)',
    'new_deceased_normalized': 'New Deaths (per 1k)',
    'new_vaccine_doses_administered_normalized': 'New Vaccine Doses (per 1k)',
    'GDP_normalized': 'GDP (per 1k)',
    'Unemployment, total (% of total labor force)': 'Unemployment (% of total labor force)',
    'median_salary': 'Median Salary',
    'average_salary': 'Average Salary',
    'lowest_salary': 'Lowest Salary',
    'highest_salary': 'Highest Salary'
}

# Changing index and column labels in the correlation matrix
correlation_matrix = correlation_matrix.rename(index=new_names, columns=new_names)

plt.figure(figsize=(12, 8))

sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()


In [None]:
# Source column -> label shown on the heatmap axes. The key order is also
# the row/column order of the correlation matrix.
new_names = {
    'new_confirmed_normalized': 'New Confirmations (per 1k)',
    'new_deceased_normalized': 'New Deaths (per 1k)',
    'new_vaccine_doses_administered_normalized': 'New Vaccine Doses (per 1k)',
    'Population': 'Population',
    'Density (per km²)': 'Density (per km²)',
    'Growth Rate': 'Growth Rate'
}
columns = list(new_names)

# Normalized daily counters against the demographic indicators.
correlation_matrix = (
    df[columns]
    .corr()
    .rename(index=new_names, columns=new_names)
)

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()


In [None]:
df['date'] = pd.to_datetime(df['date'])

# Normalized metrics, in the order their heatmaps are drawn.
peak_metrics = [
    'new_confirmed_normalized',
    'new_deceased_normalized',
    'cumulative_confirmed_normalized',
    'cumulative_deceased_normalized',
    'new_vaccine_doses_administered_normalized',
    'cumulative_vaccine_doses_administered_normalized',
]

# Health-system indicators every metric is correlated against.
health_columns = [
    'smoking_prevalence',
    'diabetes_prevalence',
    'hospital_beds_per_1000',
    'nurses_per_1000',
    'physicians_per_1000',
    'health_expenditure_usd',
]

# Keep only rows where all six metrics are present.
df_filtered = df.dropna(subset=peak_metrics)

# Human-readable axis labels, now including the cumulative metrics, which
# were previously missing and therefore shown under their raw column names.
new_names = {
    'new_confirmed_normalized': 'New Confirmations (per 1k)',
    'new_deceased_normalized': 'New Deaths (per 1k)',
    'new_vaccine_doses_administered_normalized': 'New Vaccine Doses (per 1k)',
    'cumulative_confirmed_normalized': 'Cumulative Confirmations (per 1k)',
    'cumulative_deceased_normalized': 'Cumulative Deaths (per 1k)',
    'cumulative_vaccine_doses_administered_normalized': 'Cumulative Vaccine Doses (per 1k)',
    'smoking_prevalence': 'Smoking Prevalence',
    'diabetes_prevalence': 'Diabetes Prevalence',
    'hospital_beds_per_1000': 'Hospital Beds (per 1k)',
    'nurses_per_1000': 'Nurses (per 1k)',
    'physicians_per_1000': 'Physicians (per 1k)',
    'health_expenditure_usd': 'Health Expenditure (USD)'
}

# For every metric: take, per country, the row where that metric peaks, and
# correlate the peak value with the health indicators. Each matrix is built
# from the peak rows of its own metric; previously both vaccine matrices
# were computed from the rows where deaths peaked.
peak_correlations = {}
for metric in peak_metrics:
    peak_index = df_filtered.groupby('iso_3166_1_alpha_3')[metric].idxmax()
    peak_rows = df_filtered.loc[peak_index]

    peak_correlations[metric] = (
        peak_rows[[metric] + health_columns]
        .corr()
        .rename(index=new_names, columns=new_names)
    )

    plt.figure(figsize=(12, 8))
    sns.heatmap(peak_correlations[metric], annot=True, cmap='coolwarm')
    plt.title(f"Correlation at each country's peak: {new_names[metric]}")
    plt.show()


In [None]:
# Source column -> label shown on the heatmap axes. The key order is also
# the row/column order of the correlation matrix.
new_names = {
    'new_confirmed_normalized': 'New Confirmations (per 1k)',
    'new_deceased_normalized': 'New Deaths (per 1k)',
    'new_vaccine_doses_administered_normalized': 'New Vaccine Doses (per 1k)',
    'GDP_normalized': 'GDP (per 1k)',
    'new_tested_normalized': 'New Tests (per 1k)',
    'cumulative_confirmed': 'Cumulative Confirmations',
    'cumulative_deceased': 'Cumulative Deaths',
    'cumulative_vaccine_doses_administered': 'Cumulative Vaccine Doses Administered',
    'Unemployment, total (% of total labor force)': 'Unemployment (% of total labor force)',
    'median_salary': 'Median Salary',
    'average_salary': 'Average Salary',
    'lowest_salary': 'Lowest Salary',
    'highest_salary': 'Highest Salary',
    'Population': 'Population',
    'Density (per km²)': 'Density (per km²)',
    'Growth Rate': 'Growth Rate',
    'smoking_prevalence': 'Smoking Prevalence',
    'diabetes_prevalence': 'Diabetes Prevalence',
    'hospital_beds_per_1000': 'Hospital Beds (per 1k)',
    'nurses_per_1000': 'Nurses (per 1k)',
    'physicians_per_1000': 'Physicians (per 1k)',
    'health_expenditure_usd': 'Health Expenditure (USD)'
}
columns = list(new_names)

# One correlation matrix across all of the columns above, relabelled for display.
correlation_matrix = (
    df[columns]
    .corr()
    .rename(index=new_names, columns=new_names)
)

plt.figure(figsize=(28, 14))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()
