In [1]:
import pandas as pd 

In [2]:
from sklearn.datasets import load_iris, load_boston
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [3]:
# Leemos el DataFrame sobre el que vamos a trabajar
covid = pd.read_csv("owid-covid-data.csv")

# Cambiamos el formato de la columna data a datetime
covid['date'] = pd.to_datetime(covid['date'], format='%Y-%m-%d')

# Renombramos la columna "location" y la llamamos "country" y "human_development_index": "HDI"
covid.rename(columns={"location": "country", "human_development_index": "HDI"}, inplace=True)
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89679 entries, 0 to 89678
Data columns (total 59 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   iso_code                               89679 non-null  object        
 1   continent                              85406 non-null  object        
 2   country                                89679 non-null  object        
 3   date                                   89679 non-null  datetime64[ns]
 4   total_cases                            86654 non-null  float64       
 5   new_cases                              86652 non-null  float64       
 6   new_cases_smoothed                     85651 non-null  float64       
 7   total_deaths                           76856 non-null  float64       
 8   new_deaths                             77014 non-null  float64       
 9   new_deaths_smoothed                    85651 non-null  float6

In [4]:
# Creamos un DataFrame con las columnas que queremos
covid_data = covid.filter(["continent", "country", "median_age", "aged_65_older", "aged_70_older", "total_cases", "total_deaths", "hospital_beds_per_thousand", "gdp_per_capita", "HDI"], axis=1)

# Añadimos nuevas columnas: el ratio de muertes y el porcentaje de personas con menos y más riesgo de muerte
covid_data["deaths_ratio"] = covid_data["total_deaths"]/covid_data["total_cases"] * 100
covid_data["non_risky_age"] = 100 - covid_data["aged_65_older"] - covid_data["aged_70_older"]
covid_data["risky_age"] = 100 - covid_data["non_risky_age"]

# Reordeno las columnas
covid_data = covid_data[["continent", "country", "median_age", "aged_65_older", "aged_70_older", "non_risky_age", "risky_age", "total_cases", "total_deaths", "deaths_ratio", "hospital_beds_per_thousand", "gdp_per_capita", "HDI"]]

# Eliminamos los valores Nan y reseteamos el índice
covid_data.dropna(inplace=True)
covid_data.reset_index(drop=True, inplace=True)
covid_data

Unnamed: 0,continent,country,median_age,aged_65_older,aged_70_older,non_risky_age,risky_age,total_cases,total_deaths,deaths_ratio,hospital_beds_per_thousand,gdp_per_capita,HDI
0,Asia,Afghanistan,18.6,2.581,1.337,96.082,3.918,34.0,1.0,2.941176,0.5,1803.987,0.511
1,Asia,Afghanistan,18.6,2.581,1.337,96.082,3.918,41.0,1.0,2.439024,0.5,1803.987,0.511
2,Asia,Afghanistan,18.6,2.581,1.337,96.082,3.918,43.0,1.0,2.325581,0.5,1803.987,0.511
3,Asia,Afghanistan,18.6,2.581,1.337,96.082,3.918,76.0,2.0,2.631579,0.5,1803.987,0.511
4,Asia,Afghanistan,18.6,2.581,1.337,96.082,3.918,80.0,3.0,3.750000,0.5,1803.987,0.511
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61945,Africa,Zimbabwe,19.6,2.822,1.882,95.296,4.704,38535.0,1582.0,4.105359,1.7,1899.775,0.571
61946,Africa,Zimbabwe,19.6,2.822,1.882,95.296,4.704,38554.0,1582.0,4.103336,1.7,1899.775,0.571
61947,Africa,Zimbabwe,19.6,2.822,1.882,95.296,4.704,38560.0,1582.0,4.102697,1.7,1899.775,0.571
61948,Africa,Zimbabwe,19.6,2.822,1.882,95.296,4.704,38572.0,1582.0,4.101421,1.7,1899.775,0.571


In [5]:
covid_groupby = covid_data.groupby("country").max()
covid_groupby.reset_index(inplace=True)
covid_groupby

Unnamed: 0,country,continent,median_age,aged_65_older,aged_70_older,non_risky_age,risky_age,total_cases,total_deaths,deaths_ratio,hospital_beds_per_thousand,gdp_per_capita,HDI
0,Afghanistan,Asia,18.6,2.581,1.337,96.082,3.918,63819.0,2751.0,4.424171,0.50,1803.987,0.511
1,Albania,Europe,38.0,13.188,8.643,78.169,21.831,132071.0,2436.0,8.333333,2.89,11803.431,0.795
2,Algeria,Africa,29.1,6.211,3.857,89.932,10.068,125693.0,3388.0,15.784165,1.90,13913.839,0.748
3,Antigua and Barbuda,North America,32.1,6.933,4.631,88.436,11.564,1251.0,42.0,13.043478,3.80,21490.943,0.778
4,Argentina,South America,31.9,11.198,7.441,81.361,18.639,3371508.0,71771.0,8.333333,5.00,18933.907,0.845
...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,Venezuela,South America,29.0,6.614,3.915,89.471,10.529,217603.0,2428.0,5.389222,0.80,16745.022,0.711
151,Vietnam,Asia,32.6,7.150,4.718,88.132,11.868,4512.0,37.0,3.346080,2.60,6171.884,0.704
152,Yemen,Asia,20.3,2.922,1.583,95.495,4.505,6586.0,1297.0,33.333333,0.70,1479.147,0.470
153,Zambia,Africa,17.7,2.480,1.542,95.978,4.022,92520.0,1263.0,5.000000,2.00,3689.251,0.584


In [None]:
# Leemos el csv de pobreza
pobreza = pd.read_csv("pobreza_ratio.csv")
pobreza.rename(columns={"Country": "country"}, inplace=True)

# Unimos el DataFrame "covid_data_groupby" con este de pobreza
covid_data_pobreza = pd.merge(covid_groupby, pobreza, on="country")
covid_data_pobreza.drop("Unnamed: 0", axis=1, inplace=True)
covid_data_pobreza


In [None]:
covid_data.to_csv("0-covid_data.csv")
covid_groupby.to_csv("0-covid_groupby.csv")
covid_data_pobreza.to_csv("0_covid_pobreza.csv")

In [None]:
# Hacemos un heatmap para ver la correlacion entre las diferentes columnas del DataFrame
plt.figure(figsize=(10,10))
sns.heatmap(covid_data_pobreza.corr(), annot=True)

In [None]:
covid_data_pobreza.corr()

"""Interpretación de las correlaciones: 

median_age : 
- Mucha correlación positiva: cuanto mayor 