# Data Cleaning

## 1. Importando Librerías

In [224]:
import pandas as pd
import numpy as np
from datetime import datetime

## 2. Creación de `df` con `covid-variants.csv`

In [225]:
# Load the raw variant-sequencing table (one row per location, date and variant).
variants_csv = '../data_extra/covid-variants.csv'
df = pd.read_csv(variants_csv)

In [226]:
# Preview the freshly loaded frame (pandas truncates the display to the first and last rows).
df

Unnamed: 0,location,date,variant,num_sequences,perc_sequences,num_sequences_total
0,Angola,2020-07-06,Alpha,0,0.0,3
1,Angola,2020-07-06,B.1.1.277,0,0.0,3
2,Angola,2020-07-06,B.1.1.302,0,0.0,3
3,Angola,2020-07-06,B.1.1.519,0,0.0,3
4,Angola,2020-07-06,B.1.160,0,0.0,3
...,...,...,...,...,...,...
94675,Zimbabwe,2021-11-01,Omicron,0,0.0,6
94676,Zimbabwe,2021-11-01,S:677H.Robin1,0,0.0,6
94677,Zimbabwe,2021-11-01,S:677P.Pelican,0,0.0,6
94678,Zimbabwe,2021-11-01,others,0,0.0,6


In [227]:
# Inspect the column dtypes before any cleaning is applied.
df.dtypes

location                object
date                    object
variant                 object
num_sequences            int64
perc_sequences         float64
num_sequences_total      int64
dtype: object

In [228]:
# Remove every space from the location names so they can be matched against the
# space-stripped country names of the geographic table when the two are merged.
# The vectorised literal replacement (regex=False) leaves missing values as NaN,
# whereas a per-element lambda would raise AttributeError on them.
df['location'] = df['location'].str.replace(' ', '', regex=False)

In [229]:
# The sequence counts and percentages are not needed downstream; keep only
# location, date and variant.
df = df.drop(columns=['num_sequences', 'perc_sequences', 'num_sequences_total'])

In [230]:
# List the distinct variant labels present in the data.
df['variant'].unique()

array(['Alpha', 'B.1.1.277', 'B.1.1.302', 'B.1.1.519', 'B.1.160',
       'B.1.177', 'B.1.221', 'B.1.258', 'B.1.367', 'B.1.620', 'Beta',
       'Delta', 'Epsilon', 'Eta', 'Gamma', 'Iota', 'Kappa', 'Lambda',
       'Mu', 'Omicron', 'S:677H.Robin1', 'S:677P.Pelican', 'others',
       'non_who'], dtype=object)

In [231]:
# Discard the catch-all buckets ('others', 'non_who'); only rows carrying a
# specific variant label are kept.
is_bucket_label = df['variant'].isin(['others', 'non_who'])
df = df[~is_bucket_label]

In [232]:
# Rebuild a contiguous 0..n-1 index after the row filtering; the old index is discarded.
df = df.reset_index(drop=True)

In [233]:
# Rename the key column to 'country', the name the geographic table uses for the merge.
df = df.rename(columns={'location':'country'})

In [234]:
df['date'] = pd.to_datetime(df['date'])

# ISO calendar components of every date. The ISO year (not the calendar year)
# must be paired with the ISO week: e.g. 2021-01-03 belongs to ISO week 53 of
# ISO year 2020, so combining dt.year with the ISO week would label it "2021-53".
iso_calendar = df['date'].dt.isocalendar()


def str_fixer(value):
    """Return ``value`` as a string, zero-padded on the left to two digits.

    Accepts anything ``int()`` can parse (e.g. ``5`` -> ``'05'``, ``12`` -> ``'12'``).
    """
    return str(int(value)).zfill(2)


# 'year' is stored as int64; 'week' is stored as a two-character string so that
# the combined labels sort chronologically as plain text.
df['year'] = iso_calendar['year'].astype(np.int64)
df['week'] = iso_calendar['week'].astype(np.int64).map(str_fixer)

# Combined "<ISO year>-<ISO week>" label, e.g. "2020-28".
df["yearWeek"] = df["year"].astype(str) + "-" + df["week"]

In [235]:
# Show every distinct year-week label in ascending order.
year_weeks_sorted = sorted(df["yearWeek"].unique())
year_weeks_sorted

['2020-20',
 '2020-22',
 '2020-24',
 '2020-26',
 '2020-28',
 '2020-30',
 '2020-32',
 '2020-34',
 '2020-36',
 '2020-38',
 '2020-40',
 '2020-42',
 '2020-44',
 '2020-46',
 '2020-48',
 '2020-50',
 '2020-52',
 '2021-01',
 '2021-02',
 '2021-04',
 '2021-06',
 '2021-08',
 '2021-10',
 '2021-12',
 '2021-14',
 '2021-16',
 '2021-18',
 '2021-20',
 '2021-22',
 '2021-24',
 '2021-26',
 '2021-28',
 '2021-30',
 '2021-32',
 '2021-34',
 '2021-36',
 '2021-38',
 '2021-40',
 '2021-42',
 '2021-44',
 '2021-46',
 '2021-48',
 '2021-50']

In [236]:
# Preview the frame now that the year, week and yearWeek columns exist.
df

Unnamed: 0,country,date,variant,year,week,yearWeek
0,Angola,2020-07-06,Alpha,2020,28,2020-28
1,Angola,2020-07-06,B.1.1.277,2020,28,2020-28
2,Angola,2020-07-06,B.1.1.302,2020,28,2020-28
3,Angola,2020-07-06,B.1.1.519,2020,28,2020-28
4,Angola,2020-07-06,B.1.160,2020,28,2020-28
...,...,...,...,...,...,...
86785,Zimbabwe,2021-11-01,Lambda,2021,44,2021-44
86786,Zimbabwe,2021-11-01,Mu,2021,44,2021-44
86787,Zimbabwe,2021-11-01,Omicron,2021,44,2021-44
86788,Zimbabwe,2021-11-01,S:677H.Robin1,2021,44,2021-44


## 4. Añado datos geográficos y población a `df`

In [237]:
# Load the country table (capital name and coordinates, country code, continent)
# and preview it.
df_loc = pd.read_csv("../data_extra/concap.csv")
df_loc

Unnamed: 0,CountryName,CapitalName,CapitalLatitude,CapitalLongitude,CountryCode,ContinentName
0,Somaliland,Hargeisa,9.550000,44.050000,,Africa
1,South Georgia and South Sandwich Islands,King Edward Point,-54.283333,-36.500000,GS,Antarctica
2,French Southern and Antarctic Lands,Port-aux-Français,-49.350000,70.216667,TF,Antarctica
3,Palestine,Jerusalem,31.766667,35.233333,PS,Asia
4,Aland Islands,Mariehamn,60.116667,19.900000,AX,Europe
...,...,...,...,...,...,...
240,Northern Cyprus,North Nicosia,35.183333,33.366667,,Europe
241,Hong Kong,,0.000000,0.000000,HK,Asia
242,Heard Island and McDonald Islands,,0.000000,0.000000,HM,Antarctica
243,British Indian Ocean Territory,Diego Garcia,-7.300000,72.400000,IO,Africa


In [238]:
# Remove every space from the country names so they line up with the
# space-stripped names used in the variants frame.
# The vectorised literal replacement (regex=False) leaves missing values as NaN,
# whereas a per-element lambda would raise AttributeError on them.
df_loc['CountryName'] = df_loc['CountryName'].str.replace(' ', '', regex=False)

In [239]:
# Keep only the join key, the capital's coordinates and the continent, remove
# fully identical rows, and rename the columns to the names used in the final table.
df_loc = (
    df_loc
    .drop(columns=['CapitalName', 'CountryCode'])
    .drop_duplicates()
    .rename(columns={
        'CountryName': 'country',
        'CapitalLatitude': 'latitude',
        'CapitalLongitude': 'longitude',
        'ContinentName': 'continentExp',
    })
)
df_loc.head(3)


Unnamed: 0,country,latitude,longitude,continentExp
0,Somaliland,9.55,44.05,Africa
1,SouthGeorgiaandSouthSandwichIslands,-54.283333,-36.5,Antarctica
2,FrenchSouthernandAntarcticLands,-49.35,70.216667,Antarctica


In [240]:
# The inner join below discards every row whose country has no exact name match
# in df_loc. Report those countries first so the loss is visible, not silent.
countries_without_geo = (
    df.loc[~df['country'].isin(df_loc['country']), 'country'].dropna().unique()
)
if len(countries_without_geo):
    print(f"Paises sin datos geograficos (filas descartadas): {sorted(countries_without_geo)}")

# Attach latitude, longitude and continent to every variant row.
df = pd.merge(df, df_loc, how='inner', on='country')

In [241]:
# Use the observation date as the index; 'date' stops being a regular column.
df = df.set_index('date')

In [242]:
# Sanity report on the merged frame: duplicated rows and missing values.
# drop_duplicates() compares column values only, so the index is not part of
# the comparison.
total_rows = len(df)
unique_rows = len(df.drop_duplicates())
nulls_per_column = df.isna().sum()

print(f"Filas df: {total_rows}\nFilas df sin duplicados: {unique_rows}")
print(f"Hay {total_rows - unique_rows} filas duplicadas")
print(f"Hay {nulls_per_column.sum()} registros nulos en total")
print(nulls_per_column)

Filas df: 83138
Filas df sin duplicados: 83138
Hay 0 filas duplicadas
Hay 0 registros nulos en total
country         0
variant         0
year            0
week            0
yearWeek        0
latitude        0
longitude       0
continentExp    0
dtype: int64


In [243]:
# Preview the final table, now indexed by date, before exporting it.
df

Unnamed: 0_level_0,country,variant,year,week,yearWeek,latitude,longitude,continentExp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-07-06,Angola,Alpha,2020,28,2020-28,-8.833333,13.216667,Africa
2020-07-06,Angola,B.1.1.277,2020,28,2020-28,-8.833333,13.216667,Africa
2020-07-06,Angola,B.1.1.302,2020,28,2020-28,-8.833333,13.216667,Africa
2020-07-06,Angola,B.1.1.519,2020,28,2020-28,-8.833333,13.216667,Africa
2020-07-06,Angola,B.1.160,2020,28,2020-28,-8.833333,13.216667,Africa
...,...,...,...,...,...,...,...,...
2021-11-01,Zimbabwe,Lambda,2021,44,2021-44,-17.816667,31.033333,Africa
2021-11-01,Zimbabwe,Mu,2021,44,2021-44,-17.816667,31.033333,Africa
2021-11-01,Zimbabwe,Omicron,2021,44,2021-44,-17.816667,31.033333,Africa
2021-11-01,Zimbabwe,S:677H.Robin1,2021,44,2021-44,-17.816667,31.033333,Africa


In [244]:
# Write the cleaned table to disk; the index is written as the first CSV column
# (to_csv keeps the index by default).
df.to_csv('../VariantsDB.csv')