# Combine zika virus dataset with population density

In [1]:
# import library
import datetime
import re
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

# NOTE(review): this hides every warning raised later in the notebook,
# including pandas deprecation notices -- consider narrowing the filter.
warnings.simplefilter("ignore")

# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# ZikaVirus Dataset
Countries : 
1. Argentina 
1. Brazil
1. Colombia
1. Dominican Republic
1. Ecuador
1. El Salvador
1. Guatemala
1. Haiti
1. Mexico
1. Nicaragua
1. Panama
1. Puerto Rico
1. U.S. Virgin Islands
1. United States

In [64]:
# Load the Zika case reports from a CSV (path relative to the working directory)
# and preview the first rows.
zika = pd.read_csv('zikavirus_dataset.csv')
zika.head()

Unnamed: 0,report_date,location,location_type,data_field,cases
0,2017-01-12,Argentina-Buenos_Aires,province,cumulative_confirmed_imported_cases,0.0
1,2017-01-12,Argentina-CABA,province,cumulative_confirmed_imported_cases,1.0
2,2017-01-12,Argentina-Cordoba,province,cumulative_confirmed_imported_cases,2.0
3,2017-01-12,Argentina-Entre_Rios,province,cumulative_confirmed_imported_cases,0.0
4,2017-01-12,Argentina-Santa_Fe,province,cumulative_confirmed_imported_cases,2.0


In [65]:
# Size of the raw Zika table as (rows, columns).
zika.shape

(242450, 5)

In [67]:
# Clean the raw Zika table in one pass, without in-place mutation:
#   1. drop rows in which every column is missing;
#   2. discard rows whose report_date holds the malformed value '18437'
#      (a boolean mask is used instead of dropping by index label, so the
#      result does not depend on index labels being unique);
#   3. cast cases to float, treat missing counts as 0, then store as integers;
#   4. parse report_date as a datetime;
#   5. derive year and month from the parsed date.
# Rows with a missing report_date are kept; their year/month are NaN, which is
# why those two columns are floating point.
zika = (
    zika
    .dropna(how='all')
    .loc[lambda df: df['report_date'] != '18437']
    .assign(
        cases=lambda df: df['cases'].astype(float).fillna(0).astype(int),
        report_date=lambda df: pd.to_datetime(df['report_date'], format="%Y-%m-%d"),
        # assign() evaluates keywords in order, so report_date is already a datetime here
        year=lambda df: df['report_date'].dt.year,
        month=lambda df: df['report_date'].dt.month,
    )
)

In [68]:
# Preview the cleaned table, which now carries the derived year and month columns.
zika.head()

Unnamed: 0,report_date,location,location_type,data_field,cases,year,month
0,2017-01-12,Argentina-Buenos_Aires,province,cumulative_confirmed_imported_cases,0,2017.0,1.0
1,2017-01-12,Argentina-CABA,province,cumulative_confirmed_imported_cases,1,2017.0,1.0
2,2017-01-12,Argentina-Cordoba,province,cumulative_confirmed_imported_cases,2,2017.0,1.0
3,2017-01-12,Argentina-Entre_Rios,province,cumulative_confirmed_imported_cases,0,2017.0,1.0
4,2017-01-12,Argentina-Santa_Fe,province,cumulative_confirmed_imported_cases,2,2017.0,1.0


# Population Density Dataset
This dataset contains the population density (`density_per_km`) for locations within the 14 countries listed above.

In [76]:
# Load the population density table and preview the first rows.
# NOTE(review): the merge further down assumes each location appears only once
# in this table -- confirm, otherwise the join will duplicate rows.
popden = pd.read_csv('06_population_density.csv')
popden.head()

Unnamed: 0,location,density_per_km
0,Argentina-Buenos_Aires,12625.800781
1,Argentina-CABA,12625.800781
2,Argentina-Cordoba,2404.108887
3,Argentina-Entre_Rios,72.495293
4,Argentina-Santa_Fe,208.092285


In [77]:
# Size of the density table as (rows, columns).
popden.shape

(1714, 2)

## Total cases per location

In [115]:
# Total cases per location.
# The aggregation is passed by name ('sum'): handing pandas the Python builtin
# `sum` is deprecated and will stop mapping to the fast groupby sum.
# The resulting column is still called 'sum'.
# NOTE(review): data_field includes cumulative_* series; adding cumulative
# counts across report dates may overcount -- confirm this total is intended.
zika_location = (
    zika.groupby(['location'])['cases']
    .agg(['sum'])
    .reset_index()
)
zika_location.head()

Unnamed: 0,location,sum
0,Argentina-Buenos_Aires,793
1,Argentina-CABA,647
2,Argentina-Catamarca,59
3,Argentina-Chaco,460
4,Argentina-Chubut,47


In [116]:
# Attach the population density of each location.
# A left join keeps every location from the case totals; a location that is
# absent from popden ends up with NaN in density_per_km.
zika_location = zika_location.merge(popden, how='left', on='location')
zika_location.head()

Unnamed: 0,location,sum,density_per_km
0,Argentina-Buenos_Aires,793,12625.800781
1,Argentina-CABA,647,12625.800781
2,Argentina-Catamarca,59,460.153595
3,Argentina-Chaco,460,121.33165
4,Argentina-Chubut,47,37.095642


In [117]:
# Count missing values per column; locations without a match in popden have NaN density.
zika_location.isnull().sum()

location            0
sum                 0
density_per_km    122
dtype: int64

In [118]:
# Median density over the locations that have a value (NaN entries are skipped by default).
zika_location['density_per_km'].median()

70.99121856689453

In [119]:
# Impute missing densities with the median of the known densities.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection mutates a temporary object and does not update the
# DataFrame under pandas Copy-on-Write.
# NOTE(review): a single global median is a coarse stand-in for the
# unmatched locations -- confirm this is acceptable for the analysis.
zika_location['density_per_km'] = zika_location['density_per_km'].fillna(
    zika_location['density_per_km'].median()
)

In [120]:
# Re-check missing values per column after the density imputation.
zika_location.isnull().sum()

location          0
sum               0
density_per_km    0
dtype: int64

In [124]:
# Rename the aggregated 'sum' column to 'cases' for the exported file.
zika_location = zika_location.rename(columns={'sum': 'cases'})

# Create the output folder if it is missing so the export also works on a
# fresh checkout, then write the table without the index column.
output_path = Path('Data/ZikaWithDensity.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
zika_location.to_csv(output_path, index=False)