# Title 

## Data Cleaning  
---

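In this notebook, the two ECDC measles surveillance data sets (case reports and vaccination coverage) are loaded, combined, cleaned, and saved as a single CSV file.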

### Table of contents
---

### Import packages 

In [1]:
# Miscellaneous setup
import warnings
# Ignore every FutureWarning raised from this point on
# (presumably to keep the cell outputs uncluttered -- confirm this is still wanted).
warnings.simplefilter(action='ignore', category=FutureWarning)

#Import processing tools
import pandas as pd 

### First look

In [2]:
# Load the measles report data set and preview its first five rows
outbreak = pd.read_csv(
    filepath_or_buffer='../data/ECDC_surveillance_data_Measles_reports.csv'
)
outbreak.head()

Unnamed: 0,HealthTopic,Population,Indicator,Unit,Time,RegionCode,RegionName,NumValue,TxtValue
0,Measles,All cases,Notification rate,N/1000000,1999-01,AT,Austria,0.25054929,
1,Measles,All cases,Notification rate,N/1000000,1999-01,AT,Austria,0.25054929,
2,Measles,All cases,Notification rate,N/1000000,1999-01,DK,Denmark,0.18819714,
3,Measles,All cases,Notification rate,N/1000000,1999-01,DK,Denmark,0.18819714,
4,Measles,All cases,Notification rate,N/1000000,1999-01,EL,Greece,1.58172375,


In [3]:
# Load the vaccination report data set and preview its first five rows
vaccine = pd.read_csv(
    filepath_or_buffer='../data/ECDC_surveillance_data_Measles.csv'
)
vaccine.head()

Unnamed: 0,HealthTopic,Population,Indicator,Unit,Time,RegionCode,RegionName,NumValue,TxtValue
0,Measles,Vaccination coverage,Vaccination coverage first dose,%,1999,AT,Austria,65.0,
1,Measles,Vaccination coverage,Vaccination coverage first dose,%,1999,AT,Austria,65.0,
2,Measles,Vaccination coverage,Vaccination coverage first dose,%,1999,BE,Belgium,82.0,
3,Measles,Vaccination coverage,Vaccination coverage first dose,%,1999,BE,Belgium,82.0,
4,Measles,Vaccination coverage,Vaccination coverage first dose,%,1999,BG,Bulgaria,96.0,


Checking the size of each data frame

In [4]:
# (rows, columns) of the outbreak report frame
outbreak.shape

(51416, 9)

In [5]:
# (rows, columns) of the vaccination report frame
vaccine.shape

(2222, 9)

In [6]:
# Inspect the raw Time values and their dtype in the outbreak reports
outbreak['Time'].head()

0    1999-01
1    1999-01
2    1999-01
3    1999-01
4    1999-01
Name: Time, dtype: object

In [7]:
# Inspect the raw Time values and their dtype in the vaccination reports
vaccine['Time'].head()

0    1999
1    1999
2    1999
3    1999
4    1999
Name: Time, dtype: int64

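**Observation**: `Time` is stored as text in `YYYY-MM` form in the outbreak data and as an integer year in the vaccination data. Both columns are converted to `datetime64` below so that they share one type.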

In [8]:
# Parse both Time columns into datetime64 values.
# The outbreak format is left for pandas to infer; the vaccination years use an explicit '%Y'.
outbreak = outbreak.assign(Time=pd.to_datetime(outbreak['Time']))
vaccine = vaccine.assign(Time=pd.to_datetime(vaccine['Time'], format='%Y'))

In [9]:
# Check the outbreak Time column after the datetime conversion
outbreak['Time'].head()

0   1999-01-01
1   1999-01-01
2   1999-01-01
3   1999-01-01
4   1999-01-01
Name: Time, dtype: datetime64[ns]

In [10]:
# Check the vaccination Time column after the datetime conversion
vaccine['Time'].head()

0   1999-01-01
1   1999-01-01
2   1999-01-01
3   1999-01-01
4   1999-01-01
Name: Time, dtype: datetime64[ns]

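**Observation:** Both `Time` columns are now `datetime64[ns]`, with each value set to the first day of its month or year.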

Both data frames share the same columns, so they are combined into a single frame and cleaned together instead of repeating every step twice.

In [11]:
# Stack the outbreak and vaccination frames (same columns) into a single frame.
# ignore_index=True renumbers the rows 0..n-1. Without it each frame keeps its own
# labels, so the labels of the vaccination rows would duplicate labels already used
# by outbreak rows and label-based .loc lookups would become ambiguous.
measles = pd.concat([outbreak, vaccine], ignore_index=True)

### Cleaning process

#### Dropping Duplicates

In [12]:
# Count rows that repeat an earlier row in every column
measles.duplicated().sum()

26819

In [13]:
# Keep only the first occurrence of each fully identical row
measles = measles.drop_duplicates()

#### Renaming columns

In [14]:
# Normalise every column label to lower case, then preview the result
measles = measles.rename(columns=str.lower)
measles.head()

Unnamed: 0,healthtopic,population,indicator,unit,time,regioncode,regionname,numvalue,txtvalue
0,Measles,All cases,Notification rate,N/1000000,1999-01-01,AT,Austria,0.25054929,
2,Measles,All cases,Notification rate,N/1000000,1999-01-01,DK,Denmark,0.18819714,
4,Measles,All cases,Notification rate,N/1000000,1999-01-01,EL,Greece,1.58172375,
6,Measles,All cases,Notification rate,N/1000000,1999-01-01,EU_EEA31,EU/EEA,0.822608,
8,Measles,All cases,Notification rate,N/1000000,1999-01-01,FI,Finland,0.0,


#### Checking NaNs

In [15]:
# Number of missing values in each column
measles.isnull().sum()

healthtopic        0
population         0
indicator          0
unit               0
time               0
regioncode         0
regionname         0
numvalue           0
txtvalue       26819
dtype: int64

In [16]:
# How many rows hold the '-' placeholder instead of a number in numvalue?
measles['numvalue'].eq('-').sum()

771

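**Observation**: 771 rows hold the string `'-'` in `numvalue` instead of a number. These placeholders are replaced with nulls below so that they are counted as missing values.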

In [17]:
# Turn the '-' placeholder into a missing value, then recount the nulls per column
dash_rows = measles['numvalue'].eq('-')
measles.loc[dash_rows, 'numvalue'] = None
measles.isna().sum()

healthtopic        0
population         0
indicator          0
unit               0
time               0
regioncode         0
regionname         0
numvalue         771
txtvalue       26819
dtype: int64

#### Dropping uninformative columns

In [18]:
# Remove the three columns this notebook treats as uninformative
measles = measles.drop(columns=['txtvalue', 'healthtopic', 'population'])

#### Checking data types

In [19]:
# Inspect the dtype of every remaining column
measles.dtypes

indicator             object
unit                  object
time          datetime64[ns]
regioncode            object
regionname            object
numvalue              object
dtype: object

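**Observation:** `numvalue` is still typed as `object` even though it holds numeric measurements, so it is converted to a numeric dtype below.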

In [20]:
# Cast numvalue from object to a numeric dtype (a value that cannot be parsed raises)
measles = measles.assign(numvalue=pd.to_numeric(measles['numvalue']))

In [21]:
# Re-check the dtypes after the pd.to_numeric conversion of numvalue
measles.dtypes

indicator             object
unit                  object
time          datetime64[ns]
regioncode            object
regionname            object
numvalue             float64
dtype: object

NEXT STEP

In [22]:
# Write the cleaned frame to a new CSV file; index=False leaves the row labels out
measles.to_csv(path_or_buf='../data/measles.csv', index=False)