## Import CSV

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the 2018 WUENIC CSV, given relative to the working directory.
wuenic = "../archive/2018_wuenic.csv"
# Parse the file with pandas' default options.
wuenic_df = pd.read_csv(filepath_or_buffer=wuenic)
# Preview the first five rows.
wuenic_df.head(n=5)

Unnamed: 0,Group,Subgroup,Name,Year,Vaccine,Coverage,Vaccinated,Target,Source
0,UNICEF Regions,ROSA,Afghanistan,2018,BCG,78,941000,1207000,WHO/UNICEF estimates of national immunization ...
1,UNICEF Regions,ROSA,Afghanistan,2018,DTP1,73,836000,1146000,WHO/UNICEF estimates of national immunization ...
2,UNICEF Regions,ROSA,Afghanistan,2018,DTP3,66,756000,1146000,WHO/UNICEF estimates of national immunization ...
3,UNICEF Regions,ROSA,Afghanistan,2018,Hepb3,66,756000,1146000,WHO/UNICEF estimates of national immunization ...
4,UNICEF Regions,ROSA,Afghanistan,2018,Hepbb,18,217000,1207000,WHO/UNICEF estimates of national immunization ...


## Remove columns: Group and Source 

In [3]:
# Drop the 'Group' and 'Source' columns.
# The result is rebound to wuenic_df (rather than using inplace=True) and
# errors='ignore' is passed so that this cell can be executed more than once:
# previously a second run raised KeyError because the columns were already gone.
wuenic_df = wuenic_df.drop(columns=['Group', 'Source'], errors='ignore')
wuenic_df

Unnamed: 0,Subgroup,Name,Year,Vaccine,Coverage,Vaccinated,Target
0,ROSA,Afghanistan,2018,BCG,78,941000,1207000
1,ROSA,Afghanistan,2018,DTP1,73,836000,1146000
2,ROSA,Afghanistan,2018,DTP3,66,756000,1146000
3,ROSA,Afghanistan,2018,Hepb3,66,756000,1146000
4,ROSA,Afghanistan,2018,Hepbb,18,217000,1207000
...,...,...,...,...,...,...,...
35569,ESAR,Zimbabwe,1997,DTP1,95,328000,345000
35570,ESAR,Zimbabwe,1997,DTP3,86,297000,345000
35571,ESAR,Zimbabwe,1997,Hepb3,16,55000,345000
35572,ESAR,Zimbabwe,1997,MCV1,84,290000,345000


# Remove characters: commas and the `<` and `>` symbols

In [None]:
# Strip commas and the '<' / '>' characters out of the string values in the
# Vaccinated column, then store the result with object dtype.
wuenic_df['Vaccinated'] = (
    wuenic_df['Vaccinated']
    .replace(',', '', regex=True)
    .replace('<', '', regex=True)
    .replace('>', '', regex=True)
    .astype('object')
)
wuenic_df

In [5]:
# Strip commas and the '<' / '>' characters out of the string values in the
# Target column, then store the result with object dtype.
wuenic_df['Target'] = (
    wuenic_df['Target']
    .replace(',', '', regex=True)
    .replace('<', '', regex=True)
    .replace('>', '', regex=True)
    .astype('object')
)
wuenic_df

Unnamed: 0,Subgroup,Name,Year,Vaccine,Coverage,Vaccinated,Target
0,ROSA,Afghanistan,2018,BCG,78,941000,1207000
1,ROSA,Afghanistan,2018,DTP1,73,836000,1146000
2,ROSA,Afghanistan,2018,DTP3,66,756000,1146000
3,ROSA,Afghanistan,2018,Hepb3,66,756000,1146000
4,ROSA,Afghanistan,2018,Hepbb,18,217000,1207000
...,...,...,...,...,...,...,...
35569,ESAR,Zimbabwe,1997,DTP1,95,328000,345000
35570,ESAR,Zimbabwe,1997,DTP3,86,297000,345000
35571,ESAR,Zimbabwe,1997,Hepb3,16,55000,345000
35572,ESAR,Zimbabwe,1997,MCV1,84,290000,345000


# Identify missing values: they appear as "—" in Vaccinated and Target

In [6]:
# Look at one row (label 32948) whose Vaccinated and Target cells hold '—'.
wuenic_df.loc[32948, :]

Subgroup        EAPR
Name          Tuvalu
Year            2007
Vaccine         RCV1
Coverage          95
Vaccinated         —
Target             —
Name: 32948, dtype: object

# Replace the missing-value symbol with None

In [7]:
# Swap the '—' placeholder for None in both count columns.
# The result is assigned back to the frame. Calling
# .replace(..., inplace=True) on wuenic_df['col'] acts on a temporary
# selection: pandas flags that as chained assignment, and with Copy-on-Write
# enabled the DataFrame itself is left unchanged.
wuenic_df['Vaccinated'] = wuenic_df['Vaccinated'].replace({'—': None})
wuenic_df['Target'] = wuenic_df['Target'].replace({'—': None})
wuenic_df

Unnamed: 0,Subgroup,Name,Year,Vaccine,Coverage,Vaccinated,Target
0,ROSA,Afghanistan,2018,BCG,78,941000,1207000
1,ROSA,Afghanistan,2018,DTP1,73,836000,1146000
2,ROSA,Afghanistan,2018,DTP3,66,756000,1146000
3,ROSA,Afghanistan,2018,Hepb3,66,756000,1146000
4,ROSA,Afghanistan,2018,Hepbb,18,217000,1207000
...,...,...,...,...,...,...,...
35569,ESAR,Zimbabwe,1997,DTP1,95,328000,345000
35570,ESAR,Zimbabwe,1997,DTP3,86,297000,345000
35571,ESAR,Zimbabwe,1997,Hepb3,16,55000,345000
35572,ESAR,Zimbabwe,1997,MCV1,84,290000,345000


# Check that the missing values were changed to None (example below)

In [8]:
# Re-inspect row label 32948 to confirm its Vaccinated and Target cells are now None.
wuenic_df.loc[32948, :]

Subgroup        EAPR
Name          Tuvalu
Year            2007
Vaccine         RCV1
Coverage          95
Vaccinated      None
Target          None
Name: 32948, dtype: object

# Drop null values and rename column "Name" to "Country"

In [11]:
# Rename 'Name' to 'Country' and discard every row that contains a missing
# value in any column. The result is a new frame; wuenic_df is not modified.
clean_df = (
    wuenic_df
    .rename(columns={'Name': 'Country'})
    .dropna(axis=0, how='any')
)
clean_df

Unnamed: 0,Subgroup,Country,Year,Vaccine,Coverage,Vaccinated,Target
0,ROSA,Afghanistan,2018,BCG,78,941000,1207000
1,ROSA,Afghanistan,2018,DTP1,73,836000,1146000
2,ROSA,Afghanistan,2018,DTP3,66,756000,1146000
3,ROSA,Afghanistan,2018,Hepb3,66,756000,1146000
4,ROSA,Afghanistan,2018,Hepbb,18,217000,1207000
...,...,...,...,...,...,...,...
35569,ESAR,Zimbabwe,1997,DTP1,95,328000,345000
35570,ESAR,Zimbabwe,1997,DTP3,86,297000,345000
35571,ESAR,Zimbabwe,1997,Hepb3,16,55000,345000
35572,ESAR,Zimbabwe,1997,MCV1,84,290000,345000


# Insert an "Index" column built by concatenating Year, Vaccine and Country

In [15]:
# Build one key per row by concatenating Year (as text), Vaccine and Country,
# e.g. '2018BCGAfghanistan'.
Combined = clean_df['Year'].astype(str) + clean_df['Vaccine'] + clean_df['Country']

# DataFrame.insert raises ValueError when the column already exists, so
# executing this cell a second time used to fail. Overwrite the existing
# column in that case; otherwise insert it as the first column.
if 'Index' in clean_df.columns:
    clean_df['Index'] = Combined
else:
    clean_df.insert(0, 'Index', Combined, allow_duplicates=False)

clean_df


Unnamed: 0,Index,Subgroup,Country,Year,Vaccine,Coverage,Vaccinated,Target
0,2018BCGAfghanistan,ROSA,Afghanistan,2018,BCG,78,941000,1207000
1,2018DTP1Afghanistan,ROSA,Afghanistan,2018,DTP1,73,836000,1146000
2,2018DTP3Afghanistan,ROSA,Afghanistan,2018,DTP3,66,756000,1146000
3,2018Hepb3Afghanistan,ROSA,Afghanistan,2018,Hepb3,66,756000,1146000
4,2018HepbbAfghanistan,ROSA,Afghanistan,2018,Hepbb,18,217000,1207000
...,...,...,...,...,...,...,...,...
35569,1997DTP1Zimbabwe,ESAR,Zimbabwe,1997,DTP1,95,328000,345000
35570,1997DTP3Zimbabwe,ESAR,Zimbabwe,1997,DTP3,86,297000,345000
35571,1997Hepb3Zimbabwe,ESAR,Zimbabwe,1997,Hepb3,16,55000,345000
35572,1997MCV1Zimbabwe,ESAR,Zimbabwe,1997,MCV1,84,290000,345000


# Re-organise columns 

In [25]:
# Left-to-right column order wanted in the final table.
column_order = [
    'Index',
    'Country',
    'Year',
    'Vaccine',
    'Subgroup',
    'Coverage',
    'Vaccinated',
    'Target',
]
# Build a new frame from clean_df with its columns arranged in that order.
clean_wuenic = pd.DataFrame(data=clean_df, columns=column_order)
clean_wuenic

Unnamed: 0,Index,Country,Year,Vaccine,Subgroup,Coverage,Vaccinated,Target
0,2018BCGAfghanistan,Afghanistan,2018,BCG,ROSA,78,941000,1207000
1,2018DTP1Afghanistan,Afghanistan,2018,DTP1,ROSA,73,836000,1146000
2,2018DTP3Afghanistan,Afghanistan,2018,DTP3,ROSA,66,756000,1146000
3,2018Hepb3Afghanistan,Afghanistan,2018,Hepb3,ROSA,66,756000,1146000
4,2018HepbbAfghanistan,Afghanistan,2018,Hepbb,ROSA,18,217000,1207000
...,...,...,...,...,...,...,...,...
35569,1997DTP1Zimbabwe,Zimbabwe,1997,DTP1,ESAR,95,328000,345000
35570,1997DTP3Zimbabwe,Zimbabwe,1997,DTP3,ESAR,86,297000,345000
35571,1997Hepb3Zimbabwe,Zimbabwe,1997,Hepb3,ESAR,16,55000,345000
35572,1997MCV1Zimbabwe,Zimbabwe,1997,MCV1,ESAR,84,290000,345000


In [None]:
# Display the final table. `clean_wuenic` is a notebook variable, not an
# attribute of the pandas module, so `pd.clean_wuenic` raised AttributeError.
clean_wuenic

# The End