# Data pre-processing and wrangling for cancer incidence data and cancer mortality data

There are two datasets: cancer registrations by DHB and cancer deaths by DHB.   
In this part, we clean and reformat these two datasets and write two clean CSV files, 'incidence.csv' and 'mortality.csv'.  
To support the subsequent analysis, we also filter each dataset by cancer type and its corresponding sex group, and write the results to 'incidence_sexfiltered.csv' and 'mortality_sexfiltered.csv'.

In [1]:
using CSV, DataFrames, StringDistances, StatsBase, Plots
#### 2. Data Cleaning ####

#### 2.1. Cancer data ####
#### 2.1.1 Incidence ####

# Read the raw cancer-registration table into a DataFrame
incidence = DataFrame(CSV.File("D:/cancer-registrations-by-dhb.csv"))

# Drop the rows whose DHB is "Overseas and undefined"
incidence = filter(:DHB => !=("Overseas and undefined"), incidence)

# Derive `cancer` from "Cancer type" by stripping every parenthesised part
# (together with the single whitespace character in front of it)
incidence.cancer = replace.(incidence[!, "Cancer type"], r"\s\([^)]+\)" => "")

# Switch to lower-case, incidence-specific column names
rename!(
    incidence,
    [:Year => :year, :Sex => :sex, :Number => :incidence_num, :Rate => :incidence_rate],
)

# Keep only the columns needed downstream, in this order
incidence = select(incidence, :DHB, :year, :sex, :cancer, :incidence_num, :incidence_rate)

# Convert the rate column from text to Float64; every "S" is turned into "0" first
# so that the column parses.
# NOTE(review): "S" presumably marks a suppressed rate; it ends up as 0.0 here —
# confirm that zero is the intended stand-in rather than a missing value.
incidence.incidence_rate = parse.(Float64, replace.(incidence.incidence_rate, "S" => "0"))

incidence

Row,DHB,year,sex,cancer,incidence_num,incidence_rate
Unnamed: 0_level_1,String,Int64,String,String,Int64,Float64
1,All New Zealand,2011,Male,Bladder,234,7.3
2,Auckland,2011,Male,Bladder,17,6.7
3,Bay of Plenty,2011,Male,Bladder,14,6.9
4,Canterbury,2011,Male,Bladder,32,8.1
5,Capital & Coast,2011,Male,Bladder,13,7.5
6,Counties Manukau,2011,Male,Bladder,14,5.2
7,Hawke's Bay,2011,Male,Bladder,14,11.5
8,Hutt Valley,2011,Male,Bladder,10,9.8
9,Lakes,2011,Male,Bladder,5,0.0
10,MidCentral,2011,Male,Bladder,10,7.7


In [2]:
# Filter incidence based on sex.
# Start from the "All New Zealand" rows and average the rate per (year, cancer, sex).
incidence_cancer_sex = filter(:DHB => ==("All New Zealand"), incidence)
grouped_incidence = groupby(incidence_cancer_sex, [:year, :cancer, :sex])
rate_mean = combine(
    grouped_incidence,
    :incidence_rate => (rates -> mean(skipmissing(rates))) => :rate_mean,
)

# Spread the sex values into columns, then discard the rows in which both the
# Male and the Female column are missing.
incidence_wider = unstack(rate_mean, :sex, :rate_mean)
incidence_sex_filtered = filter(incidence_wider) do rec
    !ismissing(rec.Male) || !ismissing(rec.Female)
end

# Label each remaining row: "Female" or "Male" when only that column has a value,
# "AllSex" otherwise.
group = map(eachrow(incidence_sex_filtered)) do rec
    has_male = !ismissing(rec.Male)
    has_female = !ismissing(rec.Female)
    if has_female && !has_male
        "Female"
    elseif has_male && !has_female
        "Male"
    else
        "AllSex"
    end
end

# Reduce the lookup table to its distinct (cancer, group) pairs
incidence_sex_filtered.group = group
unique!(select!(incidence_sex_filtered, [:cancer, :group]))

# Keep only the rows of `incidence` whose (cancer, sex) matches a (cancer, group)
# pair, then recreate the `group` column as a copy of `sex`.
incidence_sex_filtered = innerjoin(incidence, incidence_sex_filtered, on = [:cancer, :sex => :group])
incidence_sex_filtered.group = copy(incidence_sex_filtered.sex)

incidence_sex_filtered

Row,DHB,year,sex,cancer,incidence_num,incidence_rate,group
Unnamed: 0_level_1,String,Int64,String,String,Int64,Float64,String
1,All New Zealand,2011,Male,Bladder,234,7.3,Male
2,Auckland,2011,Male,Bladder,17,6.7,Male
3,Bay of Plenty,2011,Male,Bladder,14,6.9,Male
4,Canterbury,2011,Male,Bladder,32,8.1,Male
5,Capital & Coast,2011,Male,Bladder,13,7.5,Male
6,Counties Manukau,2011,Male,Bladder,14,5.2,Male
7,Hawke's Bay,2011,Male,Bladder,14,11.5,Male
8,Hutt Valley,2011,Male,Bladder,10,9.8,Male
9,Lakes,2011,Male,Bladder,5,0.0,Male
10,MidCentral,2011,Male,Bladder,10,7.7,Male


In [3]:
#### 2.1.2 Mortality ####

# Read the raw cancer-deaths table into a DataFrame
mortality = DataFrame(CSV.File("D:/cancer-deaths-by-dhb.csv"))

# Drop the rows whose DHB is "Overseas and undefined" and the rows whose
# cancer type is "Unspecified site"
mortality = filter(mortality) do rec
    rec["DHB"] != "Overseas and undefined" && rec["Cancer type"] != "Unspecified site"
end

# Derive `cancer` from "Cancer type" by stripping every parenthesised part
# (together with the single whitespace character in front of it)
mortality.cancer = replace.(mortality[!, "Cancer type"], r"\s\([^)]+\)" => "")

# Switch to lower-case, mortality-specific column names
rename!(
    mortality,
    [:Year => :year, :Sex => :sex, :Number => :mortality_num, :Rate => :mortality_rate],
)

# Keep only the columns needed downstream, in this order
mortality = select(mortality, :DHB, :year, :sex, :cancer, :mortality_num, :mortality_rate)

# Convert the rate column from text to Float64; every "S" is turned into "0" first
# so that the column parses.
# NOTE(review): "S" presumably marks a suppressed rate; it ends up as 0.0 here —
# confirm that zero is the intended stand-in rather than a missing value.
mortality.mortality_rate = parse.(Float64, replace.(mortality.mortality_rate, "S" => "0"))
mortality

Row,DHB,year,sex,cancer,mortality_num,mortality_rate
Unnamed: 0_level_1,String,Int64,String,String,Int64,Float64
1,All New Zealand,2011,Male,Brain,135,4.9
2,Auckland,2011,Male,Brain,13,5.2
3,Bay of Plenty,2011,Male,Brain,7,5.5
4,Canterbury,2011,Male,Brain,19,5.7
5,Capital & Coast,2011,Male,Brain,6,3.6
6,Counties Manukau,2011,Male,Brain,10,4.1
7,Hawke's Bay,2011,Male,Brain,4,0.0
8,Hutt Valley,2011,Male,Brain,4,0.0
9,Lakes,2011,Male,Brain,3,0.0
10,MidCentral,2011,Male,Brain,4,0.0


In [4]:

# Filter mortality based on sex.
# Start from the "All New Zealand" rows and average the rate per (year, cancer, sex).
mortality_cancer_sex = filter(:DHB => ==("All New Zealand"), mortality)
grouped_mortality = groupby(mortality_cancer_sex, [:year, :cancer, :sex])
rate_mean = combine(
    grouped_mortality,
    :mortality_rate => (rates -> mean(skipmissing(rates))) => :rate_mean,
)

# Spread the sex values into columns, then discard the rows in which both the
# Male and the Female column are missing.
mortality_wider = unstack(rate_mean, :sex, :rate_mean)
mortality_sex_filtered = filter(mortality_wider) do rec
    !ismissing(rec.Male) || !ismissing(rec.Female)
end

# Label each remaining row: "Female" or "Male" when only that column has a value,
# "AllSex" otherwise.
group = map(eachrow(mortality_sex_filtered)) do rec
    has_male = !ismissing(rec.Male)
    has_female = !ismissing(rec.Female)
    if has_female && !has_male
        "Female"
    elseif has_male && !has_female
        "Male"
    else
        "AllSex"
    end
end

# Reduce the lookup table to its distinct (cancer, group) pairs
mortality_sex_filtered.group = group
unique!(select!(mortality_sex_filtered, [:cancer, :group]))

# Keep only the rows of `mortality` whose (cancer, sex) matches a (cancer, group)
# pair, then recreate the `group` column as a copy of `sex`.
mortality_sex_filtered = innerjoin(mortality, mortality_sex_filtered, on = [:cancer, :sex => :group])
mortality_sex_filtered.group = copy(mortality_sex_filtered.sex)

mortality_sex_filtered

Row,DHB,year,sex,cancer,mortality_num,mortality_rate,group
Unnamed: 0_level_1,String,Int64,String,String,Int64,Float64,String
1,All New Zealand,2011,Male,Liver,164,5.3,Male
2,Auckland,2011,Male,Liver,20,8.0,Male
3,Bay of Plenty,2011,Male,Liver,12,6.9,Male
4,Canterbury,2011,Male,Liver,16,4.1,Male
5,Capital & Coast,2011,Male,Liver,13,7.3,Male
6,Counties Manukau,2011,Male,Liver,15,5.6,Male
7,Hawke's Bay,2011,Male,Liver,6,4.7,Male
8,Hutt Valley,2011,Male,Liver,5,0.0,Male
9,Lakes,2011,Male,Liver,6,8.8,Male
10,MidCentral,2011,Male,Liver,4,0.0,Male


In [5]:
# Write clean data to csv files.
# Create the output directory first (a no-op when it already exists) so the
# writes below do not fail on a fresh checkout where data/clean/ is absent.
mkpath("data/clean")

# Full cleaned tables and their sex-filtered counterparts; the last call's return
# value (the path it wrote to) is what the cell displays.
CSV.write("data/clean/incidence.csv", incidence)
CSV.write("data/clean/incidence_sexfiltered.csv", incidence_sex_filtered)
CSV.write("data/clean/mortality.csv", mortality)
CSV.write("data/clean/mortality_sexfiltered.csv", mortality_sex_filtered)

"data/clean/mortality_sexfiltered.csv"