In [1]:
using CSV, DataFrames, Gadfly, GLM, Statistics, LinearAlgebra, Distributions, Combinatorics, StatsBase, MLBase, Random

In [2]:
include("functions.jl");

In [3]:
data = CSV.read("train1.csv", DataFrame)
test = CSV.read("test1.csv", DataFrame)
y = data[:, :HeartDisease]
n = length(y)
first(data,5)
first(test,5)

Unnamed: 0_level_0,ID,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG
Unnamed: 0_level_1,Int64,Int64,String,String,Int64,Int64?,Int64,String
1,459,68,M,TA,139,181,1,ST
2,460,50,F,ASY,160,missing,1,Normal
3,461,50,M,ATA,120,168,0,Normal
4,462,64,M,NAP,125,309,0,Normal
5,463,35,M,ATA,120,308,0,LVH


# Data Cleaning (Without missing rows)

In [4]:
countmap(data.HeartDisease)

Dict{Int64,Int64} with 2 entries:
  0 => 218
  1 => 240

In [5]:
cholesterol_mean = floor(Int, mean(skipmissing(data.Cholesterol)))
data[!, :Cholesterol] = coalesce.(data.Cholesterol, cholesterol_mean);
first(data, 5)

Unnamed: 0_level_0,ID,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG
Unnamed: 0_level_1,Int64,Int64,String,String,Int64,Int64,Int64,String
1,1,69,M,ASY,140,110,1,Normal
2,2,60,M,ASY,140,293,0,LVH
3,3,52,M,ASY,165,249,1,Normal
4,4,46,M,NAP,120,230,0,Normal
5,5,61,M,NAP,120,249,0,Normal


In [6]:
data.Cholesterol = replace(data.Cholesterol, missing => cholesterol_mean);

In [7]:
sex = zeros(Int, n)

for i in 1:n
    if (data.Sex[i] == "M")
        sex[i] = 1;
    end
end
data[!, :Sex] = sex;
first(data, 5)

Unnamed: 0_level_0,ID,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG
Unnamed: 0_level_1,Int64,Int64,Int64,String,Int64,Int64,Int64,String
1,1,69,1,ASY,140,110,1,Normal
2,2,60,1,ASY,140,293,0,LVH
3,3,52,1,ASY,165,249,1,Normal
4,4,46,1,NAP,120,230,0,Normal
5,5,61,1,NAP,120,249,0,Normal


In [8]:
ChestPainType1 = zeros(Int, n)
ChestPainType2 = zeros(Int, n)
ChestPainType3 = zeros(Int, n)

for i in 1:n
    if (data.ChestPainType[i] == "ATA")
        ChestPainType1[i] = 1;
    elseif (data.ChestPainType[i] == "NAP")
        ChestPainType2[i] = 1; 
    elseif (data.ChestPainType[i] == "ASY")
        ChestPainType3[i] = 1;
    end
end
select!(data, Not(:ChestPainType))
data[!, :ChestPainType1] = ChestPainType1;
data[!, :ChestPainType2] = ChestPainType2;
data[!, :ChestPainType3] = ChestPainType3;
first(data, 3)

Unnamed: 0_level_0,ID,Age,Sex,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,String,Int64,String
1,1,69,1,140,110,1,Normal,109,Y
2,2,60,1,140,293,0,LVH,170,N
3,3,52,1,165,249,1,Normal,122,Y


In [9]:
restingECG1 = zeros(Int, n)
restingECG2 = zeros(Int, n)

for i in 1:n
    if (data.RestingECG[i] == "ST")
        restingECG1[i] = 1;
    elseif (data.RestingECG[i] == "LVH")
        restingECG2[i] = 1;  
    end
end
select!(data, Not(:RestingECG))
data[!, :RestingECG1] = restingECG1;
data[!, :RestingECG2] = restingECG2;
first(data, 3)

Unnamed: 0_level_0,ID,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64,String,Float64
1,1,69,1,140,110,1,109,Y,1.5
2,2,60,1,140,293,0,170,N,1.2
3,3,52,1,165,249,1,122,Y,1.0


In [10]:
exerciseAngina = zeros(Int, n)

for i in 1:n
    if (data.ExerciseAngina[i] == "Y")
        exerciseAngina[i] = 1;
    end
end
data[!, :ExerciseAngina] = exerciseAngina;

In [11]:
STSlope1 = zeros(Int, n)
STSlope2 = zeros(Int, n)

for i in 1:n
    if (data.STSlope[i] == "Flat")
        STSlope1[i] = 1;
    elseif (data.STSlope[i] == "Down")
        STSlope2[i] = 1;  
    end
end
select!(data, Not(:STSlope))
data[!, :STSlope1] = STSlope1;
data[!, :STSlope2] = STSlope2;



In [12]:
variables = propertynames(select(data, Not([:ID, :HeartDisease])))
variables

15-element Array{Symbol,1}:
 :Age
 :Sex
 :RestingBP
 :Cholesterol
 :FastingBS
 :MaxHR
 :ExerciseAngina
 :Oldpeak
 :ChestPainType1
 :ChestPainType2
 :ChestPainType3
 :RestingECG1
 :RestingECG2
 :STSlope1
 :STSlope2

In [None]:
distribution_types = Dict(
    :Age => [Normal{Float64}, Normal{Float64}],
    :Sex => [Normal{Float64}, Normal{Float64}],
    :Sex => [Normal{Float64}, Normal{Float64}],
    :RestingBP => [Normal{Float64}, Normal{Float64}],
    :Cholesterol => [Normal{Float64}, Normal{Float64}],
    :FastingBS => [Normal{Float64}, Normal{Float64}],
    :MaxHR => [Normal{Float64}, Normal{Float64}],
    :ExerciseAngina => [Normal{Float64}, Normal{Float64}],
    :Oldpeak => [Normal{Float64}, Normal{Float64}],
    :ChestPainType1 => [Normal{Float64}, Normal{Float64}],
    :ChestPainType2 => [Normal{Float64}, Normal{Float64}],
    :ChestPainType3 => [Normal{Float64}, Normal{Float64}],
    :RestingECG1 => [Normal{Float64}, Normal{Float64}],
    :RestingECG2 => [Normal{Float64}, Normal{Float64}],
    :STSlope1 => [Normal{Float64}, Normal{Float64}],
    :STSlope2 => [Normal{Float64}, Normal{Float64}],
)

In [13]:
HeartDisease_Positive = filter(row -> row.Duration != 0, train)
HeartDisease_Negative = filter(row -> row.Duration == 0, train)

n₀ = size(HeartDisease_Negative, 1)
n₁ = size(HeartDisease_Positive, 1)
n = n₁ + n₀

(α,β) = (1,1)
p₁ = (α + n₁)/(α + β + n)
p₀ = (β + n₀)/(α + β + n)

LoadError: UndefVarError: train not defined