# Preparing the ambient

First, we need prepare the packages that we will use.
The packages are "DataFrames" and "CSV.


In [1]:
using Pkg

Pkg.add("DataFrames")
Pkg.add("CSV")

[32m[1m   Updating[22m[39m registry at `C:\Users\GT\.julia\registries\General`
[32m[1m   Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`






[32m[1m  Resolving[22m[39m package versions...
[32m[1m  Installed[22m[39m PooledArrays ────── v0.5.3
[32m[1m  Installed[22m[39m InvertedIndices ─── v1.0.0
[32m[1m  Installed[22m[39m CategoricalArrays ─ v0.8.1
[32m[1m  Installed[22m[39m DataFrames ──────── v0.21.5
[32m[1m   Updating[22m[39m `C:\Users\GT\.julia\environments\v1.4\Project.toml`
 [90m [a93c6f00][39m[92m + DataFrames v0.21.5[39m
[32m[1m   Updating[22m[39m `C:\Users\GT\.julia\environments\v1.4\Manifest.toml`
 [90m [324d7699][39m[92m + CategoricalArrays v0.8.1[39m
 [90m [a93c6f00][39m[92m + DataFrames v0.21.5[39m
 [90m [41ab1584][39m[92m + InvertedIndices v1.0.0[39m
 [90m [2dfb63ee][39m[92m + PooledArrays v0.5.3[39m
 [90m [9fa8497b][39m[92m + Future [39m
[32m[1m  Resolving[22m[39m package versions...
[32m[1m  Installed[22m[39m SentinelArrays ─ v1.2.9
[32m[1m  Installed[22m[39m CSV ──────────── v0.7.5
[32m[1m   Updating[22m[39m `C:\Users\GT\.julia\environments\

In [2]:
using DataFrames
using CSV
using Plots

┌ Info: Precompiling DataFrames [a93c6f00-e57d-5684-b7b6-d8193f3e46c0]
└ @ Base loading.jl:1260
┌ Info: Precompiling CSV [336ed68f-0bac-5ca0-87d4-7b16caf5d00b]
└ @ Base loading.jl:1260


# Getting the Cleveland Dataset:
This function import and treat the Cleveland dataset

```JULIA
function GetCleveland(path::String)
```
> Path: Path to the processed cleveland file


# Explaining

> We dropped all missing data because it's just 6 instances


## Links
[Link to the oficial page](https://archive.ics.uci.edu/ml/datasets/Heart+Disease)

In [3]:
function GetCleveland(path::String)
    # Reading the dataset
    cleveland = DataFrame(CSV.File(input_clvnd))

    # Setting name in every column
    rename!(cleveland, [:age, :sex, :cp, :trestbps, :chol, :fbs, :restecg, :thalach, :exang, :oldpeak, :slope, :ca, :thal, :alvo])

    # Allowing missing data and setting missing
    allowmissing!(cleveland)
    cleveland .= ifelse.(cleveland .== "?", missing, cleveland)
    dropmissing!(cleveland)

    # Changing ca and thal types (defaulted as string)
    cleveland.ca = parse.(Float16, cleveland.ca)
    cleveland.thal = parse.(Float16, cleveland.thal)
    
    # Making alvo binary
    cleveland.alvo = map(a -> a == 0 ? 0 : 1, cleveland.alvo)

    return cleveland
end

GetCleveland (generic function with 1 method)

# Categorical Data

In [4]:
function show_categorical(array::Array{<:Number, 1}, types::Array{<:Number, 1}, labels::Array{<:String, 1}, title::String; legend = :right)

    tempProportion = zeros(length(types))
    totalProportion = length(array)
    for i = 1:length(types)
        tempProportion[i] = length(filter(a -> a == types[i] , array)) / totalProportion
        labels[i] *= " $(round(tempProportion[i] * 100, digits=2))%"
    end
    return pie(labels, tempProportion; title=title, legend = legend)

end

function PlotCategoricalAttributes(cleveland::DataFrame, path::String)    
    # SEX
    h1 = show_categorical(cleveland.sex, [0, 1], ["Women", "Men"], "Sex Proportion in Dataset")
    
    # CP
    h2 = show_categorical(cleveland.cp, [1, 2, 3, 4], ["Typical Angina", "Atypical Angina", "Non-anginal Pain", "Asymptomatic"], "Chest Pain Type"; legend=:left)
      
    # FBS
    h3 = show_categorical(cleveland.fbs, [0, 1], ["False", "True"], "Fasting Blood Sugar > 120 mg/dl")
    
    # restecg
    h4 = show_categorical(cleveland.restecg, [0, 1, 2], ["Normal", "Abnormality", "Left Ventricular Hypertrophy"], " Resting Electrocardiographic Results"; legend=:left)
    
    # exang
    h5 = show_categorical(cleveland.exang, [0, 1], ["False", "True"], "Exercise Induced Angina")
    
    # slope
    h6 = show_categorical(cleveland.slope, [1, 2, 3], ["Upsloping", "Flat", "Downsloping"], "Slope"; legend=:left)
    
    # thal
    h7 = show_categorical(cleveland.thal, [3, 6, 7], ["Normal", "Fixed Defect", "Reversable Defect"], "Thal")
    
    # alvo
    h8 = show_categorical(cleveland.alvo, [0, 1], ["False", "True"], "diagnosis of heart disease", legend=:left)
    
    png( plot(h1, h2, h3, h4, h5, h6, h7, h8, layout=(4,2), size=(500 * 2 + 1000, 500 * 4)), path)
    
end

PlotCategoricalAttributes (generic function with 1 method)

# Numerical Data

In [5]:
function PlotNumericalAttributes(cleveland::DataFrame, path::String)
    h1 = histogram(cleveland.age, title="Range of age", labels="Age")
    h2 = histogram(cleveland.trestbps, title="Range of trestbps", labels="trestbps")
    h3 = histogram(cleveland.chol, title="Range of chol", labels="chol")
    h4 = histogram(cleveland.thalach, title="Range of thalach", labels="thalach")
    h5 = histogram(cleveland.oldpeak, title="Range of oldpeak", labels="oldpeak")
    h6 = histogram(cleveland.ca, title="Range of ca", labels="ca")
    
    png( plot(h1, h2, h3, h4, h5, h6, layout=(3, 2), size=(500 * 2, 500 * 3)), path)
end

PlotNumericalAttributes (generic function with 1 method)

# Scatter

In [6]:
function ScatterCleveland(cleveland::DataFrame, path::String)
    # SPLIT
    cleveland_1 = filter(row -> row.alvo == 1, cleveland)
    cleveland_0 = filter(row -> row.alvo != 1, cleveland)
        
    # SCATTER
    
    # AGE PER Y
    h2 = plot(sort(cleveland_1.age), sort(cleveland_1.chol), seriestype=:scatter, labels="Sick", xlabel = "Age", ylabel = "Chol", legend=:left)
    plot!(sort(cleveland_0.age), sort(cleveland_0.chol), seriestype=:scatter, labels="Helf")
    
    h3 = plot(sort(cleveland_1.age), sort(cleveland_1.thalach), seriestype=:scatter, labels="Sick", xlabel = "Age", ylabel = "Thalach", legend=:left)
    plot!(sort(cleveland_0.age), sort(cleveland_0.thalach), seriestype=:scatter, labels="Dboa")
    
    h4 = plot(sort(cleveland_1.age), sort(cleveland_1.oldpeak), seriestype=:scatter, labels="Sick", xlabel = "Age", ylabel = "Oldpeak", legend=:left)
    plot!(sort(cleveland_0.age), sort(cleveland_0.oldpeak), seriestype=:scatter, labels="Dboa")
    
    h5 = plot(sort(cleveland_1.oldpeak), sort(cleveland_1.thalach), seriestype=:scatter, labels="Sick", xlabel = "Oldpeak", ylabel = "thalach")
    plot!(sort(cleveland_0.oldpeak), sort(cleveland_0.thalach), seriestype=:scatter, labels="Dboa", xlabel = "Oldpeak", ylabel = "thalach")
    
    png( plot(h2, h3, h4, h5, layout=(2, 2), size=(500 * 2 + 200, 500 * 3)), path)
end

ScatterCleveland (generic function with 1 method)

# Tests

In [8]:
input_clvnd = "C:/Users/GT/Desktop/Grupo Pesquisa/Lista 2-Exercícios ML/processed.cleveland.data"

cleveland = GetCleveland(input_clvnd)
# ALL DATA

# Plot Categorical
PlotCategoricalAttributes(cleveland, "Categorical.png")

# Plot Numerical
PlotNumericalAttributes(cleveland, "Numerical.png")

# Scatter
ScatterCleveland(cleveland, "Scatter.png")

In [9]:
# Describing
describe(cleveland)

Unnamed: 0_level_0,variable,mean,min,median,max,nunique,nmissing,eltype
Unnamed: 0_level_1,Symbol,Abstrac…,Real,Float64,Real,Nothing,Nothing,DataType
1,age,54.5135,29.0,56.0,77.0,,,Float64
2,sex,0.675676,0.0,1.0,1.0,,,Float64
3,cp,3.16554,1.0,3.0,4.0,,,Float64
4,trestbps,131.649,94.0,130.0,200.0,,,Float64
5,chol,247.399,126.0,243.0,564.0,,,Float64
6,fbs,0.141892,0.0,0.0,1.0,,,Float64
7,restecg,0.993243,0.0,1.0,2.0,,,Float64
8,thalach,149.598,71.0,153.0,202.0,,,Float64
9,exang,0.327703,0.0,0.0,1.0,,,Float64
10,oldpeak,1.05135,0.0,0.8,6.2,,,Float64
