# Load Packages

In [6]:
using CSV
using DataFrames
using PyPlot
using Metrics
using Random

In [7]:
using ScikitLearn, Random, Statistics
using MLJ
using DecisionTree

# Import Data

In [4]:
# Load the raw water-potability table. `joinpath` assembles the path with the
# separator of the current platform, so the cell is not tied to Windows.
imported_df = CSV.read(joinpath("Data", "water_potability.csv"), DataFrame)

Unnamed: 0_level_0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon
Unnamed: 0_level_1,Float64?,Float64,Float64,Float64,Float64?,Float64,Float64
1,missing,204.89,20791.3,7.30021,368.516,564.309,10.3798
2,3.71608,129.423,18630.1,6.63525,missing,592.885,15.18
3,8.09912,224.236,19909.5,9.27588,missing,418.606,16.8686
4,8.31677,214.373,22018.4,8.05933,356.886,363.267,18.4365
5,9.09222,181.102,17979.0,6.5466,310.136,398.411,11.5583
6,5.58409,188.313,28748.7,7.54487,326.678,280.468,8.39973
7,10.2239,248.072,28749.7,7.51341,393.663,283.652,13.7897
8,8.63585,203.362,13672.1,4.56301,303.31,474.608,12.3638
9,missing,118.989,14285.6,7.80417,268.647,389.376,12.706
10,11.1803,227.231,25484.5,9.0772,404.042,563.885,17.9278


# Data Cleaning

In [5]:
# Clean copy of the imported table, produced by `dropmissing`.
main_df = imported_df |> dropmissing

Unnamed: 0_level_0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,8.31677,214.373,22018.4,8.05933,356.886,363.267,18.4365
2,9.09222,181.102,17979.0,6.5466,310.136,398.411,11.5583
3,5.58409,188.313,28748.7,7.54487,326.678,280.468,8.39973
4,10.2239,248.072,28749.7,7.51341,393.663,283.652,13.7897
5,8.63585,203.362,13672.1,4.56301,303.31,474.608,12.3638
6,11.1803,227.231,25484.5,9.0772,404.042,563.885,17.9278
7,7.36064,165.521,32452.6,7.5507,326.624,425.383,15.5868
8,7.11982,156.705,18730.8,3.60604,282.344,347.715,15.9295
9,6.34727,186.733,41065.2,9.6296,364.488,516.743,11.5398
10,9.18156,273.814,24041.3,6.90499,398.351,477.975,13.3873


# Train-Test Split

In [11]:
"""
    train_test_split(X, y, test_size; rng=Random.default_rng())

Randomly partition the rows of `X` and the matching entries of `y` into a
training part and a test part.

# Arguments
- `X`: table or matrix whose first dimension indexes observations; it must
  support `size(X, 1)` and `X[rows, :]`.
- `y`: vector of targets with one entry per row of `X`.
- `test_size`: fraction of observations, in `[0, 1]`, assigned to the test part.

# Keywords
- `rng`: random number generator used for shuffling; pass a seeded generator
  to obtain a reproducible split.

# Returns
`(X_train, X_test, y_train, y_test)`. The training part receives
`ceil((1 - test_size) * n)` observations and the test part the remainder.

# Throws
- `ArgumentError` if `test_size` is outside `[0, 1]`.
- `DimensionMismatch` if `X` and `y` disagree on the number of observations.
"""
function train_test_split(X, y, test_size; rng::AbstractRNG=Random.default_rng())
    # Reject bad fractions up front instead of failing later with a BoundsError.
    0 <= test_size <= 1 ||
        throw(ArgumentError("test_size must be between 0 and 1, got $test_size"))

    # `size(X, 1)` works for both DataFrames and plain matrices.
    n = size(X, 1)
    length(y) == n ||
        throw(DimensionMismatch("X has $n rows but y has $(length(y)) entries"))

    indices = shuffle(rng, 1:n)
    split_idx = ceil(Int, (1 - test_size) * n)
    train_rows = indices[1:split_idx]
    test_rows = indices[(split_idx + 1):end]

    return X[train_rows, :], X[test_rows, :], y[train_rows], y[test_rows]
end
# Fraction of the cleaned rows handed to the test part of the split.
test_size = 0.2

# Label column on one side, every remaining column as features on the other.
data_y = main_df[:, "Potability"]
data_x = main_df[:, Not("Potability")]

# The rows are shuffled inside the helper, so the partition varies between runs.
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size)

(1609×9 DataFrame. Omitted printing of 3 columns
│ Row  │ ph      │ Hardness │ Solids  │ Chloramines │ Sulfate │ Conductivity │
│      │ Float64 │ Float64  │ Float64 │ Float64     │ Float64 │ Float64      │
├──────┼─────────┼──────────┼─────────┼─────────────┼─────────┼──────────────┤
│ 1    │ 4.22142 │ 120.56   │ 34370.2 │ 7.96089     │ 411.581 │ 436.775      │
│ 2    │ 9.40633 │ 216.762  │ 27948.6 │ 6.15611     │ 355.473 │ 347.983      │
│ 3    │ 6.13879 │ 158.422  │ 20130.0 │ 6.81214     │ 352.23  │ 373.844      │
│ 4    │ 6.51985 │ 183.226  │ 14284.6 │ 8.38908     │ 373.092 │ 495.418      │
│ 5    │ 6.12843 │ 141.492  │ 9615.83 │ 7.62792     │ 277.952 │ 415.009      │
│ 6    │ 7.88069 │ 226.004  │ 19486.9 │ 6.20857     │ 356.338 │ 472.369      │
│ 7    │ 7.2558  │ 200.164  │ 32595.1 │ 7.57244     │ 324.552 │ 489.272      │
│ 8    │ 6.68337 │ 272.112  │ 18989.3 │ 5.3362      │ 336.555 │ 307.725      │
│ 9    │ 7.34223 │ 198

# Modeling

In [20]:
# Decision-tree classifier capped at five levels
DT = DecisionTreeClassifier(; max_depth=5)

# Fit on the training split; the feature table is converted to a plain matrix
# before it is handed to the ScikitLearn-style `fit!`.
ScikitLearn.fit!(DT, Matrix(X_train), y_train)

DecisionTreeClassifier
max_depth:                5
min_samples_leaf:         1
min_samples_split:        2
min_purity_increase:      0.0
pruning_purity_threshold: 1.0
n_subfeatures:            0
classes:                  [0, 1]
root:                     Decision Tree
Leaves: 21
Depth:  5

In [24]:
# Predict labels for the held-out rows with the fitted tree
y_pred_test = ScikitLearn.predict(DT, X_test |> Matrix)

402-element Vector{Int64}:
 1
 0
 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0

In [28]:
# Put predictions and ground truth side by side and save them as CSV.
df = DataFrame(; Predict=y_pred_test)
df2 = DataFrame(; Actual=y_test)
df_export = hcat(df, df2)
df_export |> CSV.write("Water_Quality_prediction2.csv")

"Water_Quality_prediction2.csv"