# NoteBook To create all sorts of DataSets

Save all new datasets in `data/new_datasets/`

All datasets should have their own subFolder with files 

`x_train` columns: All variable names with index 0-XXX in numbers (no arrays)

`y_train` column: y column of 0/1 boolean values

`x_pred` same as x_train

### IMPORTS

In [None]:
using CSV, DataFrames, Statistics, Dates, Gadfly, LinearAlgebra, Distributions, Random, ScikitLearn, GLM

# Fonctions Globales

In [None]:
"""
    splitdataframe(df::DataFrame, p::Real)

Randomly partition the rows of `df` into a training set and a validation set.

# Arguments
- `df::DataFrame`: the data to split.
- `p::Real`: proportion (between 0 and 1) of the rows placed in the training set.

# Returns
A tuple `(dfTrain, dfTest)` of two DataFrames. `dfTrain` holds `round(n * p)` randomly
chosen rows of `df` and `dfTest` holds the remaining rows; both keep the row order of `df`.

# Throws
- `ArgumentError` if `p` is not in the interval `[0, 1]`.

# Examples
```julia
julia> dfTrain, dfTest = splitdataframe(df, 0.7)
```
"""
function splitdataframe(df::DataFrame, p::Real)
    # Explicit validation: `@assert` may be disabled and is not meant for input checks.
    0 <= p <= 1 || throw(ArgumentError("p must be between 0 and 1, got $p"))

    n = size(df, 1)
    ntrain = round(Int, n * p)

    # Draw a random permutation once; the first `ntrain` indices form the training set.
    perm = shuffle(1:n)
    indTrain = sort(perm[1:ntrain])
    indTest = sort(perm[ntrain+1:end])

    return df[indTrain, :], df[indTest, :]
end

# Chargement des données et nettoyage préliminaire

## Chargement des surverses

In [None]:
# Load the overflow (surverse) records; the value -99999 is read as `missing`.
data = CSV.read("./data/surverses.csv", missingstring="-99999")
first(data,5)

## Nettoyage des données sur les surverses

#### Extraction des surverses pour les mois de mai à octobre inclusivement

In [None]:
# Keep only the records dated from May (month 5) to October (month 10) inclusive.
data = filter(row -> 4 < month(row.DATE) < 11, data)
first(data, 5)

#### Remplacement des valeurs *missing* dans la colonne :RAISON par "Inconnue"

In [None]:
# Replace a missing overflow cause with the label "Inconnue".
raison = coalesce.(data.RAISON, "Inconnue")
data.RAISON = raison
first(data, 5)

#### Exclusion des surverses occasionnées par d'autres facteurs que les précipitations liquides

Ces facteurs correspondent à : 
- la fonte de neige (F), 
- les travaux planifiés et entretien (TPL)
- urgence (U)
- autre (AUT)

In [None]:
# Keep the rows whose cause is "P", "Inconnue" or "TS", then retain only the
# structure identifier, the date and the overflow indicator.
data = filter(row -> row.RAISON in ("P", "Inconnue", "TS"), data)
select!(data, [:NO_OUVRAGE, :DATE, :SURVERSE])
first(data, 5)

#### Exclusion des lignes où :SURVERSE est manquante

In [None]:
# Drop every row containing a missing value and rename :DATE to :date so that the
# column name matches the one used in the precipitation data.
surverse_df = rename!(dropmissing(data, disallowmissing=true), :DATE => :date)
first(surverse_df, 5)

## Chargement des précipitations

In [None]:
# Load the precipitation records; the value -99999 is read as `missing`.
data = CSV.read("data/precipitations.csv",missingstring="-99999")
# Rename the hyphenated "St-Hubert" column to :StHubert.
rename!(data, Symbol("St-Hubert")=>:StHubert)
first(data,5)

## Nettoyage des données sur les précipitations

#### Extraction des précipitations des mois de mai à octobre inclusivement

In [None]:
# Keep only the records dated from May (month 5) to October (month 10) inclusive.
data = filter(row -> 4 < month(row.date) < 11, data)
first(data, 5)

 ### Remplissage des données manquantes
Nous allons tenter de remplir les données manquantes par des moyennes de précipitations lorsque les données sont inconnues pour 2 stations ou plus

# LIST OF TECHNIQUES TO FILL MISSING DATA

- use ridge regression to fill missing values (*TODO*)
- use mean of line to fill values

## Fonction Ridge pour trouver les missings values

In [None]:
"""
    ridge(datas::DataFrame, station::Symbol; λs=0:1:10000)

Fit a ridge regression that predicts the column `station` of `datas` from all the
other columns, and return the coefficient vector `β̂`.

The rows are split 75 % / 25 % with `splitdataframe`. Explanatory and response
variables are standardised with the statistics of the training part. Every penalty in
`λs` is evaluated by its RMSE on the held-out part, and the penalty with the smallest
RMSE is used for the returned fit. The adjusted R² of that fit on the held-out part is
printed.

# Arguments
- `datas::DataFrame`: integer-valued data without missing values.
- `station::Symbol`: name of the column to predict.

# Keywords
- `λs`: collection of candidate ridge penalties (default `0:1:10000`).

# Returns
The coefficient vector `β̂`. It is expressed on the standardised scale; the means and
standard deviations used for the standardisation are not returned.
"""
function ridge(datas::DataFrame, station::Symbol; λs=0:1:10000)

    Train, Test = splitdataframe(datas, .75)

    # Explanatory variables, standardised with the training-set statistics.
    # Variables carrying a tilde belong to the held-out sample.
    Xraw = convert(Matrix{Int64}, Train[:, Not(station)])
    mX = mean(Xraw, dims=1)
    sX = std(Xraw, dims=1)
    # NOTE(review): the second explanatory column is deliberately left unstandardised
    # here (mean 0, scale 1), as in the original code. Nothing in this file explains
    # why that column is special -- confirm this is intended.
    mX[2] = 0
    sX[2] = 1
    X = (Xraw .- mX) ./ sX
    X̃ = (convert(Matrix{Int64}, Test[:, Not(station)]) .- mX) ./ sX

    # Response variable, standardised with the training-set statistics.
    yraw = convert(Vector{Int64}, Train[:, station])
    my = mean(yraw)
    sy = std(yraw)
    y = (yraw .- my) ./ sy
    ỹ = (convert(Vector{Int64}, Test[:, station]) .- my) ./ sy

    # These products do not depend on λ: compute them once instead of once per penalty.
    XtX = X'X
    Xty = X'y

    # Held-out RMSE for every candidate penalty.
    λgrid = collect(λs)
    rmses = map(λgrid) do λ
        ẽ = ỹ - X̃ * ((XtX + λ * I) \ Xty)
        return sqrt(dot(ẽ, ẽ) / length(ẽ))
    end

    # Penalty minimising the RMSE (the first one in case of ties), and the final fit.
    λ̂ = λgrid[argmin(rmses)]
    β̂ = (XtX + λ̂ * I) \ Xty

    #TODO validate model and print value of validator R² ajuste

    # Predictions for the held-out sample, back on the original scale.
    ŷ = round.(((X̃ * β̂) .* sy) .+ my)

    # Adjusted R² on the held-out sample.
    # NOTE(review): the same held-out sample was used to choose λ̂, so this figure is
    # likely optimistic.
    p = size(X̃, 2)       # number of explanatory variables
    n = length(ỹ)        # sample size
    yobs = (ỹ .* sy) .+ my
    ȳ = mean(yobs)

    SST = sum(abs2, yobs .- ȳ)   # total variability
    SSE = sum(abs2, yobs - ŷ)    # residual variability

    R2aj = 1 - SSE/SST * (n-1)/(n-p)

    println("Le R² ajuste du modele trouve pour la station de $(station) est $(R2aj)")

    return β̂
end

In [None]:
# TODO: replace the ridge function with the ScikitLearn implementation (check whether R² is better)

In [None]:
# The ridge models are built from complete rows only: drop every row with a missing
# value, then remove the date and hour columns so that only stations remain.
full_data = dropmissing(data, disallowmissing=true)[:, Not([:date, :heure])]
size(full_data)

In [None]:
# One ridge coefficient vector per station: each station is predicted in turn from
# the remaining columns of `full_data`.
betas = DataFrame(station = Symbol[], β = Array{Float64}[])
for name in names(full_data)
    push!(betas, (name, ridge(full_data, name)))
end

## Filling precipitation rows by doing mean of Stations per Hour when more than one missing value, else use Ridge regression

In [None]:
# Helpers defined in separate files (their implementation is not visible here).
include("datasets/countMissing.jl")
include("datasets/meanLine.jl")
include("datasets/replaceMissing.jl")

# Work on the station columns only; hour and date are attached again afterwards.
precipitation_df = data[:, Not([:date, :heure])]
for row in eachrow(precipitation_df)
    nbMissing, ind = countMissing(row)
    if nbMissing == 1
        # Exactly one value missing: predict it from the other columns with the
        # coefficient vector stored at position `ind` of `betas`.
        # NOTE(review): `ridge` fits its coefficients on standardised variables, but
        # they are applied here to the raw values without any rescaling -- confirm
        # this is intended.
        known = convert(Vector{Float64}, row[Not(ind)])
        row[ind] = round(known' * betas.β[ind])
    elseif 1 < nbMissing < 5
        # Two to four values missing: fill them with the rounded mean of the row.
        replaceMissing(row, round(meanLine(row)))
    end
end
precipitation_df[!, :heure] = data[:, :heure]
precipitation_df[!, :date] = data[:, :date]
# Rows that still contain a missing value are discarded.
precipitation_df = dropmissing(precipitation_df)
CSV.write("data/new_datasets/precipitation_filed_mean_per_hour.csv", precipitation_df)
first(precipitation_df, 10)

# Daily sum as 5 Explicative Variables

In [None]:
# For every date, sum the precipitation values of each of the five stations.
precipitation_daily_sum = by(precipitation_df, :date,  McTavish = :McTavish=>sum, Bellevue = :Bellevue=>sum, 
   Assomption = :Assomption=>sum, Trudeau = :Trudeau=>sum, StHubert = :StHubert=>sum)
last(precipitation_daily_sum ,10)

Filter out the year 2019, which is kept for prediction

In [None]:
# 2019 is held out for prediction; all the other years form the training set.
precipitation_daily_sum_train = filter(row -> year(row.date) != 2019, precipitation_daily_sum)
precipitation_daily_sum_pred  = filter(row -> year(row.date) == 2019, precipitation_daily_sum)

Ensure that dates match for y and x, then send to csv

In [None]:
# Keep only the dates present on both sides, so that the explanatory variables and
# the overflow records cover the same days. The dates are collected into a `Set`
# once, so each row costs one hash lookup instead of a scan of the whole column.
let overflow_dates = Set(surverse_df[!, :date])
    filter!(row -> row.date in overflow_dates, precipitation_daily_sum_train)
end
let rain_dates = Set(precipitation_daily_sum_train[!, :date])
    filter!(row -> row.date in rain_dates, surverse_df)
end

In [None]:
# Write the training and prediction sets built from the daily sums.
CSV.write("data/new_datasets/precipitation_daily_sum/x_train.csv", precipitation_daily_sum_train)
CSV.write("data/new_datasets/precipitation_daily_sum/x_pred.csv", precipitation_daily_sum_pred)

# Daily Maximum as 5 Explicative Variables

#### Extraction du taux horaire journalier maximum des précipitations pour chacune des stations météorologiques

In [None]:
# For every date, take the largest hourly value of each of the five stations.
precipitation_daily_max = by(precipitation_df, :date,  McTavish = :McTavish=>maximum, Bellevue = :Bellevue=>maximum, 
   Assomption = :Assomption=>maximum, Trudeau = :Trudeau=>maximum, StHubert = :StHubert=>maximum)
first(precipitation_daily_max,10)

filter out 2019 year for prediction

In [None]:
# 2019 is held out for prediction; all the other years form the training set.
precipitation_daily_max_train = filter(row -> year(row.date) != 2019, precipitation_daily_max)
precipitation_daily_max_pred  = filter(row -> year(row.date) == 2019, precipitation_daily_max)

send to csv

ensure that dates fit for y and x

In [None]:
# Keep only the dates present on both sides, so that the explanatory variables and
# the overflow records cover the same days. The dates are collected into a `Set`
# once, so each row costs one hash lookup instead of a scan of the whole column.
let overflow_dates = Set(surverse_df[!, :date])
    filter!(row -> row.date in overflow_dates, precipitation_daily_max_train)
end
let rain_dates = Set(precipitation_daily_max_train[!, :date])
    filter!(row -> row.date in rain_dates, surverse_df)
end

In [None]:
# Write the training and prediction sets built from the daily maxima.
CSV.write("./data/new_datasets/precipitation_daily_max/x_train.csv", precipitation_daily_max_train)
CSV.write("./data/new_datasets/precipitation_daily_max/x_pred.csv", precipitation_daily_max_pred)

In [None]:
# Write the overflow records that remain after the date alignment above.
CSV.write("./data/new_datasets/surverse_list.csv", surverse_df)

# Somme maximale sur une division de journée

In [None]:
# Read back the filled hourly precipitation file written earlier in this notebook.
filledPrec  = CSV.read("data/new_datasets/precipitation_filed_mean_per_hour.csv", missingstring="-99999")

first(filledPrec,5)

In [None]:
# Number of hourly records available.
nrow(filledPrec)

In [None]:
"""
    dayPrecipitationSplit(window, precipitations)

Split every day of `precipitations` into consecutive blocks of `window` hourly rows
and sum the precipitation of each station over each block.

# Arguments
- `window`: number of hourly rows per block. When it does not divide 24, the trailing
  rows of the day that do not fill a complete block are ignored.
- `precipitations`: table with the columns `McTavish`, `Bellevue`, `Assomption`,
  `Trudeau`, `StHubert`, `heure` and `date`.

# Returns
A DataFrame with one row per block: the five station sums, `heureDebut` and
`heureFin` (the `heure` values of the first and last row of the block) and `date`.

Days that do not have exactly 24 rows are skipped. The rows of a day are taken in the
order in which they appear (presumably chronological -- verify against the input file).
"""
function dayPrecipitationSplit(window, precipitations)
    stations = (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert)
    newDf = DataFrame(McTavish = Int64[], Bellevue = Int64[], Assomption = Int64[], Trudeau  = Int64[],
                        StHubert = Int64[], heureDebut = Int64[], heureFin = Int64[], date = Date[])

    # Integer number of complete blocks in a day.
    hourgroups = div(24, window)

    # Group the table received as argument, not a notebook global.
    for day in groupby(precipitations, :date)
        size(day, 1) == 24 || continue
        date = day[1, :date]
        for g in 1:hourgroups
            start = (g - 1) * window + 1
            finish = g * window
            totals = [sum(day[start:finish, s]) for s in stations]
            # Read both bounds directly so that heureFin is also set when window == 1.
            push!(newDf, (totals..., day[start, :heure], day[finish, :heure], date))
        end
    end
    return newDf
end

In [None]:
# Block sums for every window length (in hours) that divides a 24-hour day evenly.
hourSplit2 = dayPrecipitationSplit(2,filledPrec)
hourSplit3 = dayPrecipitationSplit(3,filledPrec)
hourSplit4 = dayPrecipitationSplit(4,filledPrec)
hourSplit6 = dayPrecipitationSplit(6,filledPrec)
hourSplit8 = dayPrecipitationSplit(8,filledPrec)
hourSplit12 = dayPrecipitationSplit(12,filledPrec)
first(hourSplit12,10)

In [None]:
"""
    maxPrecByDay(Prec)

Return, for every date of `Prec`, the maximum of each station column
(`McTavish`, `Bellevue`, `Assomption`, `Trudeau`, `StHubert`).

The result is a DataFrame with one row per date: the five maxima followed by `date`.
"""
function maxPrecByDay(Prec)
    stationCols = (:McTavish, :Bellevue, :Assomption, :Trudeau, :StHubert)
    dailyMax = DataFrame(McTavish = Int64[], Bellevue = Int64[], Assomption = Int64[],
                         Trudeau = Int64[], StHubert = Int64[], date = Date[])

    for group in groupby(Prec, :date)
        peaks = [maximum(group[:, col]) for col in stationCols]
        push!(dailyMax, [peaks..., group[1, :date]])
    end
    return dailyMax
end

In [None]:
# Daily maximum of the block sums, for every window length.
maxSum2hours = maxPrecByDay(hourSplit2)
maxSum3hours = maxPrecByDay(hourSplit3)
maxSum4hours = maxPrecByDay(hourSplit4)
maxSum6hours = maxPrecByDay(hourSplit6)
maxSum8hours = maxPrecByDay(hourSplit8)
maxSum12hours = maxPrecByDay(hourSplit12)
first(maxSum12hours,10)

In [None]:
# Write one file per window length.
CSV.write("./data/new_datasets/max_precipitation_day_split/maxPrecBy2hours.csv",maxSum2hours)
CSV.write("./data/new_datasets/max_precipitation_day_split/maxPrecBy3hours.csv",maxSum3hours)
CSV.write("./data/new_datasets/max_precipitation_day_split/maxPrecBy4hours.csv",maxSum4hours)
CSV.write("./data/new_datasets/max_precipitation_day_split/maxPrecBy6hours.csv",maxSum6hours)
CSV.write("./data/new_datasets/max_precipitation_day_split/maxPrecBy8hours.csv",maxSum8hours)
CSV.write("./data/new_datasets/max_precipitation_day_split/maxPrecBy12hours.csv",maxSum12hours)

## Somme de la journee + les deux dernieres heures du jour d'avant

In [None]:
# Read back the filled hourly precipitation file written earlier in this notebook.
filledPrec  = CSV.read("data/new_datasets/precipitation_filed_mean_per_hour.csv", missingstring="-99999")

first(filledPrec,5)

In [None]:
# Add to every daily total the precipitation recorded in the last two hourly rows of
# the previous day.
# NOTE: this updates `precipitation_daily_sum` in place, so running the cell a second
# time adds the values again.
let hourly_dates = filledPrec[!, :date],
    stations = names(precipitation_daily_sum[:, Not(:date)])

    for i in 1:size(precipitation_daily_sum, 1)
        day = precipitation_daily_sum[i, :date]
        ind = findfirst(isequal(day), hourly_dates)
        ind === nothing && continue   # no hourly row for that day
        for h in 1:2
            prev = ind - h
            # Only use rows that really belong to the previous calendar day. The data
            # only covers May to October and incomplete rows were dropped, so the rows
            # just before the first row of a day may belong to a much earlier date.
            (prev > 0 && hourly_dates[prev] == day - Day(1)) || continue
            for key in stations
                precipitation_daily_sum[i, key] += filledPrec[prev, key]
            end
        end
    end
end

In [None]:
# Preview the daily sums after the previous-day hours were added.
first(precipitation_daily_sum, 10)

In [None]:
# Write the daily sums that include the last two hours of the previous day.
CSV.write("./data/new_datasets/sum_day_last_2.csv",precipitation_daily_sum)