In [None]:
using CSV, DataFrames, Distributions, Dates, Gadfly, GLM, Statistics, Random, Plots

In [None]:
"""
    rmsle(predictions, actual)

Return the root mean squared logarithmic error between `predictions` and `actual`,
i.e. `sqrt(mean((log(1 + p) - log(1 + a))^2))` over paired elements.

# Arguments
- `predictions::AbstractVector{<:Real}`: predicted values, each at least `-1`.
- `actual::AbstractVector{<:Real}`: observed values, each at least `-1`.

# Returns
A floating-point score; `NaN` when both vectors are empty.

# Throws
- `ArgumentError` if the two vectors differ in length.
- `DomainError` if any value is below `-1`.
"""
function rmsle(predictions::AbstractVector{<:Real}, actual::AbstractVector{<:Real})
    if length(predictions) != length(actual)
        throw(ArgumentError("Les vecteurs de prédictions et de valeurs réelles doivent avoir la même longueur"))
    end

    n = length(predictions)
    # Start from a Float64 so the accumulator never changes type inside the loop.
    sum_squared_log_errors = 0.0

    for (p, a) in zip(predictions, actual)
        # log1p(x) computes log(1 + x) and stays accurate for x close to zero.
        sum_squared_log_errors += (log1p(p) - log1p(a))^2
    end

    return sqrt(sum_squared_log_errors / n)
end

In [None]:
# Load the training set into a DataFrame (the path is relative to the working directory).
data = CSV.read("./Data/train.csv", DataFrame);

In [None]:
# Rows whose Type is "unpowered" get Puissance_Moteur set to 1; every other row keeps
# its current value (including `missing`).
# NOTE(review): presumably 1 is a placeholder for "no engine" — confirm the intended value.
data.Puissance_Moteur = ifelse.(data.Type .== "unpowered", 1, data.Puissance_Moteur);

## Modele pour la puissance

In [None]:
# Keep only the rows where Puissance_Moteur is known; they are the fitting rows
# for the power model.
data_puissance = dropmissing(data, :Puissance_Moteur);

In [None]:
# Restrict the fit to rows with Longueur strictly greater than 10.
# NOTE(review): the motivation for this cutoff is not visible here — confirm.
data_puissance = filter(row -> row.Longueur > 10, data_puissance);

In [None]:
# Linear model of Puissance_Moteur on Longueur and Type; used further down to
# fill the missing Puissance_Moteur values of `data`.
model_puissance = lm(@formula(Puissance_Moteur ~ Longueur + Type), data_puissance)

## Modele pour le poids

In [None]:
# Keep only the rows where Poids is known; they are the fitting rows for the
# weight model.
data_poids = dropmissing(data, :Poids);

In [None]:
# Restrict the fit to rows with Longueur strictly below 100.
# NOTE(review): the motivation for this cutoff is not visible here — confirm.
data_poids = filter(row -> row.Longueur < 100, data_poids);

In [None]:
# Squared length, used as a polynomial term in the weight model.
data_poids.LongueurSquare = data_poids.Longueur.^2;

In [None]:
# Cubed length, used as a polynomial term in the weight model.
data_poids.LongueurCube = data_poids.Longueur.^3;

In [None]:
# Cubic polynomial regression of Poids on Longueur; used further down to fill
# the missing Poids values of `data`.
model_poids = lm(@formula(Poids ~ Longueur + LongueurSquare + LongueurCube), data_poids)

## Modele general

In [None]:
# Replace missing Modèle entries with the placeholder "Autre".
replaceValue = "Autre"
data[!, :Modèle] = coalesce.(data[!, :Modèle], replaceValue);

In [None]:
# Same treatment for the Classe column.
replaceValue = "Autre"
data[!, :Classe] = coalesce.(data[!, :Classe], replaceValue);

In [None]:
# Fill the missing Puissance_Moteur entries of `data` with the rounded predictions
# of `model_puissance`; values that are already known are left untouched.
pred_puissance = predict(model_puissance, data);
n = length(pred_puissance)
for (i, estimate) in enumerate(pred_puissance)
    if ismissing(data.Puissance_Moteur[i])
        # round(Int, x) rounds straight to an integer, with no Float64 round-trip.
        data.Puissance_Moteur[i] = round(Int, estimate)
    end
end

In [None]:
# Fill the missing Poids entries of `data` with the rounded predictions of
# `model_poids`; values that are already known are left untouched.
# The model formula uses LongueurSquare and LongueurCube, so `data` needs those
# columns before predicting.
data.LongueurSquare = data.Longueur.^2;
data.LongueurCube = data.Longueur.^3;
pred_poids = predict(model_poids, data);
n = length(pred_poids)
for (i, estimate) in enumerate(pred_poids)
    if ismissing(data.Poids[i])
        # round(Int, x) rounds straight to an integer, with no Float64 round-trip.
        data.Poids[i] = round(Int, estimate)
    end
end

In [None]:
# Reproducible 80/20 split of `data` into a training and a validation DataFrame.
# (An earlier experiment that handled rows on either side of Prix = 100000
# separately was disabled and is not part of this split.)
Random.seed!(3302)
row_ids = 1:nrow(data)
train_id = sample(row_ids, round(Int, 0.8 * nrow(data)); replace=false, ordered=true)
# Every row that was not drawn for training goes to validation.
valid_id = setdiff(row_ids, train_id)

train = data[train_id, :]
valid = data[valid_id, :];

In [None]:
regex_pattern = r"\D+"

"""
    extract_model_name(model, pattern=regex_pattern)

Return the first substring of `model` matched by `pattern`, after `model` has been
converted to a string, lowercased, and stripped of all whitespace. With the default
pattern this is the first run of non-digit characters. Return `"other"` when nothing
matches (for example when the name contains only digits).

The result is always a `String`, so the return type does not depend on whether a
match was found.
"""
function extract_model_name(model, pattern::Regex=regex_pattern)
    # The global is only read for the default argument; the body itself works on a
    # concretely typed `pattern`.
    normalized = replace(lowercase(string(model)), r"\s+" => "")
    match_result = match(pattern, normalized)
    return isnothing(match_result) ? "other" : String(match_result.match)
end
# Derive the `Simplified_Modèle` column of both splits from `Modèle`.
# extract_model_name falls back to "other" by itself and never returns `missing`,
# so no coalesce fallback is needed here.
train[!, :Simplified_Modèle] = extract_model_name.(train[!, :Modèle])
valid[!, :Simplified_Modèle] = extract_model_name.(valid[!, :Modèle])
#first(train, 1)

In [None]:
"""
    age_group(age)

Return the category label for `age` (the call sites pass the `Année` column, so the
value is a year): `"abcd"` from 2020 on, `"efgh"` from 2019, `"ijkl"` from 2014,
`"mnop"` from 2005, and `"qrst"` for anything below 2005. Each label applies up to
the next higher threshold.
"""
function age_group(age)
    age >= 2020 && return "abcd"
    age >= 2019 && return "efgh"
    age >= 2014 && return "ijkl"
    age >= 2005 && return "mnop"
    return "qrst"
end

# Add the Age_cat category column to both splits, derived from Année.
train[!, :Age_cat] = age_group.(train[!, :Année])
valid[!, :Age_cat] = age_group.(valid[!, :Année]);


In [None]:
# Histogram of the 'Age_cat' column of the training split
Gadfly.plot(train, x=:Age_cat, Geom.histogram)

In [None]:
"""
    longueur_group(longueur)

Return the category label for `longueur`, in the same way `age_group` does for years:
`"zyxw"` from 27 on, `"vuts"` from 23, `"rqpo"` from 20, `"nmlk"` from 18, and
`"jihg"` for anything below 18. Each label applies up to the next higher threshold.
"""
function longueur_group(longueur)
    longueur >= 27 && return "zyxw"
    longueur >= 23 && return "vuts"
    longueur >= 20 && return "rqpo"
    longueur >= 18 && return "nmlk"
    return "jihg"
end

# Add the Longueur_cat category column to both splits, derived from Longueur.
train[!, :Longueur_cat] = longueur_group.(train[!, :Longueur])
valid[!, :Longueur_cat] = longueur_group.(valid[!, :Longueur]);

In [None]:
# Histogram of the 'Longueur_cat' column of the training split
Gadfly.plot(train, x=:Longueur_cat, Geom.histogram)

In [None]:
"""
    power_group(power)

Return the category label for `power`, in the same way `age_group` does for years:
`"abcd"` from 300 on, `"efgh"` from 175, `"ijkl"` from 100, and `"mnop"` for
anything below 100. Each label applies up to the next higher threshold.
"""
function power_group(power)
    return power >= 300 ? "abcd" :
           power >= 175 ? "efgh" :
           power >= 100 ? "ijkl" : "mnop"
end

# Add the power_cat category column to both splits, derived from Puissance_Moteur.
train[!, :power_cat] = power_group.(train[!, :Puissance_Moteur])
valid[!, :power_cat] = power_group.(valid[!, :Puissance_Moteur]);    
    

In [None]:
# Histogram of the 'power_cat' column of the training split
Gadfly.plot(train, x=:power_cat, Geom.histogram)

## TEST

In [None]:
# List the column names currently present in the training split.
names(train)

In [None]:
"""
    fab_model_key(df)

Build, for every row of `df`, the composite string key
`Fabricant_Simplified_Modèle_Condition_Longueur_Poids_Année_Puissance_Moteur_Type`.

Fabricant, Simplified_Modèle and Condition are concatenated as they are; the
remaining columns are converted with `string` first. Keeping the expression in one
place guarantees that both splits build their keys identically.
"""
function fab_model_key(df)
    return df[!, :Fabricant] .* "_" .*
           df[!, :Simplified_Modèle] .* "_" .*
           df[!, :Condition] .* "_" .*
           string.(df[!, :Longueur]) .* "_" .*
           string.(df[!, :Poids]) .* "_" .*
           string.(df[!, :Année]) .* "_" .*
           string.(df[!, :Puissance_Moteur]) .* "_" .*
           string.(df[!, :Type])
end

# Report how many distinct simplified model names each split contains.
unique_data_modèles = unique(train[!, :Simplified_Modèle])
unique_valid_modèles = unique(valid[!, :Simplified_Modèle])
println("Number of unique modèle in train: ", length(unique_data_modèles))
println("Number of unique modèle in valid: ", length(unique_valid_modèles))
# Model names that occur in validation but not in training.
diff_modèles = setdiff(unique_valid_modèles, unique_data_modèles)

# Despite its name, the column holds the full eight-field composite key.
train[!, :Fab_Model_Longueur] = fab_model_key(train);
valid[!, :Fab_Model_Longueur] = fab_model_key(valid);

# From here on both variables hold the distinct composite keys, not model names.
unique_data_modèles = unique(train[!, :Fab_Model_Longueur])
unique_valid_modèles = unique(valid[!, :Fab_Model_Longueur]);

In [None]:
using StringDistances

"""
    closest_key_mapping(valid_keys, train_keys, threshold)

Return a `Dict{String, String}` that maps each key of `valid_keys` to the key of
`train_keys` with the highest normalized Levenshtein similarity, provided that
similarity is strictly greater than `threshold`. Keys without such a match are left
out of the dictionary. On ties the key that comes first in `train_keys` wins.
"""
function closest_key_mapping(valid_keys, train_keys, threshold)
    result = Dict{String, String}()
    known_keys = Set(train_keys)
    distance = StringDistances.Levenshtein()

    for valid_key in valid_keys
        # An identical key has similarity 1.0, the highest possible score, so the
        # scan over every training key can be skipped.
        if threshold < 1 && valid_key in known_keys
            result[valid_key] = valid_key
            continue
        end

        best_match = nothing
        best_similarity = 0.0
        for train_key in train_keys
            similarity = compare(valid_key, train_key, distance)
            if similarity > threshold && similarity > best_similarity
                best_match = train_key
                best_similarity = similarity
            end
        end

        if !isnothing(best_match)
            result[valid_key] = best_match
        end
    end

    return result
end

threshold = 0.2
# Running the search inside a function keeps the nested loops off untyped globals.
mapping = closest_key_mapping(unique_valid_modèles, unique_data_modèles, threshold);

In [None]:
# Look `value` up in `mapping`; values without an entry are returned unchanged.
function replace_with_mapping(value)
    return haskey(mapping, value) ? mapping[value] : value
end

# Rewrite the validation keys so they line up with keys seen in training.
valid[!, :Fab_Model_Longueur] .= replace_with_mapping.(valid[!, :Fab_Model_Longueur]);
first(valid, 1)

In [None]:
# Linear model of Prix with the composite key as its only (categorical) predictor.
model = lm(@formula(Prix ~ Fab_Model_Longueur), train);

In [None]:
# Score the model on the validation split with RMSLE.
valid_prediction = predict(model, valid)
# mean() over a vector that contains `missing` is itself `missing`, which would leave
# the gaps unfilled; average only over the predictions that are present.
mean_prediction = mean(skipmissing(valid_prediction))
valid_prediction = coalesce.(valid_prediction, mean_prediction)
# Round to whole numbers and clamp negative predictions to zero before rmsle takes
# log(1 + x) of them.
v = max.(round.(Int, valid_prediction), 0)
score = rmsle(v, valid.Prix)

original Alex : 0.49676563597360485

sans année : 0.5795805586630924

sans année + Age_cat + Longueur_cat : 0.49092335331666376