In [1]:
using DataFrames, DataFramesMeta
using CSV
using Statistics
using ScikitLearn

In [2]:
@sk_import linear_model: LinearRegression
@sk_import linear_model: ElasticNet

PyObject <class 'sklearn.linear_model.coordinate_descent.ElasticNet'>

In [24]:
KEY = "2018chs"
COLUMN = :totalPoints

:totalPoints

In [25]:
function opr_ß(scores::DataFrame, teams::DataFrame, teamnums=nothing)
    if teamnums == nothing
        teamnums = teams.team1 |> unique |> sort
    end
    
    teamindex = Dict(t => i for (i, t) in enumerate(teamnums))

    ß = zeros(size(scores,1), size(teamnums,1))
    for p in [:team1, :team2, :team3]
        for i=1:size(teams,1)
            team = teams[i,p]
            j = teamindex[team]
            ß[i,j] = 1.0
        end
    end
    return (ß, teamnums, teamindex)
end

opr_ß (generic function with 3 methods)

In [26]:
headers = CSV.read("../data/2018_headers.csv") |> (h -> String.(names(h)))
push!(headers, "key")
push!(headers, "level")
push!(headers, "event")

scores = CSV.read("../data/matches_$KEY.csv", header=headers)
teams = CSV.read("../data/matches_teams_$KEY.csv", header=["team1","team2","team3","key","level","event"])

scores_train = @linq scores |> where(:level .== "qm")
teams_train = @linq teams |> where(:level .== "qm")

scores_test = @linq scores |> where(:level .!= "qm")
teams_test = @linq teams |> where(:level .!= "qm")

"""
teamnums = teams_train.team1 |> unique |> sort
teamindex = Dict(t => i for (i, t) in enumerate(teamnums))

ß = zeros(size(scores_train,1), size(teamnums,1))
for p in [:team1, :team2, :team3]
    for i=1:size(teams_train,1)
        team = teams_train[i,p]
        j = teamindex[team]
        ß[i,j] = 1.0
    end
end
"""

(ß, teamnums, teamindex) = opr_ß(scores_train, teams_train)
y = convert.(Float64, coalesce(scores_train[COLUMN], -1.0))

1260-element Array{Float64,1}:
 427.0
 234.0
  95.0
 386.0
 215.0
 322.0
 167.0
 445.0
 140.0
 340.0
 327.0
 155.0
 379.0
   ⋮  
 435.0
 240.0
 366.0
 304.0
 421.0
 213.0
 186.0
 330.0
 302.0
 311.0
 194.0
 246.0

In [27]:
model1 = ScikitLearn.fit!(LinearRegression(fit_intercept=false), ß, y)

PyObject LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
         normalize=False)

In [28]:
oprs = DataFrame(hcat(teamnums, model1[:coef_]), [:team, :rating])
#sort(oprs, (:rating), rev = true) |> print

Unnamed: 0_level_0,team,rating
Unnamed: 0_level_1,Any,Any
1,frc1080,109.168
2,frc1086,114.351
3,frc1111,89.9161
4,frc1123,56.7184
5,frc1137,57.2592
6,frc116,87.0477
7,frc122,115.127
8,frc1262,156.803
9,frc1389,74.9499
10,frc141,67.9638


In [29]:
model2 = ScikitLearn.fit!(ElasticNet(alpha=0.25, l1_ratio=0.1, fit_intercept=true), ß, y)
oprs2 = DataFrame(hcat(teamnums, model2[:coef_]), [:team, :rating])
#sort(oprs2, (:rating), rev = true) |> print

Unnamed: 0_level_0,team,rating
Unnamed: 0_level_1,Any,Any
1,frc1080,0.0124507
2,frc1086,2.9221
3,frc1111,0.134462
4,frc1123,-2.25417
5,frc1137,-3.70896
6,frc116,0.577415
7,frc122,2.10781
8,frc1262,8.08552
9,frc1389,-2.74105
10,frc141,-0.707339


In [30]:
(ßtest, _, _) = opr_ß(scores_test, teams_test, teamnums)

([0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 1.0; 0.0 0.0 … 0.0 0.0], Union{Missing, String}["frc1080", "frc1086", "frc1111", "frc1123", "frc1137", "frc116", "frc122", "frc1262", "frc1389", "frc141"  …  "frc6584", "frc6802", "frc686", "frc6863", "frc6882", "frc6893", "frc7330", "frc836", "frc888", "frc977"], Dict("frc888"=>129,"frc1086"=>2,"frc2421"=>35,"frc3359"=>53,"frc3941"=>66,"frc5724"=>95,"frc5546"=>92,"frc4242"=>71,"frc4514"=>80,"frc1418"=>12…))

In [31]:
test_y = convert.(Float64, coalesce(scores_test[COLUMN], -1.0))
test_yhat_m1 = ScikitLearn.predict(model1, ßtest)
test_yhat_m2 = ScikitLearn.predict(model2, ßtest)

250-element Array{Float64,1}:
 297.5437862481179 
 290.7305511661394 
 297.5437862481179 
 290.7305511661394 
 305.61120961181933
 297.5437862481179 
 305.61120961181933
 297.5437862481179 
 298.3195540768279 
 295.8307444778829 
 298.3195540768279 
 295.8307444778829 
 296.14557940004784
   ⋮               
 285.1451784953888 
 274.4095206583749 
 287.18857905954997
 284.59310367461785
 287.18857905954997
 284.59310367461785
 287.18857905954997
 284.59310367461785
 290.5355520132392 
 285.1451784953888 
 290.5355520132392 
 285.1451784953888 

In [32]:
predictions_table = DataFrame(hcat(scores_test.key, test_y, test_yhat_m1, test_yhat_m2), [:key, :y, :LinReg, :RidgeReg])

Unnamed: 0_level_0,key,y,LinReg,RidgeReg
Unnamed: 0_level_1,Any,Any,Any,Any
1,2018chcmp_f1m1_red,436.0,396.524,297.544
2,2018chcmp_f1m1_blue,221.0,358.901,290.731
3,2018chcmp_f1m2_red,423.0,396.524,297.544
4,2018chcmp_f1m2_blue,202.0,358.901,290.731
5,2018chcmp_qf1m1_red,335.0,470.841,305.611
6,2018chcmp_qf1m1_blue,342.0,396.524,297.544
7,2018chcmp_qf1m2_red,275.0,470.841,305.611
8,2018chcmp_qf1m2_blue,411.0,396.524,297.544
9,2018chcmp_qf2m1_red,466.0,412.58,298.32
10,2018chcmp_qf2m1_blue,305.0,389.188,295.831


In [33]:
rmse_1 = mean((predictions_table.LinReg-predictions_table.y).^2) |> sqrt

86.8726166924313

In [34]:
rmse_2 = mean((predictions_table.RidgeReg-predictions_table.y).^2) |> sqrt

89.46248080737458