In [1]:
using DataFrames, DataFramesMeta
using CSV
using Statistics
using ScikitLearn

In [2]:
@sk_import linear_model: LinearRegression
@sk_import linear_model: ElasticNet

PyObject <class 'sklearn.linear_model.coordinate_descent.ElasticNet'>

In [86]:
KEY = "2018det"
COLUMN = :totalPoints

:totalPoints

In [87]:
function opr_ß(scores::DataFrame, teams::DataFrame, teamnums=nothing)
    if teamnums == nothing
        teamnums = teams.team1 |> unique |> sort
    end
    
    teamindex = Dict(t => i for (i, t) in enumerate(teamnums))

    ß = zeros(size(scores,1), size(teamnums,1))
    for p in [:team1, :team2, :team3]
        for i=1:size(teams,1)
            team = teams[i,p]
            j = teamindex[team]
            ß[i,j] = 1.0
        end
    end
    return (ß, teamnums, teamindex)
end

opr_ß (generic function with 3 methods)

In [88]:
headers = CSV.read("../data/2018_headers.csv") |> (h -> String.(names(h)))
push!(headers, "key")
push!(headers, "level")
push!(headers, "event")

scores = CSV.read("../data/matches_$KEY.csv", header=headers)
teams = CSV.read("../data/matches_teams_$KEY.csv", header=["team1","team2","team3","key","level","event"])

scores_train = @linq scores |> where(:level .== "qm")
teams_train = @linq teams |> where(:level .== "qm")

scores_test = @linq scores |> where(:level .!= "qm")
teams_test = @linq teams |> where(:level .!= "qm")

"""
teamnums = teams_train.team1 |> unique |> sort
teamindex = Dict(t => i for (i, t) in enumerate(teamnums))

ß = zeros(size(scores_train,1), size(teamnums,1))
for p in [:team1, :team2, :team3]
    for i=1:size(teams_train,1)
        team = teams_train[i,p]
        j = teamindex[team]
        ß[i,j] = 1.0
    end
end
"""

(ß, teamnums, teamindex) = opr_ß(scores_train, teams_train)
y = convert.(Float64, coalesce(scores_train[COLUMN], -1.0))

1356-element Array{Float64,1}:
 292.0
 425.0
 403.0
 197.0
 317.0
 369.0
 386.0
 243.0
 324.0
 437.0
 296.0
 442.0
 404.0
   ⋮  
 327.0
 370.0
 442.0
 283.0
 400.0
 313.0
 382.0
 276.0
 450.0
 267.0
 405.0
 218.0

In [89]:
model1 = ScikitLearn.fit!(LinearRegression(fit_intercept=false), ß, y)

PyObject LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
         normalize=False)

In [90]:
oprs = DataFrame(hcat(teamnums, model1[:coef_]), [:team, :rating])
#sort(oprs, (:rating), rev = true) |> print

Unnamed: 0_level_0,team,rating
Unnamed: 0_level_1,Any,Any
1,frc1018,132.53
2,frc1023,138.615
3,frc1024,140.688
4,frc1025,123.633
5,frc103,116.57
6,frc1071,115.18
7,frc1073,100.884
8,frc1076,92.5134
9,frc1100,121.227
10,frc111,74.5448


In [128]:
model2 = ScikitLearn.fit!(ElasticNet(alpha=0, l1_ratio=0, fit_intercept=true), ß, y)
oprs2 = DataFrame(hcat(teamnums, model2[:coef_]), [:team, :rating])
#sort(oprs2, (:rating), rev = true) |> print

Unnamed: 0_level_0,team,rating
Unnamed: 0_level_1,Any,Any
1,frc1018,19.9167
2,frc1023,26.0015
3,frc1024,28.0752
4,frc1025,11.02
5,frc103,3.9572
6,frc1071,2.56662
7,frc1073,-11.7296
8,frc1076,-20.0998
9,frc1100,8.61339
10,frc111,-38.0684


In [129]:
(ßtest, _, _) = opr_ß(scores_test, teams_test, teamnums)

([0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], Union{Missing, String}["frc1018", "frc1023", "frc1024", "frc1025", "frc103", "frc1071", "frc1073", "frc1076", "frc1100", "frc111"  …  "frc857", "frc862", "frc865", "frc868", "frc870", "frc88", "frc888", "frc894", "frc910", "frc977"], Dict("frc888"=>402,"frc1816"=>60,"frc2549"=>113,"frc1071"=>6,"frc6964"=>371,"frc2604"=>117,"frc291"=>139,"frc7048"=>376,"frc5339"=>289,"frc555"=>307…))

In [130]:
test_y = convert.(Float64, coalesce(scores_test[COLUMN], -1.0))
test_yhat_m1 = ScikitLearn.predict(model1, ßtest)
test_yhat_m2 = ScikitLearn.predict(model2, ßtest)

200-element Array{Float64,1}:
 356.4634845230824 
 469.8945103790024 
 356.4634845230824 
 469.8945103790024 
 478.4038875502631 
 396.2738478142097 
 445.924253341976  
 396.2738478142097 
 445.924253341976  
 396.2738478142097 
 356.4634845230824 
 413.23055699752   
 356.4634845230824 
   ⋮               
 466.47721702579304
 449.008953579127  
 466.47721702579304
 449.008953579127  
 466.66331521952486
 386.894678606519  
 466.66331521952486
 386.894678606519  
 434.2987333174266 
 466.47721702579304
 488.1537770916406 
 466.47721702579304

In [131]:
predictions_table = DataFrame(hcat(scores_test.key, test_y, test_yhat_m1, test_yhat_m2), [:key, :y, :LinReg, :RidgeReg])

Unnamed: 0_level_0,key,y,LinReg,RidgeReg
Unnamed: 0_level_1,Any,Any,Any,Any
1,2018arc_f1m1_red,356.0,356.463,356.463
2,2018arc_f1m1_blue,361.0,469.895,469.895
3,2018arc_f1m2_red,356.0,356.463,356.463
4,2018arc_f1m2_blue,397.0,469.895,469.895
5,2018arc_qf1m1_red,270.0,478.404,478.404
6,2018arc_qf1m1_blue,317.0,396.274,396.274
7,2018arc_qf1m2_red,455.0,445.924,445.924
8,2018arc_qf1m2_blue,303.0,396.274,396.274
9,2018arc_qf1m3_red,443.0,445.924,445.924
10,2018arc_qf1m3_blue,272.0,396.274,396.274


In [132]:
rmse_1 = mean((predictions_table.LinReg-predictions_table.y).^2) |> sqrt

128.15965838569656

In [133]:
rmse_2 = mean((predictions_table.RidgeReg-predictions_table.y).^2) |> sqrt

128.15965838569633