In [6]:
using DataFrames
using CSV
using Parquet2
using Statistics
using Dates
using MLJ

"""
    import_data(file::String)

Open the Parquet dataset at `file` with `Parquet2.Dataset` and return it as a
`DataFrame`.

`copycols=false` is forwarded to the `DataFrame` constructor, so the constructor is
asked not to copy the dataset's columns.
"""
function import_data(file::String)
    return DataFrame(Parquet2.Dataset(file); copycols=false)
end

# Load `yellow_tripdata_2022-01.parquet` into a DataFrame bound to the global
# `yellow_2022_01`; the calls further down mutate this DataFrame in place.
yellow_2022_01 = import_data("yellow_tripdata_2022-01.parquet")

"""
    impute_median(df::DataFrame, cols=[:passenger_count, :airport_fee,
                                       :congestion_surcharge, :RatecodeID])

Replace `missing` entries in each column of `cols` with that column's median,
computed over its non-missing values. `df` is modified in place.

# Arguments
- `df`: the table to modify.
- `cols`: columns to impute. Defaults to the four columns this notebook imputes.

# Returns
`nothing`.

A column that has no non-missing values is left untouched and a warning is logged,
because the median of an empty collection is undefined and would otherwise throw.
"""
function impute_median(
    df::DataFrame,
    cols::AbstractVector=[:passenger_count, :airport_fee, :congestion_surcharge, :RatecodeID],
)
    for col in cols
        observed = skipmissing(df[!, col])
        if isempty(observed)
            @warn "Column has no non-missing values; skipping median imputation" col
            continue
        end
        # `coalesce.` keeps existing values and substitutes the median for `missing`.
        df[!, col] = coalesce.(df[!, col], median(observed))
    end
    return nothing
end

# function impute_median(df::DataFrame)
#     median_cols = [:passenger_count, :airport_fee, :congestion_surcharge, :RatecodeID]
#     median_passenger_count = median(skipmissing(df[!, :passenger_count]))
#     for col in median_cols
#         df[!, col] = replace.(df[!, col], missing => ] )
#     end
# end

# Fill missing values in the columns listed inside `impute_median` with each
# column's median; `yellow_2022_01` is modified in place.
impute_median(yellow_2022_01)



"""
    create_date_features(df::DataFrame)

Add calendar columns derived from `df.tpep_pickup_datetime` to `df` in place.

The columns `pickup_year`, `pickup_month`, `pickup_day`, `pickup_hour`,
`pickup_minute`, `pickup_second`, `pickup_weekday` and `pickup_week` are created, in
that order, by broadcasting the matching `Dates` accessor over the pickup column.

Returns the vector stored in `pickup_week` (the value of the final column
assignment).
"""
function create_date_features(df::DataFrame)
    pickup = df.tpep_pickup_datetime

    df[!, :pickup_year] = year.(pickup)
    df[!, :pickup_month] = month.(pickup)
    df[!, :pickup_day] = day.(pickup)
    df[!, :pickup_hour] = hour.(pickup)
    df[!, :pickup_minute] = minute.(pickup)
    df[!, :pickup_second] = second.(pickup)
    df[!, :pickup_weekday] = dayname.(pickup)

    weeks = week.(pickup)
    df[!, :pickup_week] = weeks
    return weeks
end


# Add the pickup_* calendar columns derived from `tpep_pickup_datetime`;
# `yellow_2022_01` is modified in place.
create_date_features(yellow_2022_01)

"""
    drop_cols(df::DataFrame, cols=[:VendorID, :tpep_pickup_datetime,
                                   :tpep_dropoff_datetime])

Remove the columns listed in `cols` from `df` in place.

# Arguments
- `df`: the table to modify.
- `cols`: columns to remove. Defaults to the three columns this notebook discards.

# Returns
`nothing`.

All columns are removed with a single `select!` call, so a column name that does not
exist raises before `df` has been modified at all.
"""
function drop_cols(
    df::DataFrame,
    cols::AbstractVector=[:VendorID, :tpep_pickup_datetime, :tpep_dropoff_datetime],
)
    select!(df, Not(cols))
    return nothing
end

drop_cols(yellow_2022_01)

# Only an explicit `display` (or the last expression of a cell) is rendered, so the
# summary is wrapped in `display` instead of being computed and discarded.
display(describe(yellow_2022_01))


# TODO: one-hot encode store_and_fwd_flag. VendorID is currently removed by
# `drop_cols` above, so it must be kept if it is to be encoded as well.




LinearRegressor = @load LinearRegressor pkg=MLJLinearModels

# Same as above: show the scientific-type schema rather than discarding it.
display(schema(yellow_2022_01))

display(yellow_2022_01)

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\rhys-\.julia\packages\MLJModels\gJoE1\src\loading.jl:159


import MLJLinearModels

Row,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,pickup_weekday,pickup_week
Unnamed: 0_level_1,Float64,Float64?,Float64,String?,Int64?,Int64?,Int64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64,Float64,Int64,Int64,Int64,Int64,Int64,Int64,String,Int64
1,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,2022,1,1,0,35,40,Saturday,52
2,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,2022,1,1,0,33,43,Saturday,52
3,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,2022,1,1,0,53,21,Saturday,52
4,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,2022,1,1,0,25,21,Saturday,52
5,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,2022,1,1,0,36,48,Saturday,52
6,1.0,10.3,1.0,N,138,161,1,33.0,3.0,0.5,13.0,6.55,0.3,56.35,2.5,0.0,2022,1,1,0,40,15,Saturday,52
7,1.0,5.07,1.0,N,233,87,1,17.0,0.5,0.5,5.2,0.0,0.3,26.0,2.5,0.0,2022,1,1,0,20,50,Saturday,52
8,1.0,2.02,1.0,N,238,152,2,9.0,0.5,0.5,0.0,0.0,0.3,12.8,2.5,0.0,2022,1,1,0,13,4,Saturday,52
9,1.0,2.71,1.0,N,166,236,1,12.0,0.5,0.5,2.25,0.0,0.3,18.05,2.5,0.0,2022,1,1,0,30,2,Saturday,52
10,1.0,0.78,1.0,N,236,141,2,5.0,0.5,0.5,0.0,0.0,0.3,8.8,2.5,0.0,2022,1,1,0,48,52,Saturday,52


 ✔


In [35]:
using DataFrames
"""
    preprocess(df::DataFrame, predicitors::Vector)

Return a new `DataFrame` holding only the columns named in `predicitors`, with
columns of scientific type `Union{Missing, Continuous}` coerced to `Continuous`.

The column subset is taken with `df[:, predicitors]`, and `coerce!` is applied to
that subset with `tight=true`.
"""
function preprocess(df::DataFrame, predicitors::Vector)
    selected = df[:, predicitors]
    return coerce!(selected, Union{Missing, Continuous} => Continuous; tight=true)
end

"""
    train_test_split(df::DataFrame; target=:trip_distance, fraction=0.7)

Split `df` into a target vector and a feature table, and partition the row indices
into a training and a test set.

# Arguments
- `df`: the table to split. This argument is what gets unpacked; no global table is
  consulted.

# Keywords
- `target::Symbol`: name of the column returned as `y`. Defaults to `:trip_distance`.
- `fraction::Real`: share of the rows assigned to the training indices. Defaults to
  `0.7`.

# Returns
A tuple `(y, X, train, test)`: `y` is the target column, `X` holds every other
column, and `train`/`test` are the index sets produced by `partition`. No `shuffle`
or `rng` option is passed to `partition`.
"""
function train_test_split(
    df::DataFrame;
    target::Symbol=:trip_distance,
    fraction::Real=0.7,
)
    # First predicate picks the target column; the second accepts all remaining ones.
    y, X = MLJ.unpack(df, ==(target), colname -> true)
    train, test = partition(eachindex(y), fraction)
    return y, X, train, test
end



"""
    linear_regression_model(X=X, y=y)

Fit the `LinearRegressor` model from the GLM package on `X` and `y`, and return the
fitted parameters together with the training report.

# Arguments
- `X`: feature table. Defaults to the global `X`.
- `y`: target vector. Defaults to the global `y`.

# Returns
A tuple `(fitted_param, reports)` holding `fitted_params(mach)` and `report(mach)`
for the trained machine.
"""
function linear_regression_model(X=X, y=y)
    # Default expressions are evaluated in the enclosing scope, so `X=X` / `y=y` pick
    # up the globals of the same name. Calling with no arguments therefore behaves as
    # before, while callers can now pass the data explicitly.
    OLS = @load LinearRegressor pkg=GLM
    ols = OLS()
    mach = fit!(machine(ols, X, y))
    return fitted_params(mach), report(mach)
end

# evaluate(LinearRegressor(), X, y, resampling=CV(nfolds=3), measure=rms)
  
# Bind the loaded model type here so that `LinearRegressor()` below does not depend on
# an assignment made in a different cell.
LinearRegressor = @load LinearRegressor pkg=MLJLinearModels

# NOTE: this overwrites the globals `X` and `y`.
X, y = @load_boston
mdl = LinearRegressor()
mach = machine(mdl, X, y)
fit!(mach)
params = fitted_params(mach)

# `println` keeps the two values on separate lines instead of running them together.
println(params.coefs) # coefficient of the regression with names
println(params.intercept) # intercept


linear_regression_model (generic function with 1 method)

In [36]:
# Keep only the target (trip_distance) and the two predictor columns, coerced by
# `preprocess`.
processed_df = preprocess(yellow_2022_01, [:trip_distance, :total_amount, :passenger_count])

# Assigns the globals `y`, `X`, `train` and `test`.
y, X, train,test = train_test_split(processed_df)

# Called with no arguments: the model is fitted on the globals `X` and `y` assigned on
# the previous line.
fitted_param, reports = linear_regression_model()

import MLJGLMInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\rhys-\.julia\packages\MLJModels\gJoE1\src\loading.jl:159
┌ Info: Training machine(LinearRegressor(fit_intercept = true, …), …).
└ @ MLJBase C:\Users\rhys-\.julia\packages\MLJBase\uxwHr\src\machines.jl:492


((features = [:total_amount, :passenger_count], coef = [0.001084711977410174, -0.813943926507406], intercept = 6.473695923763009), (stderror = [0.0013635945500892913, 0.3595278532803166, 0.606619173038701], dof_residual = 2.463928e6, vcov = [1.859390097033217e-6 -4.4120167850253396e-7 -3.5035294204278783e-5; -4.4120167850253396e-7 0.12926027728435288 -0.17813177679333228; -3.5035294204278783e-5 -0.17813177679333228 0.3679868210981575], deviance = 7.395790800100994e11, coef_table = ───────────────────────────────────────────────────────────────────────────────────
                       Coef.  Std. Error      t  Pr(>|t|)    Lower 95%    Upper 95%
───────────────────────────────────────────────────────────────────────────────────
total_amount      0.00108471  0.00136359   0.80    0.4263  -0.00158789   0.00375731
passenger_count  -0.813944    0.359528    -2.26    0.0236  -1.51861     -0.109282
(Intercept)       6.4737      0.606619    10.67    <1e-25   5.28474      7.66265
───────────────

(name = LinearRegressor, package_name = GLM, ... )