# Example usage

To use `aussie_car_prices` in a project:

In [33]:
from ols_regressor.regressor import LinearRegressor
from ols_regressor.cross_validate import cross_validate
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

In [23]:
data = pd.read_csv("../data/preprocessed_data.csv")
data.head()

Unnamed: 0,Brand,UsedOrNew,Transmission,DriveType,FuelType,BodyType,ExteriorColour,Year,Kilometres,Doors,Seats,Price,Engine_cylinder_number,Engine_total_volume,fuel_comsumption_liter,fuel_comsumption_km
0,Ssangyong,DEMO,Automatic,AWD,Diesel,SUV,White,2022,5595.0,4.0,7.0,51990.0,4.0,2.2,8.7,100.0
1,MG,USED,Automatic,Front,Premium,Hatchback,Black,2022,16.0,5.0,5.0,19990.0,4.0,1.5,6.7,100.0
2,BMW,USED,Automatic,Rear,Premium,Coupe,Grey,2022,8472.0,2.0,4.0,108988.0,4.0,2.0,6.6,100.0
3,Mercedes-Benz,USED,Automatic,Rear,Premium,Coupe,White,2011,136517.0,2.0,4.0,32990.0,8.0,5.5,11.0,100.0
4,Renault,USED,Automatic,Front,Unleaded,SUV,Grey,2022,1035.0,4.0,5.0,34990.0,4.0,1.3,6.0,100.0


In [24]:
data["fuel_comsumption_km"].value_counts()

fuel_comsumption_km
100.0    16734
Name: count, dtype: int64

In [25]:
data.columns

Index(['Brand', 'UsedOrNew', 'Transmission', 'DriveType', 'FuelType',
       'BodyType', 'ExteriorColour', 'Year', 'Kilometres', 'Doors', 'Seats',
       'Price', 'Engine_cylinder_number', 'Engine_total_volume',
       'fuel_comsumption_liter', 'fuel_comsumption_km'],
      dtype='object')

In [26]:
X, y = data.drop(columns=["Price"]), data["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [27]:
categorical_features = ['Brand', 'UsedOrNew', 'Transmission', 'DriveType', 'FuelType',
       'BodyType', 'ExteriorColour']
ordinal_features = ['Doors', 'Seats', 'Engine_cylinder_number']
numeric_features = ['Year', 'Kilometres', 'Engine_total_volume', 'fuel_comsumption_liter']
drop_features = ['fuel_comsumption_liter']
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features),
    (OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=999), ordinal_features),
    (StandardScaler(), numeric_features),
    ("drop", drop_features)
)
X_train_encoded = ct.fit_transform(X_train)
X_train_encoded

array([[ 0.        ,  0.        ,  0.        , ...,  1.49185096,
         1.39327292,  1.50962043],
       [ 0.        ,  0.        ,  0.        , ..., -0.4465423 ,
        -0.69119385, -0.71342509],
       [ 0.        ,  0.        ,  0.        , ..., -0.51867241,
        -1.03860498, -1.75689544],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.00518898,
        -0.45958643, -0.62268854],
       [ 0.        ,  0.        ,  0.        , ...,  0.65774406,
        -0.45958643, -0.84952992],
       [ 0.        ,  0.        ,  0.        , ..., -0.22674326,
        -1.27021239,  0.37541353]])

In [28]:
X_test_encoded = ct.transform(X_test)
X_train_encoded

array([[ 0.        ,  0.        ,  0.        , ...,  1.49185096,
         1.39327292,  1.50962043],
       [ 0.        ,  0.        ,  0.        , ..., -0.4465423 ,
        -0.69119385, -0.71342509],
       [ 0.        ,  0.        ,  0.        , ..., -0.51867241,
        -1.03860498, -1.75689544],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.00518898,
        -0.45958643, -0.62268854],
       [ 0.        ,  0.        ,  0.        , ...,  0.65774406,
        -0.45958643, -0.84952992],
       [ 0.        ,  0.        ,  0.        , ..., -0.22674326,
        -1.27021239,  0.37541353]])

In [55]:
model = LinearRegressor()
model.fit(X_train_encoded, y_train)
model.score(X_train_encoded, y_train)

-2916815.061663288

In [36]:
model.coef

array([ 2.54024347e+20,  2.54024347e+20,  2.54024347e+20,  2.54024347e+20,
        2.54024347e+20,  2.54024347e+20,  2.54024347e+20,  1.08775639e+22,
        2.54024347e+20,  2.54024347e+20,  2.54024347e+20,  2.54024347e+20,
        2.54024347e+20,  2.54024347e+20,  2.54024347e+20,  2.54024347e+20,
        2.54024347e+20,  2.54024347e+20,  2.54024347e+20,  2.54024347e+20,
        2.54024347e+20,  2.54024347e+20,  2.54024347e+20,  2.54024347e+20,
        2.54024347e+20,  2.54024347e+20,  2.54024347e+20,  2.54024347e+20,
        2.54024347e+20,  2.54024347e+20,  2.54024347e+20,  2.54024347e+20,
        2.54024347e+20,  2.54024347e+20,  2.54024347e+20,  1.08775639e+22,
        2.54024347e+20,  2.54024347e+20,  2.54024347e+20,  2.54024347e+20,
        2.54024347e+20,  2.54024347e+20,  2.54024347e+20,  2.54024347e+20,
        2.54024347e+20,  2.54024347e+20,  2.54024347e+20,  2.54024347e+20,
        2.54024347e+20,  2.54024347e+20,  2.54024347e+20,  2.54024347e+20,
        2.54024347e+20,  

In [53]:
model = LinearRegression()
# model.fit(X_train_encoded, y_train)
# model.score(X_train_encoded, y_train)

In [47]:
y_train = y_train.to_numpy()

In [54]:
cv_results = cross_validate(model, X_train_encoded, y_train, 5, 42)
pd.DataFrame(cv_results)

Unnamed: 0,train_score,test_score,fit_time,score_time
0,0.693632,-6.314807e+18,0.073595,0.000639
1,0.708397,-9.959198e+19,0.130337,0.000712
2,0.688486,-5.523258e+18,0.138305,0.000667
3,0.746332,-8.100743e+19,0.12569,0.000623
4,0.693517,-9.473059e+18,0.06796,0.000566
5,0.69743,-20.12066,0.080571,0.000344


In [56]:
model.predict(X_test_encoded)

array([ 868336.4691848 ,  966667.47620837, -106385.49803855, ...,
        737210.45513767,  597910.44577291, -507973.54486233])

In [57]:
model.score(X_test_encoded, y_test)

-1.0529366934775053e+28