# Example usage

To use `aussie_car_prices` in a project:

In [1]:
from ols_regressor.regressor import LinearRegressor
from ols_regressor.cross_validate import cross_validate
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("../data/preprocessed_data.csv")
data.head()

Unnamed: 0,Brand,UsedOrNew,Transmission,DriveType,FuelType,BodyType,ExteriorColour,Year,Kilometres,Doors,Seats,Price,Engine_cylinder_number,Engine_total_volume,fuel_comsumption_liter,fuel_comsumption_km
0,Ssangyong,DEMO,Automatic,AWD,Diesel,SUV,White,2022,5595.0,4.0,7.0,51990.0,4.0,2.2,8.7,100.0
1,MG,USED,Automatic,Front,Premium,Hatchback,Black,2022,16.0,5.0,5.0,19990.0,4.0,1.5,6.7,100.0
2,BMW,USED,Automatic,Rear,Premium,Coupe,Grey,2022,8472.0,2.0,4.0,108988.0,4.0,2.0,6.6,100.0
3,Mercedes-Benz,USED,Automatic,Rear,Premium,Coupe,White,2011,136517.0,2.0,4.0,32990.0,8.0,5.5,11.0,100.0
4,Renault,USED,Automatic,Front,Unleaded,SUV,Grey,2022,1035.0,4.0,5.0,34990.0,4.0,1.3,6.0,100.0


In [3]:
data["fuel_comsumption_km"].value_counts()

fuel_comsumption_km
100.0    16734
Name: count, dtype: int64

In [4]:
data.columns

Index(['Brand', 'UsedOrNew', 'Transmission', 'DriveType', 'FuelType',
       'BodyType', 'ExteriorColour', 'Year', 'Kilometres', 'Doors', 'Seats',
       'Price', 'Engine_cylinder_number', 'Engine_total_volume',
       'fuel_comsumption_liter', 'fuel_comsumption_km'],
      dtype='object')

In [5]:
X, y = data.drop(columns=["Price"]), data["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
categorical_features = ['Brand', 'UsedOrNew', 'Transmission', 'DriveType', 'FuelType',
       'BodyType', 'ExteriorColour']
ordinal_features = ['Doors', 'Seats', 'Engine_cylinder_number']
numeric_features = ['Year', 'Kilometres', 'Engine_total_volume', 'fuel_comsumption_liter']
drop_features = ['fuel_comsumption_liter']
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features),
    (OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=999), ordinal_features),
    (StandardScaler(), numeric_features),
    ("drop", drop_features)
)
X_train_encoded = ct.fit_transform(X_train)
X_train_encoded

array([[ 0.        ,  0.        ,  0.        , ...,  1.49185096,
         1.39327292,  1.50962043],
       [ 0.        ,  0.        ,  0.        , ..., -0.4465423 ,
        -0.69119385, -0.71342509],
       [ 0.        ,  0.        ,  0.        , ..., -0.51867241,
        -1.03860498, -1.75689544],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.00518898,
        -0.45958643, -0.62268854],
       [ 0.        ,  0.        ,  0.        , ...,  0.65774406,
        -0.45958643, -0.84952992],
       [ 0.        ,  0.        ,  0.        , ..., -0.22674326,
        -1.27021239,  0.37541353]])

In [7]:
X_test_encoded = ct.transform(X_test)
X_train_encoded

array([[ 0.        ,  0.        ,  0.        , ...,  1.49185096,
         1.39327292,  1.50962043],
       [ 0.        ,  0.        ,  0.        , ..., -0.4465423 ,
        -0.69119385, -0.71342509],
       [ 0.        ,  0.        ,  0.        , ..., -0.51867241,
        -1.03860498, -1.75689544],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.00518898,
        -0.45958643, -0.62268854],
       [ 0.        ,  0.        ,  0.        , ...,  0.65774406,
        -0.45958643, -0.84952992],
       [ 0.        ,  0.        ,  0.        , ..., -0.22674326,
        -1.27021239,  0.37541353]])

In [8]:
model = LinearRegressor()
model.fit(X_train_encoded, y_train)
model.score(X_train_encoded, y_train)

-387091.894703509

In [9]:
model.coef

array([ 4.42918218e+19,  4.42918218e+19,  4.42918218e+19,  4.42918218e+19,
        4.42918218e+19,  4.42918218e+19,  4.42918218e+19,  1.05664723e+22,
        4.42918218e+19,  4.42918218e+19,  4.42918218e+19,  4.42918218e+19,
        4.42918218e+19,  4.42918218e+19,  4.42918218e+19,  4.42918218e+19,
        4.42918218e+19,  4.42918218e+19,  4.42918218e+19,  4.42918218e+19,
        4.42918218e+19,  4.42918218e+19,  4.42918218e+19,  4.42918218e+19,
        4.42918218e+19,  4.42918218e+19,  4.42918218e+19,  4.42918218e+19,
        4.42918218e+19,  4.42918218e+19,  4.42918218e+19,  4.42918218e+19,
        4.42918218e+19,  4.42918218e+19,  4.42918218e+19,  1.05664723e+22,
        4.42918218e+19,  4.42918218e+19,  4.42918218e+19,  4.42918218e+19,
        4.42918218e+19,  4.42918218e+19,  4.42918218e+19,  4.42918218e+19,
        4.42918218e+19,  4.42918218e+19,  4.42918218e+19,  4.42918218e+19,
        4.42918218e+19,  4.42918218e+19,  4.42918218e+19,  4.42918218e+19,
        4.42918218e+19,  

In [10]:
model = LinearRegression()
# model.fit(X_train_encoded, y_train)
# model.score(X_train_encoded, y_train)

In [11]:
y_train = y_train.to_numpy()

In [12]:
cv_results = cross_validate(model, X_train_encoded, y_train, 5, 42)
pd.DataFrame(cv_results)

Unnamed: 0,train_score,test_score,fit_time,score_time
0,0.693631,-3.291663e+18,0.066387,0.000577
1,0.708387,-7.799086e+19,0.058461,0.000544
2,0.688458,-4.924776e+17,0.054068,0.000684
3,0.746368,-2.566022e+20,0.057634,0.000543
4,0.693523,-8.215433e+18,0.067344,0.000506
5,0.697379,-19.22348,0.064658,0.000285


### Predicting with the Fitted Model
Now that our regression model has been fitted, it is time to utilize it for making predictions on unseen data. The `predict` function within the `ols_regressor` package has been designed for this purpose. This function expects an array-like matrix X of shape (n_samples, n_features) as input, so that we can compute the predicted target values with the coefficients stored in the `self.coef` attribute. The `predict` function will return an array contains the model's predictions based on the provided input features.

The use of this function is demonstrated below.

In [13]:
model.predict(X_test_encoded)

array([57856., 72576., 69120., ..., 42880., 33952., 23520.])

In [14]:
model.score(X_test_encoded, y_test)

0.40563675363320206