# Example usage

To use `ols_regressor` in a project:

In [1]:
from ols_regressor.regressor import LinearRegressor
from ols_regressor.cross_validate import cross_validate
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("../data/preprocessed_data.csv")
data.head()

Unnamed: 0,Brand,UsedOrNew,Transmission,DriveType,FuelType,BodyType,ExteriorColour,Year,Kilometres,Doors,Seats,Price,Engine_cylinder_number,Engine_total_volume,fuel_comsumption_liter,fuel_comsumption_km
0,Ssangyong,DEMO,Automatic,AWD,Diesel,SUV,White,2022,5595.0,4.0,7.0,51990.0,4.0,2.2,8.7,100.0
1,MG,USED,Automatic,Front,Premium,Hatchback,Black,2022,16.0,5.0,5.0,19990.0,4.0,1.5,6.7,100.0
2,BMW,USED,Automatic,Rear,Premium,Coupe,Grey,2022,8472.0,2.0,4.0,108988.0,4.0,2.0,6.6,100.0
3,Mercedes-Benz,USED,Automatic,Rear,Premium,Coupe,White,2011,136517.0,2.0,4.0,32990.0,8.0,5.5,11.0,100.0
4,Renault,USED,Automatic,Front,Unleaded,SUV,Grey,2022,1035.0,4.0,5.0,34990.0,4.0,1.3,6.0,100.0


In [3]:
data["fuel_comsumption_km"].value_counts()

fuel_comsumption_km
100.0    16734
Name: count, dtype: int64

In [4]:
data.columns

Index(['Brand', 'UsedOrNew', 'Transmission', 'DriveType', 'FuelType',
       'BodyType', 'ExteriorColour', 'Year', 'Kilometres', 'Doors', 'Seats',
       'Price', 'Engine_cylinder_number', 'Engine_total_volume',
       'fuel_comsumption_liter', 'fuel_comsumption_km'],
      dtype='object')

In [5]:
X, y = data.drop(columns=["Price"]), data["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
categorical_features = ['Brand', 'UsedOrNew', 'Transmission', 'DriveType', 'FuelType',
       'BodyType', 'ExteriorColour']
ordinal_features = ['Doors', 'Seats', 'Engine_cylinder_number']
numeric_features = ['Year', 'Kilometres', 'Engine_total_volume', 'fuel_comsumption_liter']
drop_features = ['fuel_comsumption_liter']
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features),
    (OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=999), ordinal_features),
    (StandardScaler(), numeric_features),
    ("drop", drop_features)
)
X_train_encoded = ct.fit_transform(X_train)
X_train_encoded

array([[ 0.        ,  0.        ,  0.        , ...,  1.49185096,
         1.39327292,  1.50962043],
       [ 0.        ,  0.        ,  0.        , ..., -0.4465423 ,
        -0.69119385, -0.71342509],
       [ 0.        ,  0.        ,  0.        , ..., -0.51867241,
        -1.03860498, -1.75689544],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.00518898,
        -0.45958643, -0.62268854],
       [ 0.        ,  0.        ,  0.        , ...,  0.65774406,
        -0.45958643, -0.84952992],
       [ 0.        ,  0.        ,  0.        , ..., -0.22674326,
        -1.27021239,  0.37541353]])

In [7]:
X_test_encoded = ct.transform(X_test)
X_train_encoded

array([[ 0.        ,  0.        ,  0.        , ...,  1.49185096,
         1.39327292,  1.50962043],
       [ 0.        ,  0.        ,  0.        , ..., -0.4465423 ,
        -0.69119385, -0.71342509],
       [ 0.        ,  0.        ,  0.        , ..., -0.51867241,
        -1.03860498, -1.75689544],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.00518898,
        -0.45958643, -0.62268854],
       [ 0.        ,  0.        ,  0.        , ...,  0.65774406,
        -0.45958643, -0.84952992],
       [ 0.        ,  0.        ,  0.        , ..., -0.22674326,
        -1.27021239,  0.37541353]])

### Fitting the model on the training data
The `fit` function in the `ols_regressor` package will calculate the coefficients for the linear regression model using the Ordinary Least Squares (OLS) method. It converts the input features and target values into NumPy arrays. The function then augments the feature matrix with an intercept term and computes the model coefficients using the OLS formula. The resulting coefficients are stored in the `self.coef` attribute, representing the weights that minimize the sum of squared differences between the predicted and actual target values.

The use of this function is demonstrated below.




In [8]:
model = LinearRegressor()
model.fit(X_train_encoded, y_train)

array([ 3.74884737e+04, -6.49553047e+01,  6.70989310e+01,  3.91987703e+03,
        1.68603441e+03,  2.07425208e+03, -4.35179052e+02,  5.36203410e+03,
       -6.73988962e+02, -5.09200008e+02,  2.00733082e+03, -5.14600919e+02,
       -2.91049832e+02,  2.32108854e+02,  1.73765226e+02, -7.10792758e+02,
        4.30511293e+02, -3.70337733e+02,  2.94562712e+02,  1.14678396e+04,
       -3.66827754e+02, -4.88255149e+02, -4.02263542e+02, -1.94323863e+03,
       -3.50529029e+02,  8.27555155e+02, -6.12315985e+02,  1.07995950e+03,
       -6.99965560e+02, -1.61958248e+03, -2.04318142e+03,  1.27363552e+02,
        4.88562079e+02, -9.28950153e+02,  3.98818342e+02, -1.88385867e+02,
        2.41400768e+03, -4.26962716e+02, -2.37074030e+03,  8.31800638e+02,
       -7.85142068e+02, -1.05629631e+03, -1.68746127e+03,  3.41748712e+03,
        3.43620599e+03,  1.02871783e+03,  2.41771001e+02, -2.04943635e+03,
       -6.57061988e+02,  2.24501264e+03, -1.38853575e+03,  8.29336225e+03,
        3.70765595e+03,  

In [9]:
# model = LinearRegression()
# model.fit(X_train_encoded, y_train)
# model.score(X_train_encoded, y_train)

0.6974914521523434

In [None]:
# y_train = y_train.to_numpy()

In [None]:
cv_results = cross_validate(model, X_train_encoded, y_train, 5, 42)
pd.DataFrame(cv_results)

In [None]:
model.predict(X_test_encoded)

In [None]:
model.score(X_test_encoded, y_test)