### Importing libraries

In [25]:
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

## Data Preparation

### Original state of data

In [26]:
# Read the labelled (training) and unlabelled (testing) ads
data_train = pd.read_csv("data/sales_ads_train.csv")
data_test = pd.read_csv("data/sales_ads_test.csv")

# Keep the test IDs aside; they are needed again when the predictions are written out
test_id = data_test["ID"].copy()

# Stack both sets (training rows first) so they can be prepared in a single pass
data = pd.concat([data_train, data_test])

# Print a header followed by the column summary of each frame
for title, frame in (
    ("Unprepared Training dataset: \n", data_train),
    ("Unprepared Testing dataset: \n", data_test),
    ("Combined dataset: \n", data),
):
    print(title)
    frame.info()

Unprepared Training dataset: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135397 entries, 0 to 135396
Data columns (total 25 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   ID                          135397 non-null  int64  
 1   Cena                        135397 non-null  int64  
 2   Waluta                      132021 non-null  object 
 3   Stan                        132075 non-null  object 
 4   Marka_pojazdu               132046 non-null  object 
 5   Model_pojazdu               132088 non-null  object 
 6   Wersja_pojazdu              87336 non-null   object 
 7   Generacja_pojazdu           93737 non-null   object 
 8   Rok_produkcji               131990 non-null  float64
 9   Przebieg_km                 131394 non-null  float64
 10  Moc_KM                      131664 non-null  float64
 11  Pojemnosc_cm3               130711 non-null  float64
 12  Rodzaj_paliwa               131987 non-nu

In [29]:
# Cardinality of every object-dtype column in the combined dataset
print("Number of unique values: \n")
object_columns = data.select_dtypes(include=['object'])
nunique = object_columns.nunique()
print(nunique)

Number of unique values: 

Waluta                             2
Stan                               2
Marka_pojazdu                    108
Model_pojazdu                   1201
Wersja_pojazdu                 18938
Generacja_pojazdu                569
Rodzaj_paliwa                      8
Naped                              5
Skrzynia_biegow                    2
Typ_nadwozia                       9
Kolor                             14
Kraj_pochodzenia                  37
Pierwszy_wlasciciel                1
Data_pierwszej_rejestracji      8408
Data_publikacji_oferty            41
Lokalizacja_oferty             13574
Wyposazenie                   174524
dtype: int64


### Data Preparation method

In [3]:
def data_preparation(data):
    """Turn the raw ads table into a fully numeric/boolean feature frame.

    Steps, in order:
      1. drop the ``ID`` column,
      2. convert ``Pierwszy_wlasciciel`` to a boolean flag,
      3. extract the last four characters (the year) of the registration and
         publication dates into ``Rok_rejestracji`` / ``Rok_oferty``,
      4. drop the high-cardinality text columns, ``Kolor`` and ``Emisja_CO2``,
      5. one-hot encode every remaining categorical column.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw ads. Must contain every column that is referenced below.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified. Any other column
        (e.g. ``Cena``) is passed through unchanged.

    Raises
    ------
    KeyError
        If one of the referenced columns is missing from ``data``.
    """
    data_prepared = data.copy()

    # Dropping the ID column
    data_prepared = data_prepared.drop("ID", axis=1)

    # Changing "Pierwszy_wlasciciel" type to bool.
    # Only an explicit 'Yes' counts as True; missing values and any other value become
    # False. (Casting the raw strings with astype('bool') would turn every non-empty
    # string, e.g. 'No', into True.)
    data_prepared["Pierwszy_wlasciciel"] = data_prepared["Pierwszy_wlasciciel"].isin(["Yes"])

    # Extracting year when car was registered and when the offer was posted.
    # The years stay strings, so get_dummies below one-hot encodes them.
    data_prepared["Rok_rejestracji"] = data_prepared["Data_pierwszej_rejestracji"].str.slice(-4)
    data_prepared["Rok_oferty"] = data_prepared["Data_publikacji_oferty"].str.slice(-4)

    # Dropping categorical columns that are too varied
    data_prepared = data_prepared.drop(['Model_pojazdu',
       'Wersja_pojazdu', 'Generacja_pojazdu', 'Data_pierwszej_rejestracji',
       'Data_publikacji_oferty', 'Lokalizacja_oferty', 'Wyposazenie'], axis=1)

    # Dropping columns that seem to not affect the price of a car
    data_prepared = data_prepared.drop('Kolor', axis=1)

    # Dropping columns with too many missing values
    data_prepared = data_prepared.drop('Emisja_CO2', axis=1)

    # Changing categorical attributes to bool (one indicator column per category)
    data_prepared = pd.get_dummies(data_prepared)

    return data_prepared
    

### Prepared Data

In [4]:
# Prepare the combined (train + test) frame in one call; the rows are separated again in a later cell
data_prepared = data_preparation(data)

In [5]:
# The combined frame was built training-rows-first, so the first
# len(data_train) rows are the labelled ads and the remainder is the test set.
max_train_id = len(data_train)
data_train_prepared = data_prepared.iloc[:max_train_id]
X_test = data_prepared.iloc[max_train_id:].drop(columns="Cena")

data_train_prepared.info()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 135397 entries, 0 to 135396
Columns: 333 entries, Cena to Rok_oferty_2021
dtypes: bool(327), float64(6)
memory usage: 49.5 MB
<class 'pandas.core.frame.DataFrame'>
Index: 72907 entries, 0 to 72906
Columns: 332 entries, Rok_produkcji to Rok_oferty_2021
dtypes: bool(327), float64(5)
memory usage: 26.1 MB


### Splitting the dataset

In [6]:
# Feature matrix: every prepared column except the price
X = data_train_prepared.drop(columns="Cena")
# Target attribute: the price
y = data_train_prepared["Cena"]

In [7]:
# Hold out 20% of the labelled rows for evaluation (fixed seed, so the split is repeatable)
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_evaluate, y_train, y_evaluate = split

## Training Models

### Random Forest Regressor

#### Finding best parameters using Randomised Search

In [8]:
# Seed the forest so that repeated searches score the same candidates identically.
rfr_model = RandomForestRegressor(random_state=42)
params = {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]}

# The grid holds only 3 x 3 = 9 combinations. Asking for more iterations than that
# makes scikit-learn emit a warning and run all 9 anyway, so request exactly 9.
rfr_search = RandomizedSearchCV(
    rfr_model,
    param_distributions=params,
    n_iter=9,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Tune on the training split only: X_evaluate / y_evaluate must stay unseen during
# hyperparameter selection, otherwise the evaluation score is optimistically biased.
rfr_search = rfr_search.fit(X_train, y_train)

In [9]:
print(f"Best parameters: {rfr_search.best_params_}")
best_n_estimators = rfr_search.best_params_["n_estimators"]
best_depth = rfr_search.best_params_["max_depth"]

# Found parameters:
#best_n_estimators= 200
#best_depth = None

# Rebuild an unfitted forest with the selected hyperparameters. The seed makes the
# fitted model (and therefore its evaluation score) reproducible between runs.
rfr_model = RandomForestRegressor(
    n_estimators=best_n_estimators,
    max_depth=best_depth,
    random_state=42,
)

In [10]:
evaluating_model = rfr_model.fit(X_train, y_train)

# Score the forest on the hold-out split right away: `evaluating_model` is reassigned
# when the gradient boosting model is fitted further down, so without this the forest
# would be trained but never compared against it.
rfr_pred = evaluating_model.predict(X_evaluate)
rfr_rmse = (((rfr_pred - y_evaluate) ** 2).mean()) ** 0.5
print(f"Random forest RMSE on the evaluation split: {rfr_rmse}")

### Hist Gradient Boosting Regression

#### Finding the best parameters using Grid Search

In [11]:
# Seeded so each candidate is scored reproducibly (on large datasets the estimator
# holds out a random validation subset for early stopping by default).
hgbr_model = HistGradientBoostingRegressor(random_state=42)
params = {
    'max_depth': [None, 5, 10],
    'learning_rate': [0.001, 0.01, 0.1],
    'max_iter': [20, 100, 400],
    'max_leaf_nodes': [50, 100, 200],
}
hgbr_search = GridSearchCV(hgbr_model, param_grid=params, cv=3, n_jobs=-1)

# Tune on the training split only: X_evaluate / y_evaluate must stay unseen during
# hyperparameter selection, otherwise the evaluation score is optimistically biased.
hgbr_search = hgbr_search.fit(X_train, y_train)


In [12]:
print(f"Best parameters: {hgbr_search.best_params_}")
best_learning_rate = hgbr_search.best_params_["learning_rate"]
best_depth = hgbr_search.best_params_["max_depth"]
best_iter = hgbr_search.best_params_["max_iter"]
# The key must match the name used in the parameter grid ("max_leaf_nodes", single
# underscores); the former "max__leaf_nodes" lookup raised a KeyError.
best_nodes = hgbr_search.best_params_["max_leaf_nodes"]

# Found parameters:
#best_learning_rate = 0.1
#best_depth = None
#best_iter = 400
#best_nodes = 100

In [13]:
# Gradient boosting model configured with the hyperparameters selected above
hgbr_model = HistGradientBoostingRegressor(
    learning_rate=best_learning_rate,
    max_depth=best_depth,
    max_iter=best_iter,
    max_leaf_nodes=best_nodes,
    random_state=42,
    verbose=2,
)

# fit() returns the estimator itself, so both names refer to the same fitted model
hgbr_model.fit(X_train, y_train)
evaluating_model = hgbr_model

Binning 0.259 GB of training data: 2.675 s
Binning 0.029 GB of validation data: 0.025 s
Fitting gradient boosted rounds:
[1/400] 1 tree, 100 leaves, max depth = 10, train loss: 3323974589.26845, val loss: 3003973207.17525, in 0.206s
[2/400] 1 tree, 100 leaves, max depth = 11, train loss: 2846101215.69238, val loss: 2542048230.67780, in 0.236s
[3/400] 1 tree, 100 leaves, max depth = 10, train loss: 2457310873.18022, val loss: 2171984319.08590, in 0.192s
[4/400] 1 tree, 100 leaves, max depth = 10, train loss: 2139484790.47588, val loss: 1874624790.26282, in 0.216s
[5/400] 1 tree, 100 leaves, max depth = 11, train loss: 1879730712.26797, val loss: 1628245380.85054, in 0.199s
[6/400] 1 tree, 100 leaves, max depth = 10, train loss: 1663839749.73758, val loss: 1429323939.67644, in 0.204s
[7/400] 1 tree, 100 leaves, max depth = 13, train loss: 1484591623.63153, val loss: 1263782150.02387, in 0.197s
[8/400] 1 tree, 100 leaves, max depth = 13, train loss: 1335981994.45687, val loss: 1129058498.

### Evaluating the model

In [14]:
# Root mean squared error of the most recently fitted `evaluating_model` on the
# hold-out split (numpy is imported together with the other libraries at the top).
y_pred = evaluating_model.predict(X_evaluate)
RMSE = np.sqrt(((y_pred - y_evaluate) ** 2).mean())
print(f"The result of an evaluation: {RMSE}")


The result of an evaluation: 27528.563857948884


### Testing the final model

In [15]:
# Hyperparameters of the final model, written out explicitly so this cell can be run
# without repeating the grid search
final_params = {
    "learning_rate": 0.1,
    "max_depth": None,
    "max_iter": 400,
    "max_leaf_nodes": 100,
    "random_state": 42,
    "verbose": 2,
}

# Train on every labelled row, then predict prices for the unlabelled test ads
final_model = HistGradientBoostingRegressor(**final_params)
final_model.fit(X, y)
y_pred = final_model.predict(X_test)

Binning 0.324 GB of training data: 4.908 s
Binning 0.036 GB of validation data: 0.191 s
Fitting gradient boosted rounds:
[1/400] 1 tree, 100 leaves, max depth = 11, train loss: 3311354233.55629, val loss: 3115133067.68778, in 0.358s
[2/400] 1 tree, 100 leaves, max depth = 11, train loss: 2834730986.56259, val loss: 2640393678.65048, in 0.493s
[3/400] 1 tree, 100 leaves, max depth = 12, train loss: 2444732315.09709, val loss: 2253640778.08848, in 0.482s
[4/400] 1 tree, 100 leaves, max depth = 10, train loss: 2126626309.46838, val loss: 1936396728.74527, in 0.342s
[5/400] 1 tree, 100 leaves, max depth = 11, train loss: 1861989143.12945, val loss: 1681608424.97639, in 0.270s
[6/400] 1 tree, 100 leaves, max depth = 11, train loss: 1643777076.86404, val loss: 1462811823.38445, in 0.250s
[7/400] 1 tree, 100 leaves, max depth = 12, train loss: 1461622109.77945, val loss: 1276614239.46544, in 0.237s
[8/400] 1 tree, 100 leaves, max depth = 12, train loss: 1310666506.27238, val loss: 1128065907.

In [16]:
# Pair every test ID with its predicted price
result = test_id.to_frame(name="ID").assign(Cena=y_pred)
print(result)

          ID           Cena
0          1  218548.199148
1          2   16558.775377
2          3   21161.558656
3          4   95835.893940
4          5   96063.159750
...      ...            ...
72902  72903  112090.495829
72903  72904   28589.505268
72904  72905   63512.810018
72905  72906   23419.491264
72906  72907  184498.921099

[72907 rows x 2 columns]


### Saving the result

In [17]:
# Write the predictions as a comma-separated file, without the DataFrame index column
result.to_csv("result.csv", sep=",", index=False)