# Random Forest Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import Cleaner as cl

## Importing the dataset

In [2]:
dataset = cl.get_and_clean_df('../data/bilhandel_unclean.csv')

X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

display(dataset.columns)

0         Yaris
1           C30
2          E200
3          Vito
4         Viano
         ...   
1994       XC60
1995        UP!
1996    Dobbelt
1997       L200
1998         A3
Name: Model, Length: 1999, dtype: object

Index(['Årgang', 'Vægt', 'Tophastighed', 'Kilometer', 'Nypris', 'Cylindre',
       '0 - 100 km/t', 'Airbags', 'Grøn Ejerafgift', 'Lasteevne', 'Model',
       'Gearkasse', 'Hestekræfter', 'Biltype', 'Max. påhæng', 'Make',
       'Antal døre', 'Km/l', 'Brændstoftype', 'Antal gear', 'Pris'],
      dtype='object')

## Splitting the dataset into the Training set and Test set

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [10,11,13,15,18])], remainder='passthrough', sparse_threshold=0)
X = np.array(ct.fit_transform(X))
display(X.shape)

(1026, 267)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Random Forest Regression model on the whole dataset

In [5]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

## Predicting the Test set results

In [6]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[158.72000000000003 '109.900']
 [97.24999999999999 '44.995']
 [236.04000000000002 '237.900']
 [76.90579999999999 '129.900']
 [65.74 '70.000']
 [59.05999999999999 '64.900']
 [17.6794 '3.500']
 [270.0195 '1.995']
 [278.49 '2.995']
 [33.89999999999999 '24.900']
 [43.0699 '36.900']
 [208.0045 '209.900']
 [259.19000000000005 '319.900']
 [53.45 '49.900']
 [296.35583333333335 '329.900']
 [157.41000000000003 '139.900']
 [140.81 '129.900']
 [136.60000000000002 '134.900']
 [62.85 '54.800']
 [21.1299 '24.500']
 [47.73700000000001 '248.800']
 [391.9800000000001 '389.900']
 [52.48 '74.900']
 [99.17375 '127.900']
 [222.18850000000003 '200.000']
 [371.63 '319.900']
 [259.21900000000005 '229.800']
 [38.3085 '99.500']
 [16.979799999999997 '24.900']
 [182.26800000000003 '169.900']
 [46.0999 '59.900']
 [119.99000000000001 '129.900']
 [33.89999999999999 '37.900']
 [32.160000000000004 '25.000']
 [116.24010000000001 '144.700']
 [20.809500000000003 '11.900']
 [161.39000000000004 '169.900']
 [216.8245 '214.9

## Evaluating the Model Performance

In [7]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.4578069101758455