In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import randint

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


2024-09-08 13:03:59.593551: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Single seed reused for NumPy's global generator and for every
# `random_state=` argument in this notebook, so runs are repeatable.
RANDOM_STATE = 123_456_789
np.random.seed(seed=RANDOM_STATE)

# Analisi Dati

In [3]:
# Load the raw car listings; the path is relative to the notebook's working directory.
dataSet = pd.read_csv("./CarPrices.csv")

In [4]:
# Preview the first rows to check the column names and value formats.
dataSet.head()

Unnamed: 0,manufacturer,model,year,transmission,mileage,fuelType,mpg,engineSize,price
0,Audi,A1,2017,Manual,15735,Petrol,55.4,1.4,12500
1,Audi,A6,2016,Automatic,36203,Diesel,64.2,2.0,16500
2,Audi,A1,2016,Manual,29946,Petrol,55.4,1.4,11000
3,Audi,A4,2017,Automatic,25952,Diesel,67.3,2.0,16800
4,Audi,A3,2019,Manual,1998,Petrol,49.6,1.0,17300


In [5]:
# Dtypes and non-null counts per column: identifies which columns are
# categorical (object) and whether any values are missing.
dataSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41306 entries, 0 to 41305
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   manufacturer  41306 non-null  object 
 1   model         41306 non-null  object 
 2   year          41306 non-null  int64  
 3   transmission  41306 non-null  object 
 4   mileage       41306 non-null  int64  
 5   fuelType      41306 non-null  object 
 6   mpg           41306 non-null  float64
 7   engineSize    41306 non-null  float64
 8   price         41306 non-null  int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 2.8+ MB


In [6]:
# Frequency of each manufacturer (one of the columns one-hot encoded later).
dataSet["manufacturer"].value_counts()

manufacturer
Mercedes    13119
BMW         10781
Audi        10668
Toyota       6738
Name: count, dtype: int64

In [7]:
# Frequency of each model name.
# NOTE(review): the printed labels appear to mix values with a leading space
# (" C Class") and values without one ("180") — confirm, and consider
# normalising with .str.strip() before encoding.
dataSet["model"].value_counts()

model
 C Class     3747
 A Class     2561
 3 Series    2443
 Yaris       2122
 1 Series    1969
             ... 
 RS7            1
180             1
230             1
220             1
200             1
Name: count, Length: 95, dtype: int64

In [8]:
# Frequency of each registration year.
# NOTE(review): the output lists a single 1970 entry, far from the rest of
# the range — presumably an outlier or data-entry error; verify.
dataSet["year"].value_counts()

year
2019    12521
2017     8056
2016     6814
2018     4194
2015     3437
2020     2296
2014     1791
2013     1144
2012      303
2011      176
2010      129
2009       99
2008       79
2007       74
2006       42
2004       37
2005       34
2003       21
2002       21
2001       14
1998        7
2000        6
1999        6
1997        3
1996        1
1970        1
Name: count, dtype: int64

In [9]:
# Frequency of each transmission type (the "Other" category is very rare).
dataSet["transmission"].value_counts()

transmission
Semi-Auto    15359
Automatic    13778
Manual       12166
Other            3
Name: count, dtype: int64

In [10]:
# Frequency of each fuel type (some categories have only a handful of rows).
dataSet["fuelType"].value_counts()

fuelType
Diesel      22294
Petrol      16319
Hybrid       2542
Other         148
Electric        3
Name: count, dtype: int64

# Preparazione dei Dati

In [11]:
# Categorical columns to one-hot encode. The order of this list fixes the
# order of the generated indicator columns.
categoricalColumns = ["manufacturer", "transmission", "fuelType", "model"]

# sparse_output=False makes the encoder return a dense array.
encoder = OneHotEncoder(sparse_output=False)

column_transformer = ColumnTransformer(
    transformers=[("encoder", encoder, categoricalColumns)],
    # Columns not listed above are passed through unchanged and placed
    # after the encoded columns.
    remainder="passthrough",
)

# NOTE(review): the transformer is fitted on the full dataset (target column
# included in the pass-through), before any train/test split.
encodedValues = column_transformer.fit_transform(dataSet)

# The resulting frame has positional integer column labels, not names.
finalPrepared = pd.DataFrame(encodedValues)
finalPrepared

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,103,104,105,106,107,108,109,110,111,112
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2017.0,15735.0,55.4,1.4,12500.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2016.0,36203.0,64.2,2.0,16500.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2016.0,29946.0,55.4,1.4,11000.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2017.0,25952.0,67.3,2.0,16800.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2019.0,1998.0,49.6,1.0,17300.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41301,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2020.0,500.0,55.4,2.0,35999.0
41302,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2020.0,2500.0,55.4,2.0,24699.0
41303,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2019.0,11612.0,41.5,2.1,30999.0
41304,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2019.0,2426.0,45.6,2.0,37990.0


In [12]:
# The target (price) is the last column of the encoded frame: it is the last
# column of the source data and the pass-through columns keep their order
# after the one-hot columns. Selecting it by position instead of a hard-coded
# label keeps this cell correct when the number of one-hot categories changes.
y = finalPrepared.iloc[:, -1]
X = finalPrepared.iloc[:, :-1]

# Hold out 20% of the rows as a test set; seeded for a repeatable split.
trainSet_X, testSet_X, trainSet_y, testSet_y = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Utils

In [13]:
def scoreModel(model, set_X, set_y, cv) -> pd.Series:
    """Cross-validate ``model`` and return one RMSE value per fold.

    Parameters
    ----------
    model : estimator passed straight to ``cross_val_score``.
    set_X : feature matrix used for cross-validation.
    set_y : target values aligned with ``set_X``.
    cv : cross-validation setting forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    pd.Series
        Root mean squared error of each fold, in fold order.
    """
    foldNegMse = cross_val_score(
        model,
        set_X,
        set_y,
        scoring="neg_mean_squared_error",
        cv=cv,
    )
    # The scorer yields negated MSE values; flip the sign before the square root.
    foldRmse = np.sqrt(np.negative(foldNegMse))
    return pd.Series(foldRmse)

# Regressione Lineare

In [14]:
# Baseline: ordinary least-squares regression fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
linerRegressor = LinearRegression().fit(trainSet_X, trainSet_y)
linerRegressor

In [15]:
# Mean absolute error of the linear model on the held-out test split.
linearPredicted = linerRegressor.predict(testSet_X)
linearMAE = mean_absolute_error(y_true=testSet_y, y_pred=linearPredicted)
linearMAE

3050.590424637567

In [16]:
# Summary statistics of the per-fold RMSE from 10-fold cross-validation on the
# training split. These are RMSE values, so they are not directly comparable
# with the MAE computed on the test split.
scoreModel(linerRegressor, trainSet_X, trainSet_y, 10).describe()

count      10.000000
mean     4752.844239
std       351.815624
min      4330.849611
25%      4414.367478
50%      4793.154893
75%      5046.076796
max      5277.034669
dtype: float64

# Albero di Regressione

In [22]:
# Decision tree regressor, seeded so the fitted tree is repeatable.
# fit() returns the estimator itself, so construction and fitting can be chained.
treeRegressor = DecisionTreeRegressor(random_state=RANDOM_STATE).fit(
    trainSet_X, trainSet_y
)
treeRegressor

In [23]:
# Mean absolute error of the decision tree on the held-out test split.
treePredictions = treeRegressor.predict(testSet_X)
treeMAE = mean_absolute_error(y_true=testSet_y, y_pred=treePredictions)
treeMAE

1787.9193049218097

In [19]:
# Summary statistics of the per-fold RMSE from 10-fold cross-validation on the
# training split. These are RMSE values, so they are not directly comparable
# with the MAE computed on the test split.
scoreModel(treeRegressor, trainSet_X, trainSet_y, 10).describe()

count      10.000000
mean     3066.826223
std       174.522598
min      2884.308502
25%      2953.899927
50%      3028.157371
75%      3117.128315
max      3446.715397
dtype: float64

# Random Forest

In [20]:
# Random forest with library-default hyperparameters, seeded for repeatability.
# fit() returns the estimator itself, so construction and fitting can be chained.
randomForestRegressor = RandomForestRegressor(random_state=RANDOM_STATE).fit(
    trainSet_X, trainSet_y
)
randomForestRegressor

In [21]:
# Mean absolute error of the random forest on the held-out test split.
randomForestPredictions = randomForestRegressor.predict(testSet_X)
randomForestMAE = mean_absolute_error(
    y_true=testSet_y, y_pred=randomForestPredictions
)
randomForestMAE

1463.004048723741

In [108]:
# Summary statistics of the per-fold RMSE from 10-fold cross-validation on the
# training split. These are RMSE values, so they are not directly comparable
# with the MAE computed on the test split.
scoreModel(randomForestRegressor, trainSet_X, trainSet_y, 10).describe()

count      10.000000
mean     2463.604983
std       191.608673
min      2247.155980
25%      2331.323039
50%      2455.201596
75%      2492.974672
max      2911.185951
dtype: float64