In [None]:
# Mount Google Drive inside the Colab runtime so files under /content/gdrive can be read.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [None]:
def get_mae_tree(max,Tx,Vx,Ty,Vy):
  """Fit a decision tree on the training split and score it on the validation split.

  Args:
    max: value forwarded as ``max_leaf_nodes`` to ``DecisionTreeRegressor``.
    Tx, Ty: training features and training targets.
    Vx, Vy: validation features and validation targets.

  Returns:
    The mean absolute error of the tree's predictions for ``Vx`` against ``Vy``.
  """
  # random_state is pinned to 1, as in every other call site of this helper.
  tree = DecisionTreeRegressor(random_state=1, max_leaf_nodes=max).fit(Tx, Ty)
  return mean_absolute_error(Vy, tree.predict(Vx))

def get_mae_forest(n,max,Tx,Vx,Ty,Vy):
  """Fit a random forest on the training split and score it on the validation split.

  Args:
    n: number of trees (``n_estimators``).
    max: value forwarded as ``max_leaf_nodes`` to each tree.
    Tx, Ty: training features and training targets.
    Vx, Vy: validation features and validation targets.

  Returns:
    The mean absolute error of the forest's predictions for ``Vx`` against ``Vy``.
  """
  # random_state is pinned to 0 so the bootstrap/feature sampling is repeatable.
  forest = RandomForestRegressor(random_state=0, n_estimators=n, max_leaf_nodes=max).fit(Tx, Ty)
  return mean_absolute_error(Vy, forest.predict(Vx))

In [None]:
# Load the car dataset from the mounted Drive folder and preview its first rows.
df=pd.read_csv("/content/gdrive/MyDrive/material/cars.csv")
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [None]:
# Print column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5

In [None]:
# Names of the columns stored with dtype "object" (the text/categorical ones).
object_columns = list(df.columns[df.dtypes == "object"])
object_columns

['Make',
 'Model',
 'Engine Fuel Type',
 'Transmission Type',
 'Driven_Wheels',
 'Market Category',
 'Vehicle Size',
 'Vehicle Style']

In [None]:
# Report how many distinct values each object-typed column holds.
for col_name in object_columns:
  distinct = df[col_name].nunique()
  print(f'\'{col_name}\' number of unique values = {distinct}')

'Make' number of unique values = 48
'Model' number of unique values = 915
'Engine Fuel Type' number of unique values = 10
'Transmission Type' number of unique values = 5
'Driven_Wheels' number of unique values = 4
'Market Category' number of unique values = 71
'Vehicle Size' number of unique values = 3
'Vehicle Style' number of unique values = 16


In [None]:
# Drop the three text columns with the most distinct values (48, 915 and 71 per the
# cardinality report) rather than encoding them.
# Rebinding `df` instead of inplace=True, together with errors="ignore", keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=['Make','Model','Market Category'], errors="ignore")


In [None]:
# Re-inspect dtypes and non-null counts after the column drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               11914 non-null  int64  
 1   Engine Fuel Type   11911 non-null  object 
 2   Engine HP          11845 non-null  float64
 3   Engine Cylinders   11884 non-null  float64
 4   Transmission Type  11914 non-null  object 
 5   Driven_Wheels      11914 non-null  object 
 6   Number of Doors    11908 non-null  float64
 7   Vehicle Size       11914 non-null  object 
 8   Vehicle Style      11914 non-null  object 
 9   highway MPG        11914 non-null  int64  
 10  city mpg           11914 non-null  int64  
 11  Popularity         11914 non-null  int64  
 12  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5), object(5)
memory usage: 1.2+ MB


In [None]:
# Every column whose dtype is not "object"; note that this includes the target 'MSRP'.
numaric_columns = [name for name, dtype in df.dtypes.items() if dtype != "object"]
numaric_columns

['Year',
 'Engine HP',
 'Engine Cylinders',
 'Number of Doors',
 'highway MPG',
 'city mpg',
 'Popularity',
 'MSRP']

In [None]:
# Impute the missing numeric values with hand-picked constants.
# One DataFrame.fillna call with a per-column mapping replaces the chained
# `df[col].fillna(..., inplace=True)` form, which pandas deprecates and which no
# longer modifies `df` once copy-on-write is active.
df = df.fillna({
  "Engine HP": 220,         #mean()249 #mode() 200
  "Engine Cylinders": 5,    #value_counts() 4.0 >>4752 || 6.0>>4489    #mean() 5.628828677213059   #mode() 4.0
  "Number of Doors": 3.4,   #.value_counts() 4.0>> 8353 || 2.0>> 3160   #mean 3.4360933 #mode 4.0
})

In [None]:
# Baseline: train on the numeric columns only, with the target removed from the feature list.
l = [name for name in numaric_columns if name != "MSRP"]
X, y = df[l], df["MSRP"]
Tx,Vx,Ty,Vy = train_test_split(X, y, train_size=.8, test_size=.2, random_state=1)

print(get_mae_tree(918,Tx,Vx,Ty,Vy))         # old >>  3892.487678844498
print(get_mae_forest(181,920,Tx,Vx,Ty,Vy))   # old >> 3589.1110486496896  170,918
# Scratch notes kept from earlier runs -- presumably an MAE followed by the fill values
# tried for 1) Engine HP 2) Engine Cylinders 3) Number of Doors (TODO confirm):
# 3540 1)200 2)6 3)4
# 3553 1)249 2)6 3)4
# 3539.55 1)200 2)4 3)4


3979.258219127734
3574.677979144003


In [None]:
# Recompute the object-typed columns that remain and print their cardinality.
object_columns = [name for name, dtype in df.dtypes.items() if dtype == "object"]
for name in object_columns:
  print(f'\'{name}\' number of unique values = {df[name].nunique()}')

'Engine Fuel Type' number of unique values = 10
'Transmission Type' number of unique values = 5
'Driven_Wheels' number of unique values = 4
'Vehicle Size' number of unique values = 3
'Vehicle Style' number of unique values = 16


In [None]:
# For each object-typed column show its cardinality, its null count and the distinct values.
object_columns = [name for name, dtype in df.dtypes.items() if dtype == "object"]
for name in object_columns:
  column = df[name]
  print(f' \'{name}\' number of unique = {column.nunique()} , null values = {column.isnull().sum()} \n  unique values = {column.unique()} \n')

 'Engine Fuel Type' number of unique = 10 , null values = 3 
  unique values = ['premium unleaded (required)' 'regular unleaded'
 'premium unleaded (recommended)' 'flex-fuel (unleaded/E85)' 'diesel'
 'electric' 'flex-fuel (premium unleaded recommended/E85)' 'natural gas'
 'flex-fuel (premium unleaded required/E85)'
 'flex-fuel (unleaded/natural gas)' nan] 

 'Transmission Type' number of unique = 5 , null values = 0 
  unique values = ['MANUAL' 'AUTOMATIC' 'AUTOMATED_MANUAL' 'DIRECT_DRIVE' 'UNKNOWN'] 

 'Driven_Wheels' number of unique = 4 , null values = 0 
  unique values = ['rear wheel drive' 'front wheel drive' 'all wheel drive'
 'four wheel drive'] 

 'Vehicle Size' number of unique = 3 , null values = 0 
  unique values = ['Compact' 'Midsize' 'Large'] 

 'Vehicle Style' number of unique = 16 , null values = 0 
  unique values = ['Coupe' 'Convertible' 'Sedan' 'Wagon' '4dr Hatchback' '2dr Hatchback'
 '4dr SUV' 'Passenger Minivan' 'Cargo Minivan' 'Crew Cab Pickup'
 'Regular Cab Pick

In [None]:
# Fill the missing fuel types with the constant 'regular unleaded'.
# Assigning the result back avoids the chained inplace fillna, which pandas
# deprecates and which stops updating `df` under copy-on-write.
df["Engine Fuel Type"] = df["Engine Fuel Type"].fillna('regular unleaded')

# Separate target and features without mutating `df`: the previous in-place drop of
# 'MSRP' made this cell raise KeyError when executed a second time. The unused
# column list `l` (which repeated entries and still contained the target) is gone.
y = df["MSRP"]
X = df.drop(columns="MSRP")

In [None]:
# 80/20 train/validation split; random_state=12 fixes which rows land in each part.
Tx,Vx,Ty,Vy=train_test_split(X,y,train_size=.8,test_size=.2,random_state=12)

**Ordinal Encoding**

In [None]:
# Experiment: keep only two of the five categorical columns, ordinal-encoded.
ordinal_encoder = OrdinalEncoder()
categorical_cols = ["Driven_Wheels","Engine Fuel Type","Transmission Type","Vehicle Size","Vehicle Style"]
object_col = ["Engine Fuel Type","Vehicle Size"]

# Start from the splits with every categorical column removed (new frames, so
# Tx / Vx themselves stay untouched).
label_X_train = Tx.drop(columns=categorical_cols)
label_X_valid = Vx.drop(columns=categorical_cols)

# The encoder is fitted on the training split only and then applied to validation.
label_X_train[object_col] = ordinal_encoder.fit_transform(Tx[object_col])
label_X_valid[object_col] = ordinal_encoder.transform(Vx[object_col])

print(get_mae_forest(179,955,label_X_train, label_X_valid, Ty, Vy))

3239.7271033747684


**One Hot Encoding**

In [None]:
# Same 80/20 split (random_state=12) as the ordinal-encoding experiment.
Tx,Vx,Ty,Vy=train_test_split(X,y,train_size=.8,test_size=.2,random_state=12)

# Dense one-hot encoder. scikit-learn 1.2 renamed `sparse` to `sparse_output` and
# later removed the old keyword, so try the new spelling first and fall back to
# the old one on versions that do not know it.
try:
  OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
  OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

object_cols = ["Transmission Type","Vehicle Style","Driven_Wheels"]#5 , 16
# Fit on the training split only. The encoder returns a bare array, so the row
# index is restored here to let concat align rows correctly.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(Tx[object_cols]), index=Tx.index)
OH_cols_valid = pd.DataFrame(OH_encoder.transform(Vx[object_cols]), index=Vx.index)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = Tx.drop(object_cols, axis=1)
num_X_valid = Vx.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# The two remaining text columns are ordinal-encoded. A fresh encoder is created
# here so the cell no longer depends on one left over from an earlier cell.
ordinal_cols = ["Engine Fuel Type","Vehicle Size"]
ordinal_encoder = OrdinalEncoder()
OH_X_train[ordinal_cols] = ordinal_encoder.fit_transform(Tx[ordinal_cols])
OH_X_valid[ordinal_cols] = ordinal_encoder.transform(Vx[ordinal_cols])

# Drop the one-hot indicator columns labelled 0 and 24 (the first and the last of
# the columns the encoder produced for `object_cols`).
OH_X_train = OH_X_train.drop(columns=[0,24])
OH_X_valid = OH_X_valid.drop(columns=[0,24])

# scikit-learn requires feature names to be all strings; the one-hot columns are ints.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

# MAEs noted during earlier experiments: 3360.867269434654, 3072.291701366579
OH_X_train.shape



(9531, 32)

In [None]:
# Display the assembled training feature matrix.
OH_X_train

Unnamed: 0,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Number of Doors,Vehicle Size,highway MPG,city mpg,Popularity,1,...,14,15,16,17,18,19,20,21,22,23
1385,2016,7.0,220.0,4.0,4.0,2.0,28,21,3105,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2888,2010,3.0,621.0,12.0,2.0,2.0,19,12,520,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8677,2016,9.0,295.0,6.0,4.0,2.0,26,19,454,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1360,2004,9.0,140.0,4.0,4.0,2.0,31,21,26,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5016,2017,9.0,261.0,6.0,4.0,0.0,22,16,2009,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3714,2017,8.0,402.0,8.0,2.0,2.0,26,18,617,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7409,2008,9.0,200.0,6.0,4.0,1.0,22,15,1013,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3325,1994,9.0,160.0,6.0,4.0,2.0,26,17,26,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
9606,2017,4.0,285.0,6.0,2.0,1.0,22,17,1385,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Validation MAE of a single tree (max_leaf_nodes=1124) and of a forest
# (263 trees, max_leaf_nodes=1100) on the one-hot encoded features.
print(get_mae_tree(1124,OH_X_train, OH_X_valid, Ty, Vy))
print(get_mae_forest(263,1100,OH_X_train, OH_X_valid, Ty, Vy))#220 > 3006               250     3525983902155
# MAEs recorded from a previous run (tree, then forest):
#2968.740728815852
#2975.6889621760624

2968.740728815852
2975.6889621760624


In [None]:
def get_mae_tree(max,Tx,Vx,Ty,Vy):
  """Return the validation MAE of a decision tree limited to ``max`` leaf nodes.

  The tree (``random_state=1``) is trained on ``Tx``/``Ty`` and evaluated by
  comparing its predictions for ``Vx`` with ``Vy``.
  """
  regressor = DecisionTreeRegressor(random_state=1, max_leaf_nodes=max)
  regressor.fit(Tx, Ty)
  validation_error = mean_absolute_error(Vy, regressor.predict(Vx))
  return validation_error

def get_mae_forest(n,max,Tx,Vx,Ty,Vy):
  """Return the validation MAE of a random forest with ``n`` trees of at most ``max`` leaves.

  The forest (``random_state=0``) is trained on ``Tx``/``Ty`` and evaluated by
  comparing its predictions for ``Vx`` with ``Vy``.
  """
  regressor = RandomForestRegressor(random_state=0, max_leaf_nodes=max, n_estimators=n)
  regressor.fit(Tx, Ty)
  validation_error = mean_absolute_error(Vy, regressor.predict(Vx))
  return validation_error

In [None]:
# Exhaustive search over max_leaf_nodes for the single decision tree.
# The loop variable is `leaf_nodes`, not `max`: a top-level `for max in ...` rebinds
# the builtin max() for the rest of the notebook session.
# The sentinel is +inf and `best_node` is initialised, so a stale value from another
# cell can never be reported.
# Note: the MAE is measured on the same validation split used to choose the value,
# so it is an optimistic estimate.
lowest_mae = float("inf")
best_node = None
for leaf_nodes in range(800,1200):
  mae = get_mae_tree(leaf_nodes,OH_X_train,OH_X_valid,Ty,Vy)
  if mae < lowest_mae:
    lowest_mae = mae
    best_node = leaf_nodes
print(f'best leaf node = {best_node} , MAE = {lowest_mae}')

best leaf node = 1124 , MAE = 2968.740728815852


In [None]:
# Search the number of trees for the random forest at a fixed leaf-node budget.
# `leaf_node` is named once so the value passed to the model and the value printed
# cannot drift apart (it used to be hard-coded in both places).
# The sentinel is +inf and `best_tree` is initialised, so a stale value from another
# cell can never be reported.
lowest_mae = float("inf")
best_tree = None
leaf_node = 1100
for n in range(170,190):
  mae = get_mae_forest(n,leaf_node,OH_X_train,OH_X_valid,Ty,Vy)
  if mae < lowest_mae:
    lowest_mae = mae
    best_tree = n
print(f'best leaf node = {leaf_node} ,best number of tree = {best_tree}, MAE = {lowest_mae}')

best leaf node = 1100 ,best number of tree = 177, MAE = 2976.987196830112


In [None]:

# Load the data
df=pd.read_csv("/content/gdrive/MyDrive/material/cars.csv")

# Drop irrelevant columns
df = df.drop(['Market Category'], axis=1)

# Impute missing values with fixed constants (no rows are dropped).
# A single DataFrame.fillna with a per-column mapping replaces the chained
# `df[col].fillna(..., inplace=True)` calls, which pandas deprecates and which
# stop modifying `df` under copy-on-write.
df = df.fillna({
    "Engine HP": 220,            #mean()249 #mode() 200
    "Engine Cylinders": 5,       #value_counts() 4.0 >>4752 || 6.0>>4489    #mean() 5.628828677213059   #mode() 4.0
    "Number of Doors": 3.4,
    "Engine Fuel Type": 'regular unleaded',
})


# Manufacturer names grouped by region of origin.
# NOTE(review): the data reports 48 distinct makes but only 33 are listed here, so
# the remaining ones are labelled 'Other' -- confirm that this is intended.
japanese_cars = ['Acura', 'Honda', 'Infiniti', 'Lexus', 'Mazda', 'Mitsubishi', 'Nissan', 'Scion', 'Subaru', 'Suzuki', 'Toyota']
american_cars = ['Buick', 'Cadillac', 'Chevrolet', 'Chrysler', 'Dodge', 'Ford', 'GMC', 'Jeep', 'Lincoln', 'Ram']
european_cars = ['Audi', 'BMW', 'Fiat', 'Jaguar', 'Land Rover', 'Mercedes-Benz', 'MINI', 'Porsche', 'Saab', 'Smart', 'Volkswagen', 'Volvo']

def encode_car_make(make):
    """Collapse a manufacturer name into a coarse region label.

    This only groups the makes; the one-hot encoding of the label happens later.

    Args:
        make: manufacturer name as found in the 'Make' column.

    Returns:
        'Japanese', 'American' or 'European' when ``make`` appears in the matching
        list above (checked in that order), otherwise 'Other'.
    """
    region_groups = (
        ('Japanese', japanese_cars),
        ('American', american_cars),
        ('European', european_cars),
    )
    for region, makes in region_groups:
        if make in makes:
            return region
    return 'Other'


#OrdinalAtt=["Engine Fuel Type","Vehicle Size"]
#Non_OrdinalAtt=["Transmission Type","Vehicle Style","Driven_Wheels","Make"]

def _new_dense_one_hot():
    """Build a dense-output OneHotEncoder on both old and new scikit-learn versions."""
    try:
        # `sparse_output` is the keyword from scikit-learn 1.2 onwards.
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # Older releases only know the original `sparse` keyword.
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


# Derive the coarse region label for every row.
df['Car Make Group'] = df['Make'].apply(encode_car_make)

# One-hot encode the region label. The indicator columns get a 'make_' prefix:
# with the default integer labels they collided with the labels 0..3 of the second
# one-hot block below and left duplicate column names in the frame.
one_hot_enc = _new_dense_one_hot()
one_hot_cols = pd.DataFrame(
    one_hot_enc.fit_transform(df[['Car Make Group']]), index=df.index
).add_prefix('make_')
df = pd.concat([df, one_hot_cols], axis=1)

# Use ordinal encoding for 'Transmission Type' and 'Vehicle Size'
ordinal_enc = OrdinalEncoder()
df[['Transmission Type', 'Vehicle Size']] = ordinal_enc.fit_transform(df[['Transmission Type', 'Vehicle Size']])

# Use one-hot encoding for the other categorical features ('cat_' prefix keeps
# these labels distinct from the 'make_' block).
categorical_cols = [ 'Engine Fuel Type', 'Driven_Wheels', 'Vehicle Style']
one_hot_enc = _new_dense_one_hot()
one_hot_cols = pd.DataFrame(
    one_hot_enc.fit_transform(df[categorical_cols]), index=df.index
).add_prefix('cat_')
df = df.drop(categorical_cols, axis=1)
df = pd.concat([df, one_hot_cols], axis=1)

# NOTE: unlike the earlier experiments, the encoders here are fitted on the whole
# frame before the train/test split.

# Split the data into features and target
X = df.drop(['MSRP', 'Make', 'Car Make Group','Model'], axis=1)
y = df['MSRP']

X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=.8 ,test_size=0.2, random_state=42)
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)
X_train



Unnamed: 0,Year,Engine HP,Engine Cylinders,Transmission Type,Number of Doors,Vehicle Size,highway MPG,city mpg,Popularity,0,...,20,21,22,23,24,25,26,27,28,29
3181,2016,265.0,4.0,1.0,4.0,1.0,31,22,1624,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5357,2017,449.0,8.0,1.0,4.0,1.0,18,14,617,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4874,2016,173.0,4.0,1.0,2.0,0.0,34,25,1720,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8102,1993,180.0,6.0,3.0,2.0,1.0,16,11,1851,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10400,2008,172.0,6.0,1.0,2.0,0.0,24,17,1439,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,2014,181.0,4.0,1.0,4.0,2.0,26,20,2031,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5191,2009,219.0,6.0,1.0,4.0,2.0,26,17,210,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5390,2016,220.0,4.0,0.0,2.0,0.0,33,25,873,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,2009,260.0,4.0,1.0,4.0,2.0,27,17,376,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Validation MAE of a tree (max_leaf_nodes=1110) and a forest (180 trees,
# max_leaf_nodes=1100) on the features that include the car-make group.
print(get_mae_tree(1110,X_train, X_test, y_train, y_test))
print(get_mae_forest(180,1100,X_train, X_test, y_train, y_test))

3128.249061920204
3042.2514603287705


In [None]:
# Exhaustive search over max_leaf_nodes for the decision tree on the make-group features.
# The loop variable is `leaf_nodes`, not `max`: a top-level `for max in ...` rebinds
# the builtin max() for the rest of the notebook session.
# The sentinel is +inf and `best_node` is initialised, so a stale value from another
# cell can never be reported.
lowest_mae = float("inf")
best_node = None
for leaf_nodes in range(800,1200):
  mae = get_mae_tree(leaf_nodes,X_train,X_test,y_train,y_test)
  if mae < lowest_mae:
    lowest_mae = mae
    best_node = leaf_nodes
print(f'best leaf node = {best_node} , MAE = {lowest_mae}')

best leaf node = 1004 , MAE = 3119.8306073121903


In [None]:
# Search the number of trees for the random forest at a fixed leaf-node budget.
# The sentinel is +inf and `best_tree` is initialised here, so the message can never
# report a `best_tree` left over from an earlier search cell.
lowest_mae = float("inf")
best_tree = None
leaf_node = 1000
for n in range(177,190):
  mae = get_mae_forest(n,leaf_node,X_train,X_test,y_train,y_test)
  if mae < lowest_mae:
    lowest_mae = mae
    best_tree = n
print(f'best leaf node = {leaf_node} ,best number of tree = {best_tree}, MAE = {lowest_mae}')

best leaf node = 1000 ,best number of tree = 184, MAE = 2995.2372131133748


In [None]:
#imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from google.colab import drive

#functions
def get_mae_tree(max,Tx,Vx,Ty,Vy):
  """Train a DecisionTreeRegressor and report its mean absolute error.

  ``max`` caps the number of leaf nodes; ``Tx``/``Ty`` are the training features
  and targets, ``Vx``/``Vy`` the held-out features and targets used for scoring.
  """
  settings = {"max_leaf_nodes": max, "random_state": 1}
  fitted = DecisionTreeRegressor(**settings).fit(Tx, Ty)
  return mean_absolute_error(Vy, fitted.predict(Vx))

def get_mae_forest(n,max,Tx,Vx,Ty,Vy):
  """Train a RandomForestRegressor and report its mean absolute error.

  ``n`` is the number of trees and ``max`` caps the leaf nodes per tree;
  ``Tx``/``Ty`` are the training features and targets, ``Vx``/``Vy`` the held-out
  features and targets used for scoring.
  """
  settings = {"n_estimators": n, "max_leaf_nodes": max, "random_state": 0}
  fitted = RandomForestRegressor(**settings).fit(Tx, Ty)
  return mean_absolute_error(Vy, fitted.predict(Vx))

#connect with drive
drive.mount('/content/gdrive')

#read file
df=pd.read_csv("/content/gdrive/MyDrive/material/cars.csv")

#split the columns into numeric & non-numeric
object_columns = [c for c in df.columns if df[c].dtype == "object"]
numaric_columns = [c for c in df if df[c].dtype != "object" ]

#quick look at the missing values of the numeric columns
for c in numaric_columns :
  print(f'\'{c}\' null values = {df[c].isnull().sum()}')

#impute the numeric gaps with fixed constants.
#One DataFrame.fillna with a per-column mapping replaces the chained
#`df[col].fillna(..., inplace=True)` calls, which pandas deprecates and which
#stop modifying `df` under copy-on-write.
df = df.fillna({
  "Engine HP": 249,         #mean()249 #mode() 200
  "Engine Cylinders": 5,    #value_counts() 4.0 >>4752 || 6.0>>4489    #mean() 5.628828677213059   #mode() 4.0
  "Number of Doors": 3.4,   #.value_counts() 4.0>> 8353 || 2.0>> 3160   #mean 3.4360933 #mode 4.0
})

print()
#cardinality and missing values of the non-numeric columns
#(reported before the columns below are dropped or filled)
for c in object_columns :
  print(f'\'{c}\' number of unique values = {df[c].nunique()} , null values = {df[c].isnull().sum()}')

#dropping & preprocessing (rebinding `df` instead of mutating it in place)
df = df.drop(columns=["Model","Market Category"])
df["Engine Fuel Type"] = df["Engine Fuel Type"].fillna('regular unleaded')

#split the non-numeric columns into ordinal & nominal:
OrdinalAtt=["Engine Fuel Type","Vehicle Size"]
Non_OrdinalAtt=["Transmission Type","Vehicle Style","Driven_Wheels","Make"]

print("\nINFO :\n")
df.info()
df.shape

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
'Year' null values = 0
'Engine HP' null values = 69
'Engine Cylinders' null values = 30
'Number of Doors' null values = 6
'highway MPG' null values = 0
'city mpg' null values = 0
'Popularity' null values = 0
'MSRP' null values = 0

'Make' number of unique values = 48 , null values = 0
'Model' number of unique values = 915 , null values = 0
'Engine Fuel Type' number of unique values = 10 , null values = 3
'Transmission Type' number of unique values = 5 , null values = 0
'Driven_Wheels' number of unique values = 4 , null values = 0
'Market Category' number of unique values = 71 , null values = 3742
'Vehicle Size' number of unique values = 3 , null values = 0
'Vehicle Style' number of unique values = 16 , null values = 0

INFO :

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 14 columns):
 #   Column            

(11914, 14)

In [None]:
#Split to train and test (80/20, fixed seed)
y = df["MSRP"]
X = df.drop("MSRP",axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=.8 ,test_size=0.2, random_state=12)

#one hot encoding of the nominal columns.
#scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the old
#keyword, so try the new spelling first and fall back on older versions.
try:
  OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
  OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training split only. The encoder returns a bare array, so the row
# index is restored here to let concat align rows correctly.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[Non_OrdinalAtt]), index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_test[Non_OrdinalAtt]), index=X_test.index)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(Non_OrdinalAtt, axis=1)
num_X_valid = X_test.drop(Non_OrdinalAtt, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# Ordinal-encode the two ordinal columns (encoder fitted on the training split).
ordinalencoder = OrdinalEncoder()
OH_X_train[OrdinalAtt] = ordinalencoder.fit_transform(X_train[OrdinalAtt])
OH_X_valid[OrdinalAtt] = ordinalencoder.transform(X_test[OrdinalAtt])

# Drop four one-hot indicator columns by their integer labels.
# Author's note: two of them belong to the same categorical attribute; this
# selection was kept because it gave the lowest validation error.
dropped_indicator_cols = [0,16,21,25]
OH_X_train = OH_X_train.drop(columns=dropped_indicator_cols)
OH_X_valid = OH_X_valid.drop(columns=dropped_indicator_cols)

# scikit-learn requires feature names to be all strings; the one-hot columns are ints.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)



In [None]:
# Display the final training feature matrix (numeric, ordinal and one-hot columns).
OH_X_train

Unnamed: 0,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Number of Doors,Vehicle Size,highway MPG,city mpg,Popularity,1,...,63,64,65,66,67,68,69,70,71,72
1385,2016,7.0,220.0,4.0,4.0,2.0,28,21,3105,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2888,2010,3.0,621.0,12.0,2.0,2.0,19,12,520,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8677,2016,9.0,295.0,6.0,4.0,2.0,26,19,454,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1360,2004,9.0,140.0,4.0,4.0,2.0,31,21,26,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5016,2017,9.0,261.0,6.0,4.0,0.0,22,16,2009,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3714,2017,8.0,402.0,8.0,2.0,2.0,26,18,617,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7409,2008,9.0,200.0,6.0,4.0,1.0,22,15,1013,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3325,1994,9.0,160.0,6.0,4.0,2.0,26,17,26,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9606,2017,4.0,285.0,6.0,2.0,1.0,22,17,1385,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Validation MAE of a tree (max_leaf_nodes=1193) and a forest (196 trees,
# max_leaf_nodes=1078) on the final feature matrix.
print(get_mae_tree(1193,OH_X_train, OH_X_valid, y_train, y_test))
print(get_mae_forest(196,1078,OH_X_train, OH_X_valid, y_train, y_test))#220 > 3006

2975.629070692334
2920.972319600663


In [None]:
# Exhaustive search over max_leaf_nodes for the decision tree on the final features.
# The loop variable is `leaf_nodes`, not `max`: a top-level `for max in ...` rebinds
# the builtin max() for the rest of the notebook session.
# The sentinel is +inf and `best_node` is initialised, so a stale value from another
# cell can never be reported.
lowest_mae = float("inf")
best_node = None
for leaf_nodes in range(900,1250):
  mae = get_mae_tree(leaf_nodes,OH_X_train, OH_X_valid, y_train, y_test)
  if mae < lowest_mae:
    lowest_mae = mae
    best_node = leaf_nodes
print(f'best leaf node = {best_node} , MAE = {lowest_mae}')

best leaf node = 1193 , MAE = 2975.629070692334


In [None]:
# Search the number of trees for the random forest at a fixed leaf-node budget.
# The sentinel is +inf and `best_tree` is initialised here, so the message can never
# report a `best_tree` left over from an earlier search cell.
lowest_mae = float("inf")
best_tree = None
leaf_node = 1078
for n in range(185,210):
  mae = get_mae_forest(n,leaf_node,OH_X_train, OH_X_valid, y_train, y_test)
  if mae < lowest_mae:
    lowest_mae = mae
    best_tree = n
print(f'best leaf node = {leaf_node} ,best number of tree = {best_tree}, MAE = {lowest_mae}')

best leaf node = 1078 ,best number of tree = 196, MAE = 2920.972319600663
