In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
import numpy as np

In [2]:
# Cleaning first column
data = pd.read_csv('TRAIN.csv')
data.pop('Unnamed: 0')
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
cut_transform = {
    'Good': 1,
    'Ideal': 2,
    'Premium': 3
}

data['cut'] = data['cut'].map(cut_transform)

In [4]:
np.unique(data['clarity'])

array(['I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2'],
      dtype=object)

In [5]:
# Use for clarity One-hot-encoder
clarity_transform = {
    'I3':1,
    'I2':2,
    'I1':3,
    'SI2':4,
    'SI1':5,
    'VS2':6,
    'VS1':7,
    'VVS2':8,
    'VVS1':9,
    'IF':10,
    'FL':11
}

data['clarity'] = data['clarity'].map(clarity_transform)

In [6]:
encoder = OneHotEncoder(sparse_output=False)

encoded_color = encoder.fit_transform(data[['color']])
encoded_df = pd.DataFrame(encoded_color, columns=encoder.get_feature_names_out(['color']))

data = pd.concat([data,encoded_df], axis=1)
data.head(5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,color_D,color_E,color_F,color_G,color_H,color_I,color_J
0,0.23,2.0,E,4,61.5,55.0,326,3.95,3.98,2.43,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.21,3.0,E,5,59.8,61.0,326,3.89,3.84,2.31,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.23,1.0,E,7,56.9,65.0,327,4.05,4.07,2.31,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.29,3.0,I,6,62.4,58.0,334,4.2,4.23,2.63,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.31,1.0,J,4,63.3,58.0,335,4.34,4.35,2.75,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
# drop the 'color' column
data = data.drop(columns='color')
data.head(5)

Unnamed: 0,carat,cut,clarity,depth,table,price,x,y,z,color_D,color_E,color_F,color_G,color_H,color_I,color_J
0,0.23,2.0,4,61.5,55.0,326,3.95,3.98,2.43,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.21,3.0,5,59.8,61.0,326,3.89,3.84,2.31,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.23,1.0,7,56.9,65.0,327,4.05,4.07,2.31,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.29,3.0,6,62.4,58.0,334,4.2,4.23,2.63,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.31,1.0,4,63.3,58.0,335,4.34,4.35,2.75,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
# shuffle dataset
data = shuffle(data, random_state=42)
data.head(5)

Unnamed: 0,carat,cut,clarity,depth,table,price,x,y,z,color_D,color_E,color_F,color_G,color_H,color_I,color_J
1388,0.24,2.0,9,62.1,56.0,559,3.97,4.0,2.47,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50052,0.58,,8,60.0,57.0,2201,5.44,5.42,3.26,0.0,0.0,1.0,0.0,0.0,0.0,0.0
41645,0.4,2.0,8,62.1,55.0,1238,4.76,4.74,2.95,0.0,1.0,0.0,0.0,0.0,0.0,0.0
42377,0.43,3.0,8,60.8,57.0,1304,4.92,4.89,2.98,0.0,1.0,0.0,0.0,0.0,0.0,0.0
17244,1.55,2.0,4,62.3,55.0,6901,7.44,7.37,4.61,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# erase the 'price' column from data
price = data.pop('price')

In [10]:
# split features and targets into two sets
from sklearn.model_selection import train_test_split

df_train, df_test, price_train, price_test = train_test_split(data, price, test_size=0.3)

In [11]:
# import the decisioin tree
from sklearn.tree import DecisionTreeRegressor

trees = [  
    DecisionTreeRegressor(criterion='squared_error', max_depth=12),
    DecisionTreeRegressor(criterion='friedman_mse', max_depth=16),
    DecisionTreeRegressor(criterion='poisson', max_depth=22),
    DecisionTreeRegressor(criterion='squared_error', max_depth=45),
    DecisionTreeRegressor(criterion='friedman_mse', max_depth=95),
    DecisionTreeRegressor(criterion='poisson', max_depth=33)
]

In [12]:
# fit all models
for tree in trees:
    tree.fit(df_train, price_train)

In [None]:
# import the r2_score metric
from sklearn.metrics import r2_score

score = [ r2_score(y_pred=tree.predict(df_test), y_true=price_test) for tree in trees]

In [15]:
# result
print(*score)

0.972314784619526 0.9686325228692033 0.965331317781574 0.9641677374899997 0.9638479841850943 0.9650650047813927
