In [1]:
import pandas as pd

In [2]:
# Read the diamonds dataset from the working directory, then preview the
# first five rows.
df = pd.read_csv(filepath_or_buffer='./diamonds.csv')

df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Column groups used by the preprocessing steps below.
categorical_columns = ['cut', 'color', 'clarity']
numeric_columns = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']

# Drop the row identifier by reassignment instead of mutating the loaded
# frame in place.
df = df.drop(columns='id')

# One-hot encode straight to integer indicators. The indicator column names
# are derived from the encoded result rather than hard-coded, so a category
# level that is missing from the data cannot cause a KeyError in a separate
# cast step.
encoded = pd.get_dummies(df, columns=categorical_columns, dtype=int)
bool_columns = [col for col in encoded.columns if col not in df.columns]
df = encoded

# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split, so held-out rows influence the min/max used here.
# `price` (the target) is scaled as well, which puts error metrics computed
# on it in scaled units. Consider fitting on the training rows only.
scaler = MinMaxScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

df.head()

Unnamed: 0,carat,depth,table,price,x,y,z,cut_Fair,cut_Good,cut_Ideal,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.006237,0.513889,0.230769,0.0,0.367784,0.067572,0.076415,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,0.002079,0.466667,0.346154,0.0,0.362197,0.065195,0.072642,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0.006237,0.386111,0.423077,5.4e-05,0.377095,0.0691,0.072642,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0.018711,0.538889,0.288462,0.000433,0.391061,0.071817,0.082704,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0.022869,0.563889,0.288462,0.000487,0.404097,0.073854,0.086478,0,1,0,...,0,1,0,0,0,1,0,0,0,0


In [5]:
# Separate the regression target (`price`) from the predictor columns.
y = df['price']
X = df.drop(columns='price')

In [6]:
import io
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### RANDOM FOREST

In [8]:
# Random forest regressor with 100 estimators and a fixed seed, fitted on the
# training split.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X=X_train, y=y_train)

In [9]:
# Predict the target for the held-out rows with the random forest.
y_pred = rf_model.predict(X=X_test)

In [10]:
# Mean squared error of the current `y_pred` against the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.0008846770845388723


In [11]:
# `score` on a regressor returns the coefficient of determination (R^2) on the
# given data, not a classification accuracy, so report it under that name.
r2 = rf_model.score(X_test, y_test)
accuracy = r2  # legacy name, kept in case other cells still read it
print("R^2 score:", r2)

Accuracy: 0.9809595558510122


### XGBOOST

In [12]:
import xgboost as xgb

In [13]:
# XGBoost regressor with a squared-error objective, 100 estimators and a
# fixed seed, fitted on the training split.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    random_state=42,
)
xgb_model.fit(X=X_train, y=y_train)

In [14]:
# Predict the target for the held-out rows with the XGBoost model.
y_pred = xgb_model.predict(X=X_test)

In [15]:
# Mean squared error of the current `y_pred` against the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.0009302878001750457


In [16]:
# `score` on a regressor returns the coefficient of determination (R^2) on the
# given data, not a classification accuracy, so report it under that name.
r2 = xgb_model.score(X_test, y_test)
accuracy = r2  # legacy name, kept in case other cells still read it
print("R^2 score:", r2)

Accuracy: 0.9799779001725241


### DECISION TREE

In [17]:
from sklearn.tree import DecisionTreeRegressor

In [18]:
# Single decision tree regressor with a fixed seed, fitted on the training
# split.
dt_model = DecisionTreeRegressor(
    random_state=42,
)
dt_model.fit(X=X_train, y=y_train)

In [19]:
# Predict the target for the held-out rows with the decision tree.
y_pred = dt_model.predict(X=X_test)

In [20]:
# Mean squared error of the current `y_pred` against the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.0016646538524922355


In [21]:
# `score` on a regressor returns the coefficient of determination (R^2) on the
# given data, not a classification accuracy, so report it under that name.
r2 = dt_model.score(X_test, y_test)
accuracy = r2  # legacy name, kept in case other cells still read it
print("R^2 score:", r2)

Accuracy: 0.9641725220877664


### ADABOOST

In [22]:
from sklearn.ensemble import AdaBoostRegressor

In [23]:
# AdaBoost regressor with up to 100 estimators and a fixed seed, fitted on the
# training split.
ada_model = AdaBoostRegressor(
    n_estimators=100,
    random_state=42,
)
ada_model.fit(X=X_train, y=y_train)

In [24]:
# Predict the target for the held-out rows with the AdaBoost model.
y_pred = ada_model.predict(X=X_test)

In [25]:
# Mean squared error of the current `y_pred` against the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.007560360601800554


In [26]:
# `score` on a regressor returns the coefficient of determination (R^2) on the
# given data, not a classification accuracy, so report it under that name.
r2 = ada_model.score(X_test, y_test)
accuracy = r2  # legacy name, kept in case other cells still read it
print("R^2 score:", r2)

Accuracy: 0.8372822962179195
