In [4]:
import pandas as pd

In [5]:
# Read the diamonds dataset from a path relative to the working directory.
df = pd.read_csv(filepath_or_buffer='./diamonds.csv')

# Preview the first five rows to check the parsed columns.
df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [6]:
from sklearn.preprocessing import MinMaxScaler

In [7]:
# Preprocessing: drop the row identifier, one-hot encode the categorical
# columns and min-max scale the numeric columns (the target `price` included).
categorical_columns = ['cut', 'color', 'clarity']
numeric_columns = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']

# Rebind rather than mutate in place, so the frame object displayed by the
# earlier cell is not silently changed underneath the reader.
df = df.drop(columns='id')

# dtype=int emits integer 0/1 indicator columns directly, which removes the
# need for a separate cast over a hand-maintained list of dummy column names
# (that list raised KeyError whenever a category level was absent).
df = pd.get_dummies(df, columns=categorical_columns, dtype=int)

# NOTE(review): the scaler is fitted on the full frame before the later
# train/test split, so test-set minima/maxima leak into the training features.
# It also rescales `price`, so every MSE reported below is in scaled units,
# not currency. Consider fitting on the training rows only.
scaler = MinMaxScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Names of the indicator columns, derived from the frame itself so they always
# match the category levels actually present in the data.
dummy_prefixes = tuple(name + '_' for name in categorical_columns)
bool_columns = [column for column in df.columns if column.startswith(dummy_prefixes)]

df.head()

Unnamed: 0,carat,depth,table,price,x,y,z,cut_Fair,cut_Good,cut_Ideal,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.006237,0.513889,0.230769,0.0,0.367784,0.067572,0.076415,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,0.002079,0.466667,0.346154,0.0,0.362197,0.065195,0.072642,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0.006237,0.386111,0.423077,5.4e-05,0.377095,0.0691,0.072642,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0.018711,0.538889,0.288462,0.000433,0.391061,0.071817,0.082704,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0.022869,0.563889,0.288462,0.000487,0.404097,0.073854,0.086478,0,1,0,...,0,1,0,0,0,1,0,0,0,0


In [8]:
# Target vector: the price column.
y = df['price']
# Feature matrix: every column except the target.
X = df.drop(columns='price')

In [9]:
import io
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [10]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### RANDOM FOREST

In [11]:
# Random forest baseline: 100 trees with a fixed seed.
rf_model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)
rf_model.fit(X=X_train, y=y_train)

In [12]:
# Random forest predictions for the held-out rows.
y_pred = rf_model.predict(X=X_test)

In [13]:
# Mean squared error of the random forest on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.0008846770845388723


In [14]:
# For a regressor, `score` returns the coefficient of determination (R^2),
# not a classification accuracy, so the printed label names the metric
# correctly. The variable keeps its original name so that any other cell
# referring to it still resolves.
accuracy = rf_model.score(X_test, y_test)
print("R^2 score:", accuracy)

Accuracy: 0.9809595558510122


In [125]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: two values for each of four settings.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[10, 20],
    min_samples_split=[2, 10],
    min_samples_leaf=[1, 4],
)

# Candidates are ranked by negated mean squared error.
scoring = 'neg_mean_squared_error'

# 5-fold cross-validated search over the grid, using a seeded forest.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring=scoring,
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)

# Evaluate the best estimator from the search on the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Test set MSE:", mse)

### SVM (Support Vector Regression)

In [30]:
from sklearn.svm import SVR

In [31]:
# Support vector regressor with an RBF kernel; all other settings are defaults.
svm_regressor = SVR(
    kernel='rbf',
)

In [32]:
# Fit the support vector regressor on the training split.
svm_regressor.fit(X=X_train, y=y_train)

In [33]:
# Support vector regressor predictions for the held-out rows.
y_pred = svm_regressor.predict(X=X_test)

In [34]:
# Mean squared error of the support vector regressor on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.0030502261068557895


In [35]:
# For a regressor, `score` returns the coefficient of determination (R^2),
# not a classification accuracy, so the printed label names the metric
# correctly. The variable keeps its original name so that any other cell
# referring to it still resolves.
accuracy = svm_regressor.score(X_test, y_test)
print("R^2 score:", accuracy)

Accuracy: 0.9343515720658184


### ANN (Artificial Neural Network)

In [50]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

In [56]:
# Neural-network regressor: one hidden layer of 150 units, ReLU activation,
# Adam solver, fixed seed.
mlp_regressor = MLPRegressor(
    hidden_layer_sizes=(150,),
    activation='relu',
    solver='adam',
    random_state=42,
)

In [57]:
# Fit the neural-network regressor on the training split.
mlp_regressor.fit(X=X_train, y=y_train)

In [58]:
# Neural-network predictions for the held-out rows.
y_pred = mlp_regressor.predict(X=X_test)


In [59]:
# Mean squared error of the neural-network regressor on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.0012017754134240994


In [60]:
# For a regressor, `score` returns the coefficient of determination (R^2),
# not a classification accuracy, so the printed label names the metric
# correctly. The variable keeps its original name so that any other cell
# referring to it still resolves.
accuracy = mlp_regressor.score(X_test, y_test)
print("R^2 score:", accuracy)

Accuracy: 0.9741348136638406


In [61]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Hyperparameter search for the neural-network regressor.
# Note: this cell rebinds `param_grid`, `grid_search` and `mlp_regressor`,
# which earlier cells also assign.

# Define the parameter grid: layer size, activation, solver and L2 penalty.
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (150,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01]
}

# Create a seeded MLPRegressor to be cloned for every candidate.
mlp_regressor = MLPRegressor(random_state=42)

# Create a 5-fold GridSearchCV that ranks candidates by negated MSE.
grid_search = GridSearchCV(
    estimator=mlp_regressor,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Perform grid search on the training split only.
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best model
best_mlp_regressor = grid_search.best_estimator_

# Make predictions on the testing set
y_pred = best_mlp_regressor.predict(X_test)

# Evaluate the performance of the best model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# For a regressor, `score` returns the coefficient of determination (R^2),
# not a classification accuracy, so the label names the metric correctly.
# The variable keeps its original name so other references still resolve.
accuracy = best_mlp_regressor.score(X_test, y_test)
print("R^2 score:", accuracy)

Best Parameters: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (150,), 'solver': 'adam'}
Mean Squared Error: 0.0012017754134240994
Accuracy: 0.9741348136638406


In [120]:
# Alternative network: two hidden layers of 20 units each, ReLU activation,
# Adam solver, fixed seed.
mlp_regressor1 = MLPRegressor(
    hidden_layer_sizes=(20, 20),
    activation='relu',
    solver='adam',
    random_state=42,
)

In [121]:
# Fit the two-layer network on the training split.
mlp_regressor1.fit(X=X_train, y=y_train)

In [122]:
# Two-layer network predictions for the held-out rows.
y_pred = mlp_regressor1.predict(X=X_test)

In [123]:
# Mean squared error of the two-layer network on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.0010982483634461946


In [124]:
# For a regressor, `score` returns the coefficient of determination (R^2),
# not a classification accuracy, so the printed label names the metric
# correctly. The variable keeps its original name so that any other cell
# referring to it still resolves.
accuracy = mlp_regressor1.score(X_test, y_test)
print("R^2 score:", accuracy)

Accuracy: 0.9763629724434265
