In [3]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

In [4]:
# Load the diamonds table from a CSV expected in the notebook's working directory.
# The saved outputs further down show an 'Unnamed: 0' column, so the file
# appears to carry a stored row index as a regular column.
csv_path = 'Diamonds Prices2022 2.csv'
diamonds = pd.read_csv(csv_path)


In [5]:
# Preview the first three rows of the freshly loaded frame.
diamonds.head(3)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31


In [28]:
# Count the distinct values in every column.
# NOTE(review): this cell's execution count (28) is higher than that of the
# cells below it, and its saved output lists 'yen' but not
# 'price'/'depth'/'table'. The output therefore reflects the frame after the
# later cells ran, not the state at this point of a top-to-bottom run.
diamonds.nunique()

Unnamed: 0    53943
carat           273
cut               5
color             7
clarity           8
x               554
y               552
z               375
yen           11602
dtype: int64

In [6]:
# Build the modelling target: natural log of the price after scaling it by a
# fixed factor.
# NOTE(review): 135.10507326 looks like a currency conversion rate into yen
# (going by the column name) — confirm its source and date.
# Re-running this cell after 'price' has been dropped further down raises a
# KeyError, so it only works once per freshly loaded frame.
yen_per_price_unit = 135.10507326
diamonds['yen'] = np.log(diamonds['price'] * yen_per_price_unit)

In [7]:
# Remove the raw price (already folded into the log-scaled 'yen' target) and
# the two columns that are not used as model inputs.
# Rebinding instead of inplace=True keeps the operation explicit, and
# errors='ignore' lets the cell be re-run after the columns are already gone
# instead of failing with a KeyError.
diamonds = diamonds.drop(columns=['price', 'depth', 'table'], errors='ignore')

In [8]:
# Inspect the frame after the 'yen' target was added and the three columns
# were dropped.
diamonds.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,x,y,z,yen
0,1,0.23,Ideal,E,SI2,3.95,3.98,2.43,10.69295
1,2,0.21,Premium,E,SI1,3.89,3.84,2.31,10.69295
2,3,0.23,Good,E,VS1,4.05,4.07,2.31,10.696013
3,4,0.29,Premium,I,VS2,4.2,4.23,2.63,10.717194
4,5,0.31,Good,J,SI2,4.34,4.35,2.75,10.720183


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Target is the log-scaled 'yen' column; every other column stays in X.
y = diamonds['yen']
X = diamonds.drop(columns='yen')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Only the columns named in these two lists are handed to a transformer.
numeric_features = ['carat', 'x', 'y', 'z']
categorical_features = ['cut', 'color', 'clarity']

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with the encoder configured to ignore categories it was not fitted on.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Fit the preprocessing on the training rows only, then reuse the fitted
# transformers for the test rows. Both names are rebound to the transformed
# matrices from here on.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [10]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the preprocessed training matrix. fit() hands
# back the estimator, so the fitted model can be bound in one step.
model = LinearRegression().fit(X_train, y_train)

# Display the fitted estimator as the cell result.
model


In [11]:
# Show the fitted parameters of the linear model, one labelled line each.
for label, value in (
    ('Intercept:', model.intercept_),
    ('Coefficients:', model.coef_),
):
    print(label, value)

Intercept: 12.633528837753014
Coefficients: [-2.85824059e-01  1.05279276e+00 -7.18800987e-03  3.26718746e-01
 -4.56268439e-02  1.53202394e-03  2.89168235e-02 -2.90117133e-04
  1.54681136e-02  2.10198734e-01  1.49457971e-01  1.17453579e-01
  5.31406225e-02 -4.52768234e-02 -1.72855889e-01 -3.12118194e-01
 -6.93813098e-01  3.80356305e-01 -9.52879341e-02 -2.65235853e-01
  1.07648797e-01  4.45883509e-02  2.93927918e-01  2.27815514e-01]


In [159]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the fitted linear model on the held-out split.
# The target comes from the 'yen' column, which is built with np.log, so the
# MSE below is expressed on that log scale rather than in currency units.
y_pred = model.predict(X_test)

mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

print('MSE:', mse)
print('R2:', r2)


MSE: 0.05332320884424102
R2: 0.9474545050429541


In [162]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so repeated runs produce the same fitted trees and
# cross-validation scores; 42 matches the seed used for the train/test split.
rf = RandomForestRegressor(random_state=42)

# Each hyper-parameter lists a single value, so the search evaluates exactly
# one candidate (with 5-fold cross-validation).
param_grid = {
    'n_estimators': [500],
    'max_depth': [20],
}

# n_jobs=-1 runs the cross-validation fits in parallel.
# NOTE(review): the prediction cells further down call `model` (the linear
# regression), not this search — confirm which estimator is meant to be used.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)


In [None]:
# Hyper-parameter combination selected by the grid search.
# NOTE(review): this cell has no execution count although an output is saved
# below it — re-run to confirm the output is current.
best_params = grid_search.best_params_
print(best_params)


{'max_depth': 20, 'n_estimators': 500}


In [163]:
# One hand-entered diamond, described with the raw feature columns the
# preprocessor selects by name.
new_data = pd.DataFrame(
    [
        {
            'carat': 0.297,
            'cut': 'Very Good',
            'color': 'H',
            'clarity': 'VS1',
            'x': 4.32,
            'y': 4.42,
            'z': 2.60,
        }
    ]
)

# The name is rebound here: from now on new_data is the transformed feature
# matrix, not the raw one-row frame.
new_data = preprocessor.transform(new_data)

# The linear model was trained on the log-scaled 'yen' target, so the value
# printed below is on that log scale.
predicted_price = model.predict(new_data)

print('Predicted price:', predicted_price)


Predicted price: [11.25532364]


In [164]:
# Undo the natural log applied when the 'yen' target was built, giving the
# prediction on the un-logged yen scale.
np.exp(predicted_price)

array([77290.29227245])

In [165]:
# Second hand-entered diamond, using the same raw feature columns.
new_data1 = pd.DataFrame(
    [
        {
            'carat': 0.242,
            'cut': 'Very Good',
            'color': 'F',
            'clarity': 'VS2',
            'x': 4.03,
            'y': 4.08,
            'z': 2.43,
        }
    ]
)

# Rebinds the name to the transformed feature matrix.
new_data1 = preprocessor.transform(new_data1)

# Prediction is on the log scale of the 'yen' target the model was fitted on.
predicted_price2 = model.predict(new_data1)

print('Predicted price:', predicted_price2)

Predicted price: [11.03854748]


In [166]:
# Undo the natural log applied when the 'yen' target was built, giving the
# second prediction on the un-logged yen scale.
np.exp(predicted_price2)

array([62227.20013288])

In [17]:
import warnings

# Third hand-entered diamond, using the same raw feature columns.
# NOTE(review): the column summary near the top of the notebook reports only
# 7 distinct colors; 'K' may not be one of them — confirm the intended grade.
new_data2 = pd.DataFrame({
    'carat': [0.315],
    'cut': ['Premium'],
    'color': ['K'],
    'clarity': ['SI1'],
    'x': [4.35],
    'y': [4.40],
    'z': [2.69]
})

# The one-hot encoder is configured with handle_unknown='ignore', so a grade
# it was not fitted on does not raise and is silently left out of the
# encoding. Compare the input against the fitted categories and warn, so a
# typo or out-of-range grade cannot go unnoticed.
fitted_encoder = preprocessor.named_transformers_['cat']
for column, known_values in zip(categorical_features, fitted_encoder.categories_):
    unseen = sorted(set(new_data2[column]) - set(known_values))
    if unseen:
        warnings.warn(
            f"{column!r} has value(s) {unseen} not seen during fitting; "
            "the encoder ignores them, so the prediction does not reflect them."
        )

# Rebinds the name to the transformed feature matrix.
new_data2 = preprocessor.transform(new_data2)

# Model output is on the log scale of the 'yen' target.
predicted_price3 = model.predict(new_data2)

# Exponentiate to report the prediction on the un-logged yen scale.
print('Predicted price:', np.exp(predicted_price3))

Predicted price: [68979.52160609]


In [25]:
# Fourth hand-entered diamond, using the same raw feature columns.
new_data3 = pd.DataFrame(
    [
        {
            'carat': 0.36,
            'cut': 'Premium',
            'color': 'H',
            'clarity': 'VS1',
            'x': 4.58,
            'y': 4.62,
            'z': 2.81,
        }
    ]
)

# Rebinds the name to the transformed feature matrix.
new_data3 = preprocessor.transform(new_data3)

# Model output is on the log scale of the 'yen' target.
predicted_price4 = model.predict(new_data3)

# Exponentiate to report the prediction on the un-logged yen scale.
print('Predicted price:', np.exp(predicted_price4))

Predicted price: [103024.4448268]
