In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn import metrics
import warnings as w
# NOTE(review): called without a category or module filter, this suppresses
# every warning process-wide (including deprecation warnings from pandas and
# scikit-learn) — confirm that hiding all of them is intended.
w.filterwarnings('ignore')

In [3]:
# Read the diamonds dataset (path is relative to the working directory)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="diamonds.csv")
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [5]:
# (rows, columns) of the freshly loaded frame.
df.shape

(50000, 10)

In [7]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows a minimum of 0.0 for x, y and z and
# maxima of 58.9 (y) and 31.8 (z) — presumably invalid or outlying
# measurements; confirm and consider filtering them before modelling.
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.799444,61.753006,57.45783,3944.80544,5.734403,5.737956,3.541056
std,0.475173,1.431088,2.232092,3997.938105,1.123077,1.145579,0.707065
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,951.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2410.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5351.0,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [9]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    50000 non-null  float64
 1   cut      50000 non-null  object 
 2   color    50000 non-null  object 
 3   clarity  50000 non-null  object 
 4   depth    50000 non-null  float64
 5   table    50000 non-null  float64
 6   price    50000 non-null  int64  
 7   x        50000 non-null  float64
 8   y        50000 non-null  float64
 9   z        50000 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.8+ MB


In [11]:
# Number of distinct values in each column.
df.nunique()

carat        272
cut            5
color          7
clarity        8
depth        181
table        126
price      11297
x            553
y            551
z            371
dtype: int64

In [13]:
# Count rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

126

In [15]:
# Keep only the first occurrence of each repeated row, then confirm that
# no duplicates remain (expected count: 0).
df = df.drop_duplicates(keep='first')
df.duplicated(keep='first').sum()

0

In [17]:
# Distinct values of the cut column.
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [19]:
# Distinct values of the color column.
df['color'].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [21]:
# Distinct values of the clarity column.
df['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [23]:
# Define the ordinal encoding for the cut labels, from 'Ideal' (0) to 'Fair' (4).
quality_mapping = {
    'Ideal': 0,
    'Premium': 1,
    'Very Good': 2,
    'Good': 3,
    'Fair': 4
}

# Apply the mapping only while the column still holds the raw labels.
# Series.map turns values that are missing from the dict into NaN, so
# re-running this cell on the already-encoded integers would wipe the column.
if not pd.api.types.is_numeric_dtype(df['cut']):
    df['cut'] = df['cut'].map(quality_mapping)

# Check the result
df['cut'].head()

0    0
1    1
2    3
3    1
4    3
Name: cut, dtype: int64

In [25]:
# One-hot encode the remaining categorical columns: each category value
# becomes its own indicator column in the new frame.
categorical_columns = ['color', 'clarity']
df_encoded = pd.get_dummies(data=df, columns=categorical_columns)

print(df_encoded)

       carat  cut  depth  table  price     x     y     z  color_D  color_E  \
0       0.23    0   61.5   55.0    326  3.95  3.98  2.43    False     True   
1       0.21    1   59.8   61.0    326  3.89  3.84  2.31    False     True   
2       0.23    3   56.9   65.0    327  4.05  4.07  2.31    False     True   
3       0.29    1   62.4   58.0    334  4.20  4.23  2.63    False    False   
4       0.31    3   63.3   58.0    335  4.34  4.35  2.75    False    False   
...      ...  ...    ...    ...    ...   ...   ...   ...      ...      ...   
49995   0.72    0   60.8   57.0   2757  5.75  5.76  3.50     True    False   
49996   0.72    3   63.1   55.0   2757  5.69  5.75  3.61     True    False   
49997   0.70    2   62.8   60.0   2757  5.66  5.68  3.56     True    False   
49998   0.86    1   61.0   58.0   2757  6.15  6.12  3.74    False    False   
49999   0.75    0   62.2   55.0   2757  5.83  5.87  3.64     True    False   

       ...  color_I  color_J  clarity_I1  clarity_IF  clarity_S

In [27]:
# Preview the first rows of the one-hot encoded frame.
df_encoded.head()

Unnamed: 0,carat,cut,depth,table,price,x,y,z,color_D,color_E,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,0,61.5,55.0,326,3.95,3.98,2.43,False,True,...,False,False,False,False,False,True,False,False,False,False
1,0.21,1,59.8,61.0,326,3.89,3.84,2.31,False,True,...,False,False,False,False,True,False,False,False,False,False
2,0.23,3,56.9,65.0,327,4.05,4.07,2.31,False,True,...,False,False,False,False,False,False,True,False,False,False
3,0.29,1,62.4,58.0,334,4.2,4.23,2.63,False,False,...,True,False,False,False,False,False,False,True,False,False
4,0.31,3,63.3,58.0,335,4.34,4.35,2.75,False,False,...,False,True,False,False,False,True,False,False,False,False


In [29]:
# Convert every boolean indicator column to integer 0/1.
bool_columns = df_encoded.select_dtypes('bool').columns
df_encoded[bool_columns] = df_encoded[bool_columns].astype(int)

# Check the result
df_encoded.head()


Unnamed: 0,carat,cut,depth,table,price,x,y,z,color_D,color_E,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,0,61.5,55.0,326,3.95,3.98,2.43,0,1,...,0,0,0,0,0,1,0,0,0,0
1,0.21,1,59.8,61.0,326,3.89,3.84,2.31,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0.23,3,56.9,65.0,327,4.05,4.07,2.31,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0.29,1,62.4,58.0,334,4.2,4.23,2.63,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0.31,3,63.3,58.0,335,4.34,4.35,2.75,0,0,...,0,1,0,0,0,1,0,0,0,0


In [31]:
# Separate the target (price) from the feature matrix (every other column).
y = df_encoded['price']
X = df_encoded.drop(columns=['price'])

In [35]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Standardise the features: learn mean and variance from the training split
# only, then apply that same transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [49]:
# Seed the forest so that its bootstrap sampling and feature selection — and
# therefore the reported metrics — are reproducible across kernel restarts.
# The seed matches the one used for the train/test split.
RF = RandomForestRegressor(random_state=42)

In [51]:
# Fit the random forest on the training features and prices.
RF.fit(X_train,y_train)

In [53]:
# Predict prices for the held-out test features.
y_pred_RF=RF.predict(X_test)

In [55]:
# Evaluate the random forest on the held-out test set.
n_samples = len(y_test)
n_features = X_test.shape[1]

r2 = metrics.r2_score(y_test, y_pred_RF)
# Adjusted R^2 rescales R^2 by the number of samples and predictors.
adjusted_r2 = 1 - (1-r2)*(n_samples-1)/(n_samples-n_features-1)
mae = metrics.mean_absolute_error(y_test, y_pred_RF)
mse = metrics.mean_squared_error(y_test, y_pred_RF)
rmse = np.sqrt(mse)

for label, value in (
    ("R^2:", r2),
    ("Adjusted R^2:", adjusted_r2),
    ("MAE:", mae),
    ("MSE:", mse),
    ("RMSE:", rmse),
):
    print(label, value)

R^2: 0.979872488569174
Adjusted R^2: 0.9798279944723615
MAE: 272.64889889048015
MSE: 316135.3610143219
RMSE: 562.2591582307236
