## Обработка датасета

In [92]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import RobustScaler

In [93]:
# Read the training split of the diamonds data and preview its first rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/diamonds-regression/diamonds_train.csv')
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.51,Good,D,SI2,63.9,55.0,1180,5.04,5.1,3.24
1,0.72,Ideal,E,VS2,60.8,57.0,3091,5.79,5.82,3.53
2,0.7,Very Good,D,VVS2,62.8,60.0,4022,5.65,5.69,3.56
3,0.36,Ideal,D,SI1,61.2,57.0,663,4.59,4.63,2.82
4,0.54,Very Good,D,SI1,60.0,59.8,1593,5.3,5.34,3.18


In [94]:
# Show column dtypes and non-null counts for the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43018 entries, 0 to 43017
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    43018 non-null  float64
 1   cut      43018 non-null  object 
 2   color    43018 non-null  object 
 3   clarity  43018 non-null  object 
 4   depth    43018 non-null  float64
 5   table    43018 non-null  float64
 6   price    43018 non-null  int64  
 7   x        43018 non-null  float64
 8   y        43018 non-null  float64
 9   z        43018 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.3+ MB


In [95]:
# Keep the regression target aside: data_preprocess() removes 'price' from the feature frame.
target = df['price']

In [96]:
# Distinct 'cut' labels and their frequencies; these are the keys of cut_codes in data_preprocess().
df['cut'].value_counts()

cut
Ideal        17189
Premium      10993
Very Good     9652
Good          3903
Fair          1281
Name: count, dtype: int64

In [97]:
# Distinct 'color' labels and their frequencies; these are the keys of color_codes in data_preprocess().
df['color'].value_counts()

color
G    9009
E    7853
F    7608
H    6627
D    5407
I    4318
J    2196
Name: count, dtype: int64

In [98]:
# Distinct 'clarity' labels and their frequencies; these are the keys of clarity_codes in data_preprocess().
df['clarity'].value_counts()

clarity
SI1     10445
VS2      9779
SI2      7284
VS1      6560
VVS2     4031
VVS1     2919
IF       1419
I1        581
Name: count, dtype: int64

In [99]:
def data_preprocess(df, scaler=None, fit=True):
    """Turn a raw diamonds frame into an all-numeric, scaled feature frame.

    The function drops ``depth`` and ``table`` (and ``price`` when it is
    present), replaces the ``cut`` / ``color`` / ``clarity`` labels with integer
    codes, and scales every remaining column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``depth``, ``table``, ``cut``, ``color`` and ``clarity``;
        ``price`` is optional. Every other column must be numeric.
        The input frame is not modified.
    scaler : object, optional
        Scaler exposing ``fit_transform`` and ``transform``. When omitted, a
        new ``RobustScaler`` is created and fitted on ``df``. Pass the same
        instance for the training frame (``fit=True``) and the test frame
        (``fit=False``) so that both are scaled with identical statistics.
    fit : bool, default True
        If True the scaler is fitted on ``df`` before transforming; if False
        the scaler is assumed to be fitted already and is only applied.

    Returns
    -------
    pandas.DataFrame
        Scaled features with the same index as ``df`` and the columns that
        remain after dropping ``depth``, ``table`` and ``price``.

    Raises
    ------
    KeyError
        If ``depth`` or ``table`` is missing, or a categorical column holds a
        label that has no code.
    ValueError
        If ``fit=False`` is requested without supplying a scaler.
    """
    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted scaler")
        scaler = RobustScaler()

    # Author's rationale for the drop: these fields are treated as combinations
    # of the other fields and are removed to avoid overfitting.
    features = df.drop(columns=['depth', 'table'])
    # 'price' is the target and exists only in the training data, so its
    # absence is expected and must not be treated as an error.
    features = features.drop(columns=['price'], errors='ignore')

    category_codes = {
        'cut': {'Ideal': 0, 'Premium': 1, 'Very Good': 2, 'Good': 3, 'Fair': 4},
        'color': {'D': 0, 'E': 1, 'F': 2, 'G': 3, 'H': 4, 'I': 5, 'J': 6},
        'clarity': {'IF': 0, 'VVS1': 1, 'VVS2': 2, 'VS1': 3, 'VS2': 4,
                    'SI1': 5, 'SI2': 6, 'I1': 7},
    }
    for column, codes in category_codes.items():
        # .map() would silently produce NaN for an unknown label, so check first
        # and keep raising KeyError, now with the offending values in the message.
        unknown = set(features[column].unique()) - set(codes)
        if unknown:
            raise KeyError(
                f"unexpected {column!r} label(s): {sorted(unknown, key=str)}"
            )
        features[column] = features[column].map(codes)

    scaled = scaler.fit_transform(features) if fit else scaler.transform(features)
    return pd.DataFrame(scaled, columns=features.columns, index=features.index)

In [100]:
# Build the encoded, scaled training features; data_preprocess() works on a copy, so `df` keeps its raw columns.
df_processed = data_preprocess(df)

In [101]:
# Preview the processed features: the categorical columns are now numeric and every column has been scaled.
df_processed.head()

Unnamed: 0,carat,cut,color,clarity,x,y,z
0,-0.296875,1.0,-1.0,1.0,-0.362637,-0.337017,-0.258929
1,0.03125,-0.5,-0.666667,0.0,0.049451,0.060773,0.0
2,0.0,0.5,-1.0,-1.0,-0.027473,-0.01105,0.026786
3,-0.53125,-0.5,-1.0,0.5,-0.60989,-0.596685,-0.633929
4,-0.25,0.5,-1.0,0.5,-0.21978,-0.20442,-0.3125


## Модель

In [102]:
# Plain NumPy arrays for scikit-learn: the feature matrix and the price target.
X, y = df_processed.to_numpy(), target.to_numpy()

In [103]:
from sklearn.ensemble import RandomForestRegressor

In [104]:
# random_state pins the bootstrap samples and feature subsets, so a Restart & Run All
# reproduces the same forest and the same submission. n_jobs=-1 builds the trees on
# all available cores and does not change the result.
model = RandomForestRegressor(
    n_estimators=800,
    max_depth=None,
    max_features=0.7,
    random_state=42,
    n_jobs=-1,
)

In [105]:
# Fit the forest on the training arrays X (features) and y (price).
model.fit(X, y)

## Тест

In [110]:
# Read the test split that predictions are required for.
df_test = pd.read_csv(filepath_or_buffer='/kaggle/input/diamonds-regression/diamonds_test.csv')

In [111]:
# Keep the row identifiers aside, then remove them so they are not used as a feature.
ids = df_test.loc[:, 'id']
df_test = df_test.drop(columns=['id'])

In [112]:
# Apply the same encoding and scaling steps to the test features.
# NOTE(review): called like this, data_preprocess() fits a new scaler on the test frame,
# so the test features are scaled with test-set statistics rather than the ones the model
# saw during training. Confirm whether the training scaler should be reused here.
df_test_processed = data_preprocess(df_test)

In [113]:
# Preview the processed test features; the columns should match those of df_processed.
df_test_processed.head()

Unnamed: 0,carat,cut,color,clarity,x,y,z
0,0.492308,1.0,-0.333333,1.0,0.440217,0.461957,0.300885
1,0.0,0.5,0.666667,-1.5,0.043478,0.054348,-0.070796
2,-0.584615,0.5,0.333333,-1.0,-0.722826,-0.744565,-0.681416
3,-0.430769,-0.5,-0.333333,-1.0,-0.494565,-0.483696,-0.477876
4,-0.461538,-0.5,-0.333333,0.0,-0.521739,-0.51087,-0.504425


In [114]:
# The model was fitted on a bare NumPy array, so features are matched by position only.
# Make sure the test columns come in the training order before dropping the names.
assert list(df_test_processed.columns) == list(df_processed.columns), (
    "test features must have the same columns, in the same order, as the training features"
)
# Pass an array (as in fit) instead of a DataFrame to avoid scikit-learn's
# "X has feature names, but the estimator was fitted without feature names" warning.
result = model.predict(df_test_processed.values)



In [115]:
# Read the sample submission; it serves as the template for the answer file.
pd_ans = pd.read_csv(filepath_or_buffer='/kaggle/input/diamonds-regression/example_answer.csv')

In [119]:
# Predictions are in test-file row order and are written into the template by position.
# Fail loudly if the template rows do not line up with the test rows, instead of
# silently attaching prices to the wrong ids.
assert len(pd_ans) == len(result), "template and predictions differ in length"
assert (pd_ans['id'].to_numpy() == ids.to_numpy()).all(), (
    "template ids are not in the same order as the test ids"
)
pd_ans['price'] = result

In [120]:
# Write the submission file without the index column, then preview it.
pd_ans.to_csv(path_or_buf="sub.csv", index=False)
pd_ans.head(n=5)

Unnamed: 0,id,price
0,0,3972.785833
1,1,2454.535
2,2,712.679062
3,3,1171.38625
4,4,858.5425
