In [2]:
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
warnings.filterwarnings('ignore')

In [5]:
# Read the training set from the Kaggle input directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/linear-regression-apu/diamonds_train.csv')
df.head(n=5)


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.51,Good,D,SI2,63.9,55.0,1180,5.04,5.1,3.24
1,0.72,Ideal,E,VS2,60.8,57.0,3091,5.79,5.82,3.53
2,0.7,Very Good,D,VVS2,62.8,60.0,4022,5.65,5.69,3.56
3,0.36,Ideal,D,SI1,61.2,57.0,663,4.59,4.63,2.82
4,0.54,Very Good,D,SI1,60.0,59.8,1593,5.3,5.34,3.18


In [6]:
# Print column names, non-null counts, dtypes and memory usage to check the schema.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43018 entries, 0 to 43017
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    43018 non-null  float64
 1   cut      43018 non-null  object 
 2   color    43018 non-null  object 
 3   clarity  43018 non-null  object 
 4   depth    43018 non-null  float64
 5   table    43018 non-null  float64
 6   price    43018 non-null  int64  
 7   x        43018 non-null  float64
 8   y        43018 non-null  float64
 9   z        43018 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.3+ MB


### ПРЕДВАРИТЕЛЬНАЯ ОБРАБОТКА ДАННЫХ

In [7]:
# Per-column count of missing values; show only the affected columns, or a
# short message when the frame is complete.
missing = df.isna().sum()
if missing.sum() > 0:
    display(missing[missing > 0])
else:
    display("Пропущенных значений нет")

'Пропущенных значений нет'

In [8]:
# List the distinct levels of each categorical column that is present in df.
categorical_cols = ['cut', 'color', 'clarity']
for col in categorical_cols:
    if col not in df.columns:
        continue
    print(f"{col}: {sorted(df[col].unique())}")

cut: ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good']
color: ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity: ['I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2']


In [9]:
# One-hot encode the categorical columns; every level becomes its own
# "<column>_<level>" indicator column, the remaining columns pass through.
categorical_cols = ['cut', 'color', 'clarity']
df_encoded = pd.get_dummies(data=df, columns=categorical_cols)

### Подготовка признаков

In [10]:
# Every column of the encoded frame except the target 'price' is a model input.
feature_columns = df_encoded.columns[df_encoded.columns != 'price'].tolist()
print(f"Признаки для обучения ({len(feature_columns)}): {feature_columns}")

Признаки для обучения (26): ['carat', 'depth', 'table', 'x', 'y', 'z', 'cut_Fair', 'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_D', 'color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 'clarity_I1', 'clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2']


In [11]:
# Split the encoded frame into the feature matrix and the price target,
# then report their shapes and the central tendency of the target.
X = df_encoded.loc[:, feature_columns]
y = df_encoded.loc[:, 'price']
print(f"Размеры: X {X.shape}, y {y.shape}")
print(f"Статистика цены - среднее: ${y.mean():,.2f}, медиана: ${y.median():,.2f}")

Размеры: X (43018, 26), y (43018,)
Статистика цены - среднее: $3,929.50, медиана: $2,401.00


### Разделение на train/test

In [12]:
# Shuffled 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)
print(f"Train: {len(X_train):,} образцов ({len(X_train)/len(X)*100:.1f}%)")
print(f"Test: {len(X_test):,} образцов ({len(X_test)/len(X)*100:.1f}%)")


Train: 34,414 образцов (80.0%)
Test: 8,604 образцов (20.0%)


### Нормализация и стандартизация данных

In [13]:
# Stage 1: min-max normalisation. The ranges are learned from the training
# split only and then applied unchanged to the hold-out split.
minmax_scaler = MinMaxScaler().fit(X_train)
X_train_norm = minmax_scaler.transform(X_train)
X_test_norm = minmax_scaler.transform(X_test)

# Stage 2: standardise the normalised features, again fitting on the
# training split only.
standard_scaler = StandardScaler().fit(X_train_norm)
X_train_scaled, X_test_scaled = (
    standard_scaler.transform(part) for part in (X_train_norm, X_test_norm)
)

### Обучение модели

In [14]:
# 100-tree random forest with a fixed seed, fitted on the scaled training split.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X=X_train_scaled, y=y_train)

In [15]:
# Predictions of the fitted forest for the training and hold-out splits.
y_train_pred_rf, y_test_pred_rf = (
    rf_model.predict(part) for part in (X_train_scaled, X_test_scaled)
)

In [16]:
# Core metrics of the fitted forest on the training and hold-out splits.
train_r2_rf = r2_score(y_train, y_train_pred_rf)
test_r2_rf = r2_score(y_test, y_test_pred_rf)
train_rmse_rf = np.sqrt(mean_squared_error(y_train, y_train_pred_rf))
test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))

print("МЕТРИКИ КАЧЕСТВА (Random Forest):")
print(f"R² Score - Train: {train_r2_rf:.4f}, Test: {test_r2_rf:.4f}")
print(f"RMSE - Train: ${train_rmse_rf:,.2f}, Test: ${test_rmse_rf:,.2f}")

# Cross-validation on the *raw* training split. The scalers are part of the
# pipeline, so they are re-fitted on each fold's training part and the
# validation part never influences preprocessing. cross_val_score clones the
# estimator for every fold, so the already-fitted rf_model is left untouched.
# n_jobs=-1 evaluates the folds in parallel.
cv_pipeline = make_pipeline(MinMaxScaler(), StandardScaler(), rf_model)
cv_scores = cross_val_score(
    cv_pipeline, X_train, y_train, cv=5, scoring='r2', n_jobs=-1
)
print(f"Кросс-валидация R² (5-fold): {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

МЕТРИКИ КАЧЕСТВА (Random Forest):
R² Score - Train: 0.9973, Test: 0.9781
RMSE - Train: $205.59, Test: $595.37
Кросс-валидация R² (5-fold): 0.9805 (+/- 0.0017)


### Финальная модель для соревнований

In [17]:
# Step 1: min-max normalisation, this time fitted on the full labelled data.
# This rebinds minmax_scaler, replacing the scaler fitted on the training split.
minmax_scaler = MinMaxScaler().fit(X)
X_norm = minmax_scaler.transform(X)
# Step 2: standardisation of the normalised features (rebinds standard_scaler).
standard_scaler = StandardScaler().fit(X_norm)
X_scaled = standard_scaler.transform(X_norm)

# Final forest with the same hyper-parameters, trained on all labelled rows.
final_model = RandomForestRegressor(random_state=42, n_estimators=100)
final_model.fit(X=X_scaled, y=y)

In [18]:
# In-sample fit quality: the model is scored on X_scaled, the same rows it
# was trained on, so these numbers are not an estimate of generalisation.
predictions = final_model.predict(X_scaled)
final_r2, final_rmse = (
    r2_score(y, predictions),
    np.sqrt(mean_squared_error(y, predictions)),
)
print(f"R² на всех данных: {final_r2:.4f}")
print(f"RMSE на всех данных: ${final_rmse:,.2f}")

# Persist the model together with both scalers needed to prepare its inputs.
joblib.dump(value=final_model, filename='final_model.pkl')
joblib.dump(value=minmax_scaler, filename='minmax_scaler.pkl')
joblib.dump(value=standard_scaler, filename='standard_scaler.pkl')

R² на всех данных: 0.9973
RMSE на всех данных: $206.60


['standard_scaler.pkl']


### Тестирование и сохранение файла

In [20]:
# Read the unlabelled competition file and print its first rows.
df_test = pd.read_csv(filepath_or_buffer='/kaggle/input/linear-regression-apu/diamonds_test.csv')
print(df_test.head(n=5))

   id  carat        cut color clarity  depth  table     x     y     z
0   0   1.02       Good     F     SI2   59.2   58.0  6.51  6.56  3.87
1   1   0.70  Very Good     I    VVS1   59.5   58.0  5.78  5.81  3.45
2   2   0.32  Very Good     H    VVS2   63.4   56.0  4.37  4.34  2.76
3   3   0.42      Ideal     F    VVS2   62.2   56.0  4.79  4.82  2.99
4   4   0.40      Ideal     F     VS2   62.3   54.0  4.74  4.77  2.96


In [22]:
# Keep the submission ids before the frame is encoded.
test_ids = df_test['id'].copy()
df_test_encoded = pd.get_dummies(df_test, columns=categorical_cols)

# get_dummies only emits indicator columns for levels that occur in this
# file, so a level seen in training but absent here would otherwise make the
# column selection below fail. A missing *raw* feature is a real schema
# problem and must still fail loudly instead of being zero-filled.
dummy_prefixes = tuple(f"{col}_" for col in categorical_cols)
absent_raw = [
    col for col in feature_columns
    if col not in df_test_encoded.columns and not col.startswith(dummy_prefixes)
]
if absent_raw:
    raise KeyError(f"В тестовых данных нет обязательных признаков: {absent_raw}")

# Align to the training schema: same columns in the same order, absent
# indicator columns filled with 0, columns unknown to the model dropped.
# NOTE: this rebinds X_test, which previously held the hold-out split.
X_test = df_test_encoded.reindex(columns=feature_columns, fill_value=0)

In [23]:
# 1. Min-max normalisation with the already-fitted scaler (transform only, no refit).
X_test_norm = minmax_scaler.transform(X=X_test)
# 2. Standardisation with the already-fitted scaler (transform only, no refit).
X_test_scaled = standard_scaler.transform(X=X_test_norm)

In [24]:
# Predict prices for the prepared competition features.
# This rebinds `predictions`, which previously held the in-sample predictions.
predictions = final_model.predict(X_test_scaled)

In [25]:
# Two-column submission frame (id, price), written without the index column.
df_result = df_test[['id']].assign(price=predictions)
df_result.to_csv(path_or_buf='submission.csv', index=False)