In [None]:
# Mount Google Drive under /content/drive.
# OK to enable, if your kaggle.json is stored in Google Drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures, RobustScaler
from sklearn.pipeline import Pipeline

from IPython.core.interactiveshell import InteractiveShell as IS; IS.ast_node_interactivity = "all"
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns, os, tqdm, re, sys, cv2, skimage, time


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
%matplotlib inline
plt.style.use('ggplot')

In [None]:
# Placeholder: replace with the location of the dataset file on Google Drive.
# The value is passed to pd.read_csv, so it must point to a CSV file.
file_path = 'my path to json file from kaggle in Google Disk'
df = pd.read_csv(file_path)

# Preview only the first rows: 'display.max_rows' is set to None above, so
# displaying the whole frame would render every single row.
df.head()

In [None]:
# Histogram (with KDE) of every float64 column, followed by summary statistics.
df_num_cols = df.select_dtypes(include=['float64']).columns

# Derive the subplot grid from the number of columns instead of assuming that
# at most 9 exist (plt.subplot raises once the index exceeds rows * cols).
# At least 3 rows are kept so the layout is unchanged for up to 9 columns.
n_plot_cols = 3
n_plot_rows = max(3, -(-len(df_num_cols) // n_plot_cols))  # ceiling division

# Height grows proportionally with the number of rows (10 for the 3-row grid).
plt.figure(figsize=(20, 10 * n_plot_rows / 3))

for i, col in enumerate(df_num_cols, 1):
    plt.subplot(n_plot_rows, n_plot_cols, i)
    sns.histplot(df[col].dropna(), kde=True, bins=30)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show();

df.describe().T

In [None]:
# Dtype and non-null count of the target column.
df['price'].info()

# Feature Engineering.

In [None]:
# Feature engineering: one-hot encode the categorical columns, then add
# log-transformed, ratio and carat-interaction features.
categorical_cols = ['color', 'clarity', 'cut']

# One call encodes all three columns (same resulting column order as encoding
# them one after another). dtype=int yields 0/1 integer dummies directly, so
# the whole frame no longer has to be multiplied by 1 to cast the booleans.
# drop_first=True drops the first level of every categorical column.
df_onehot = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)

# Dummy columns actually produced by the encoding, in creation order
# (color, clarity, cut). Deriving them avoids a KeyError when a level is
# absent from the data and picks up any level not anticipated in advance.
dummy_cols = [col for col in df_onehot.columns if col not in df.columns]

# log(1 + value) versions of the raw numeric features.
for feature in ['y', 'x', 'z', 'carat', 'depth', 'table']:
    df_onehot[f'{feature}_log'] = np.log1p(df_onehot[feature])

# Ratio of depth to table, and its product with carat.
df_onehot['proportion'] = df_onehot['depth'] / df_onehot['table']
df_onehot['carat * proportion'] = df_onehot['carat'] * df_onehot['proportion']

df_onehot['carat_squared'] = df_onehot['carat'] ** 2
df_onehot['carat_table'] = df_onehot['carat'] * df_onehot['table']

# Interaction of carat with every dummy column: equals carat where the dummy
# is 1 and 0 elsewhere.
for dummy in dummy_cols:
    df_onehot[f'carat_{dummy}'] = df_onehot['carat'] * df_onehot[dummy]

df2 = df_onehot.copy()

In [None]:
# Inspect the column names of the engineered frame.
df2.columns

In [None]:
# Rows without a price form the test sample; rows with a price are the
# training sample.
has_price = df2['price'].notna()

vX = df2[~has_price].drop(columns='price')   # test features
tXY = df2[has_price]                         # training features + target
tX, tY = tXY.drop(columns='price'), tXY['price']

In [None]:
# Report how many rows ended up in the training and test samples.
n_train_rows, n_test_rows = len(tXY), len(vX)
print(f'обучаяющая - {n_train_rows}, тестовая - {n_test_rows}')

Сначала попробуем разделить данные на обучающую и валидационную выборки и посмотрим на результаты.

In [None]:
# Hold out 20% of the training sample for validation; the seed is fixed so
# the split is reproducible.
split_parts = train_test_split(tX, tY, test_size=0.2, random_state=42)
train_X, val_X, train_Y, val_Y = split_parts

LR (линейная регрессия)

In [None]:
# Degree-2 polynomial expansion, fitted on the training split only and then
# applied to the validation split and the test sample.
poly = PolynomialFeatures(degree=2, include_bias=True)
train_X_poly = poly.fit(train_X).transform(train_X)
val_X_poly, vX_poly = poly.transform(val_X), poly.transform(vX)

# Feature scaling: RobustScaler statistics come from the training split and
# are reused for the other two sets.
scaler = RobustScaler()
train_X_scaled = scaler.fit(train_X_poly).transform(train_X_poly)
val_X_scaled, vX_scaled = scaler.transform(val_X_poly), scaler.transform(vX_poly)

In [None]:
# Fit ordinary least squares on the scaled training features.
model = LinearRegression()
model.fit(train_X_scaled, train_Y)

# Predict on both splits so the validation error can be compared with the
# training error.
val_pred = model.predict(val_X_scaled)
train_pred = model.predict(train_X_scaled)

# MAE on the validation split and on the training split.
mae = mean_absolute_error(y_true=val_Y, y_pred=val_pred)
mae_train = mean_absolute_error(y_true=train_Y, y_pred=train_pred)

print(f"Mean Absolute Error (MAE) на валидационной выборке: {mae:.2f}")
print(f"Mean Absolute Error (MAE) на обучающей выборке: {mae_train:.2f}")

Ridge

In [None]:
# Same features as the linear regression above, now with L2 regularisation.
ridge_model = Ridge(alpha=0.4)
ridge_model.fit(train_X_scaled, train_Y)

# Predict on each split and score it with MAE.
val_pred_ridge = ridge_model.predict(val_X_scaled)
train_pred_ridge = ridge_model.predict(train_X_scaled)

mae_val_ridge = mean_absolute_error(y_true=val_Y, y_pred=val_pred_ridge)
mae_train_ridge = mean_absolute_error(y_true=train_Y, y_pred=train_pred_ridge)

print(f"Ridge Regression MAE на валидационной выборке: {mae_val_ridge:.2f}")
print(f"Ridge Regression MAE на обучающей выборке: {mae_train_ridge:.2f}")

Но лучше делать эти проверки с помощью кросс-валидации.

In [None]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Polynomial expansion -> standardisation -> ridge, bundled into one estimator
# that cross_val_score evaluates fold by fold.
pipeline = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=0.01)),
])

# Shuffled 5-fold split with a fixed seed.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Explicit MAE scorer object; the cross_val_score call below selects the
# metric through the 'neg_mean_absolute_error' string instead.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

scores = cross_val_score(
    estimator=pipeline,
    X=tX,
    y=tY,
    scoring='neg_mean_absolute_error',
    cv=kf,
    n_jobs=-1,
)

# The scorer reports negated MAE, so flip the sign back.
mae_scores = -scores

print(f"MAE по фолдам: {mae_scores}")
print(f"Среднее MAE: {mae_scores.mean():.2f}")
print(f"Стандартное отклонение MAE: {mae_scores.std():.2f}")

Соответственно, здесь уже обучаем модель и делаем предсказание на тесте.
- 1) Используем полиномиальные признаки второй степени (при большей степени всё рушится).
- 2) Вместо StandardScaler использовал RobustScaler, так как он менее чувствителен к выбросам и дал лучший результат.

In [None]:
# Degree-2 polynomial expansion fitted on the whole training sample and
# applied to the test sample.
poly_full = PolynomialFeatures(degree=2, include_bias=False)
tX_poly_full = poly_full.fit(tX).transform(tX)
vX_poly_full = poly_full.transform(vX)

# Robust scaling with statistics taken from the expanded training sample.
robust_scaler = RobustScaler()
tX_robust = robust_scaler.fit(tX_poly_full).transform(tX_poly_full)
vX_robust = robust_scaler.transform(vX_poly_full)

С помощью GridSearchCV нахожу оптимальный параметр alpha для нашей трансформированной обучающей выборки.

In [None]:
# Candidate regularisation strengths for Ridge.
alpha_candidates = [0.01, 0.08, 0.09, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 2.0, 5.0, 10.0]
param_grid = {'alpha': alpha_candidates}

# 5-fold grid search over alpha, scored by (negated) MAE.
ridge_model = Ridge()
grid_search = GridSearchCV(
    estimator=ridge_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
)
grid_search.fit(tX_robust, tY)

best_alpha = grid_search.best_params_['alpha']
print(f"Best alpha: {best_alpha}")

Делаем предикт

In [None]:
# Train the final Ridge model on the whole transformed training sample.
# alpha is read from the grid search result rather than copied by hand, so
# the final model always uses the value the search actually selected.
final_model = Ridge(alpha=grid_search.best_params_['alpha'])
final_model.fit(tX_robust, tY)

# Predictions for the test sample.
v_pred = final_model.predict(vX_robust)

In [None]:
# Upper edges of the carat bins used for price clipping. The first bin has no
# lower edge and the bin starting at the last edge has no upper edge.
_CARAT_BIN_EDGES = (0.3, 0.5, 0.7, 1.0, 2.0)


def get_price_bounds(carat_value):
    """Return the (min, max) training price for the carat bin of `carat_value`.

    The bins are: < 0.3, [0.3, 0.5), [0.5, 0.7), [0.7, 1.0), [1.0, 2.0)
    and >= 2.0. Prices are looked up in the module-level training data
    (`tX['carat']` and `tY`).

    Args:
        carat_value: carat of the item whose price bounds are requested.

    Returns:
        Tuple `(min_price, max_price)` over the training rows whose carat
        falls into the same bin. If no training row falls into that bin, the
        overall training price range is returned instead, so clipping with
        the result never produces NaN.
    """
    train_carat = tX['carat']

    # Walk the edges to find the half-open interval [lower, upper) that
    # contains carat_value. A value that is not below any edge (including
    # NaN) ends up in the open-ended last bin.
    lower, upper = None, None
    for edge in _CARAT_BIN_EDGES:
        if carat_value < edge:
            upper = edge
            break
        lower = edge

    if lower is None:
        in_bin = train_carat < upper
    elif upper is None:
        in_bin = train_carat >= lower
    else:
        in_bin = (train_carat >= lower) & (train_carat < upper)

    bin_prices = tY[in_bin]
    if bin_prices.empty:
        # min()/max() of an empty selection is NaN, which would turn the
        # clipped prediction into NaN; use the global price range instead.
        bin_prices = tY
    return bin_prices.min(), bin_prices.max()

# Clip every test prediction to the price bounds of its carat bin.
# The bounds depend only on the carat value, so they are looked up once per
# distinct carat instead of once per prediction (each lookup scans the
# training data), and the clipping itself is done in one vectorised call.
test_carats = vX['carat'].to_numpy()
unique_carats, carat_idx = np.unique(test_carats, return_inverse=True)

# One (min, max) row per distinct carat; reshape keeps the array 2-D even
# when the test sample is empty.
carat_bounds = np.array(
    [get_price_bounds(carat) for carat in unique_carats], dtype=float
).reshape(-1, 2)

v_pred_clipped = np.clip(v_pred, carat_bounds[carat_idx, 0], carat_bounds[carat_idx, 1])

In [None]:
# Submission table: ids numbered from 1 in prediction order, plus the clipped
# price predictions.
n_predictions = len(v_pred_clipped)
output = pd.DataFrame(
    data={'id': range(1, n_predictions + 1), 'price': v_pred_clipped}
)
output.head()

Готовый csv для выгрузки на kaggle.

In [None]:
# Write the submission file without the DataFrame index column.
output.to_csv(path_or_buf='predict.csv', index=False)