In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from collections import Counter
import matplotlib.colors as mcolors

from sklearn.experimental import enable_iterative_imputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (MinMaxScaler, StandardScaler, RobustScaler, Normalizer, PolynomialFeatures, 
                                   OneHotEncoder, OrdinalEncoder, LabelEncoder, QuantileTransformer, PowerTransformer)
from sklearn.model_selection import (train_test_split, cross_val_score, GridSearchCV, KFold, StratifiedKFold, 
                                     StratifiedShuffleSplit, RepeatedStratifiedKFold, validation_curve, 
                                     cross_validate, RandomizedSearchCV)
from sklearn.linear_model import (LogisticRegression, Ridge, Lasso, ElasticNet, BayesianRidge, LinearRegression)
from sklearn.neighbors import (KNeighborsClassifier, KNeighborsRegressor)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
from sklearn.impute import (SimpleImputer, IterativeImputer, KNNImputer)
from sklearn.compose import (ColumnTransformer, make_column_transformer, TransformedTargetRegressor)
from sklearn.metrics import (accuracy_score, precision_score, recall_score, make_scorer, f1_score, 
                             classification_report, confusion_matrix, roc_curve, precision_recall_curve, 
                             RocCurveDisplay, PrecisionRecallDisplay, ConfusionMatrixDisplay, 
                             balanced_accuracy_score, average_precision_score, mean_absolute_percentage_error, 
                             mean_squared_error, r2_score)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.feature_selection import (SelectKBest, f_regression)

In [2]:
# Location of the training CSV, relative to the notebook's working directory.
path_train = 'train.csv'

In [3]:
# Load the raw training table; every later cell reads and modifies this frame.
df = pd.read_csv(path_train)

In [4]:
# Number of fully duplicated rows. It is printed explicitly because a bare
# expression that is not the last line of a cell is evaluated but never shown.
print("Duplicate rows:", df.duplicated().sum())

# Keep only the first occurrence of each duplicated row.
df = df.drop_duplicates()

In [5]:
# Treat non-positive values in these numeric columns as missing (NaN).
# Existing NaNs are left untouched because `NaN <= 0` evaluates to False.
for column in ('x', 'y', 'z', 'carat', 'depth', 'table'):
    non_positive = df[column] <= 0
    df.loc[non_positive, column] = np.nan

In [6]:
# Divide x/y/z values above 20 by 10.
# NOTE(review): presumably corrects a misplaced decimal point - confirm against the data.
# Not idempotent for values above 200: re-running the cell divides them again.
for column in ('x', 'y', 'z'):
    df.loc[df[column] > 20, column] /= 10

In [7]:
def irq_mse(y_true, y_pred):
    """Negated mean squared error over the IQR-outlier residuals only.

    A residual ``y_pred - y_true`` counts as an outlier when it lies outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, where Q1/Q3 are the 25%/75%
    quantiles of the residuals themselves.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, with the same number of elements as ``y_true``.

    Returns
    -------
    float
        ``-mean(residual ** 2)`` over the outlying residuals, or ``0.0`` when
        no residual falls outside the fences. The value is negated, so a
        larger (closer to zero) result means smaller outlier errors.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` contain a different number of elements.
    """
    # Flatten to plain NumPy arrays so the subtraction is strictly positional:
    # pandas objects would otherwise align on index labels (producing NaNs for
    # non-matching labels) and a column vector would broadcast to a matrix.
    true_values = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if true_values.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {true_values.size} and {predicted.size}"
        )

    delta = predicted - true_values
    quant_25, quant_75 = np.quantile(delta, [0.25, 0.75])
    irq = quant_75 - quant_25
    mask = (delta < quant_25 - 1.5 * irq) | (delta > quant_75 + 1.5 * irq)

    if not mask.any():
        return 0.0

    # Negated mean of the squared outlying residuals.
    return float(-np.mean(delta[mask] ** 2))

# Wrap the metric as a scikit-learn scorer. irq_mse already returns a negated
# error, so make_scorer's default "greater is better" convention applies as-is.
score_irq_mse = make_scorer(irq_mse)
# Trailing expression: shows the scorer's repr as the cell output.
score_irq_mse

make_scorer(irq_mse, response_method='predict')

In [8]:
def classify_columns(df):
    """Partition the columns of ``df`` by whether their dtype is numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be classified.

    Returns
    -------
    tuple[list, list]
        ``(numeric_columns, categorical_columns)``, each in the original
        column order. Every column that ``pd.api.types.is_numeric_dtype``
        does not accept is reported as categorical.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    # Evaluate the dtype check once per column, then split on the result.
    flags = [(name, is_numeric(df[name])) for name in df.columns]
    numeric_columns = [name for name, numeric in flags if numeric]
    categorical_columns = [name for name, numeric in flags if not numeric]
    return numeric_columns, categorical_columns

# Split the cleaned frame's columns into numeric and non-numeric groups.
numeric_columns, categorical_columns = classify_columns(df)

# Printed labels read "Numeric columns:" / "Categorical columns:".
print("Числовые колонки:", numeric_columns)
print("Категориальные колонки:", categorical_columns)

Числовые колонки: ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']
Категориальные колонки: ['cut', 'color', 'clarity']


In [9]:
# X = df.drop('price', axis=1)
# y = df['price']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [10]:
# Column groups routed to the preprocessing branches.
cat_features = ['cut', 'color', 'clarity']
axis_features = ['x', 'y', 'z', 'carat']
num_features = ['depth', 'table']  # only referenced by the disabled "num" branch below

# Numeric branch: impute missing values, expand to degree-2 polynomial terms, then scale.
axis_transformer = Pipeline([
    ('missing_num', IterativeImputer(max_iter=20, missing_values=np.nan)),
    ('polynom', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', RobustScaler()),
])

# num = Pipeline(steps=[
#     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
#     ('scaler', RobustScaler())
# ])

# Categorical branch: fill gaps with the most frequent level, then one-hot encode.
cat = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Columns not listed in any branch (depth, table) fall under ColumnTransformer's
# default remainder='drop' and are not passed on to the estimator.
CT = ColumnTransformer(transformers=[
    ('axis_transformer', axis_transformer, axis_features),
    # ("num", num, num_features),
    ('cat', cat, cat_features),
])
CT.set_output(transform='pandas')  # configures CT in place so it emits DataFrames

pipeline = Pipeline([
    ('preproc', CT),
    ('feature_selection', SelectKBest(f_regression, k=34)),
    ('estimator', SVR()),
])

# Fit the pipeline on a Yeo-Johnson-transformed target; predictions are
# mapped back to the original scale by TransformedTargetRegressor.
model_target = TransformedTargetRegressor(
    transformer=PowerTransformer(method='yeo-johnson'),
    regressor=pipeline,
)

display(model_target)

In [18]:
# Separate the target column from the predictors.
y = df['price']
X = df.drop(columns='price')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=43
)

In [20]:
# Fit on the training split and predict the held-out rows (fit returns the estimator itself).
y_pred = model_target.fit(X_train, y_train).predict(X_test)

# Report the plain MSE, then the negated outlier-only MSE from irq_mse.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")
outlier_score = irq_mse(y_test, y_pred)
print(outlier_score)



Mean Squared Error (MSE): 322542.77
-1782092.8248439764


In [21]:
# cat_features = ['cut', 'color', 'clarity']
# axis_features = ['x', 'y', 'z', 'carat']
# num_features = ['depth', 'table']

# axis_transformer = Pipeline(steps=[
#     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
#     ('polynom', PolynomialFeatures(2, include_bias=False)),
#     ('scaler', RobustScaler())
# ])

# num = Pipeline(steps=[
#     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
#     ('scaler', RobustScaler())
# ])

# cat = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
# ])

# CT = ColumnTransformer([
#     ("axis_transformer", axis_transformer, axis_features),
#     ("num", num, num_features),
#     ("cat", cat, cat_features)
# ]).set_output(transform='pandas')

# pipeline = Pipeline(steps=[
#     ('preproc', CT),
#     ('feature_selection', SelectKBest(score_func=f_regression, k=35)),
#     ('estimator', SVR())
# ])

# model_target = TransformedTargetRegressor(
#     regressor=pipeline,
#     transformer=PowerTransformer(method='yeo-johnson')
# )

# display(model_target)

In [22]:
# model_target.fit(X_train, y_train)
# y_pred = model_target.predict(X_test)

# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error (MSE): {mse:.2f}")
# print(irq_mse(y_test, y_pred))

## Модель без num

In [24]:
# Rebuild predictors/target and repeat the same seeded 70/30 split,
# so this section can be run on its own.
y = df['price']
X = df.drop(columns='price')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=43
)

In [25]:
# Column groups routed to the preprocessing branches.
cat_features = ['cut', 'color', 'clarity']
axis_features = ['x', 'y', 'z', 'carat']
num_features = ['depth', 'table']  # only referenced by the disabled "num" branch below

# Numeric branch: iterative imputation, degree-2 polynomial expansion
# (4 inputs -> 14 terms without the bias column), then robust scaling.
axis_transformer = Pipeline(steps=[
    ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=30)),
    ('polynom', PolynomialFeatures(2, include_bias=False)),
    ('scaler', RobustScaler())
])

# num = Pipeline(steps=[
#     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
#     ('scaler', RobustScaler())
# ])

# Categorical branch: fill gaps with the most frequent level, then one-hot encode.
cat = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# Columns not listed in any branch (depth, table) are dropped by
# ColumnTransformer's default remainder='drop'.
CT = ColumnTransformer([
    ("axis_transformer", axis_transformer, axis_features),
    # ("num", num, num_features),
    ("cat", cat, cat_features)
]).set_output(transform='pandas')

pipeline = Pipeline(steps=[
    ('preproc', CT),
    # A fixed k=40 here asked for more features than the preprocessor yields
    # (14 polynomial terms plus the one-hot columns; 34 in total assuming the
    # categoricals have 20 levels - TODO confirm). With k above the feature
    # count nothing is filtered: scikit-learn warns, or raises on older
    # releases. 'all' expresses the same "keep everything" behaviour explicitly.
    ('feature_selection', SelectKBest(score_func=f_regression, k='all')),
    ('estimator', SVR())
])

# Fit the pipeline on a Yeo-Johnson-transformed target; predictions are
# mapped back to the original scale by TransformedTargetRegressor.
model_target = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=PowerTransformer(method='yeo-johnson')
)

display(model_target)

In [26]:
# Fit on the training split and predict the held-out rows (fit returns the estimator itself).
y_pred = model_target.fit(X_train, y_train).predict(X_test)

# Report the plain MSE, then the negated outlier-only MSE from irq_mse.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")
outlier_score = irq_mse(y_test, y_pred)
print(outlier_score)



Mean Squared Error (MSE): 322916.80
-1792125.5101441147


**Модель без обработки num оказывается постоянно лучше, значит, обработка этих колонок, возможно, добавляет ненужный шум в данные**