In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from collections import Counter
import matplotlib.colors as mcolors

from sklearn.experimental import enable_iterative_imputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (MinMaxScaler, StandardScaler, RobustScaler, Normalizer, PolynomialFeatures, 
                                   OneHotEncoder, OrdinalEncoder, LabelEncoder, QuantileTransformer, PowerTransformer)
from sklearn.model_selection import (train_test_split, cross_val_score, GridSearchCV, KFold, StratifiedKFold, 
                                     StratifiedShuffleSplit, RepeatedStratifiedKFold, validation_curve, 
                                     cross_validate, RandomizedSearchCV)
from sklearn.linear_model import (LogisticRegression, Ridge, Lasso, ElasticNet, BayesianRidge, LinearRegression)
from sklearn.neighbors import (KNeighborsClassifier, KNeighborsRegressor)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
from sklearn.impute import (SimpleImputer, IterativeImputer, KNNImputer)
from sklearn.compose import (ColumnTransformer, make_column_transformer, TransformedTargetRegressor)
from sklearn.metrics import (accuracy_score, precision_score, recall_score, make_scorer, f1_score, 
                             classification_report, confusion_matrix, roc_curve, precision_recall_curve, 
                             RocCurveDisplay, PrecisionRecallDisplay, ConfusionMatrixDisplay, 
                             balanced_accuracy_score, average_precision_score, mean_absolute_percentage_error, 
                             mean_squared_error, r2_score)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.feature_selection import (SelectKBest, f_regression)
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.feature_selection import SelectFromModel


In [2]:
# Location of the training CSV, relative to the notebook's working directory.
path_train = "train.csv"

In [3]:
# Load the raw training table; later cells clean this frame in place.
df = pd.read_csv(filepath_or_buffer=path_train)

In [4]:
# Count fully duplicated rows and report the number. A bare expression that is
# not the last statement of a cell is never displayed, so the count has to be
# printed explicitly before the duplicates are dropped.
n_duplicates = int(df.duplicated().sum())
print(f"Duplicate rows found: {n_duplicates}")
df = df.drop_duplicates()

In [5]:
# Non-positive measurements are treated as missing: replace them with NaN so
# that the imputers further down can fill them in.
for column in ('x', 'y', 'z', 'carat', 'depth', 'table'):
    df.loc[df[column] <= 0, column] = np.nan

In [6]:
# Dimension values above 20 are divided by 10.
# NOTE(review): presumably these are decimal-point entry errors - confirm against the data.
for column in ('x', 'y', 'z'):
    df.loc[df[column] > 20, column] /= 10

In [7]:
# Overwrite `depth` with the value implied by the dimensions:
# 100 * z / mean(x, y), rounded to one decimal place.
df['depth'] = (2 * df['z'] / (df['x'] + df['y']) * 100).round(1)

In [8]:
# df

In [9]:
def irq_mse(y_true, y_pred):
    """Negative mean squared error restricted to outlier residuals.

    The residuals ``y_pred - y_true`` are split by the 1.5 * IQR rule:
    a residual is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25% / 75% quantiles of the
    residuals themselves. Only those outliers contribute to the score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``0.0`` when there are no residuals or no outlier residuals,
        otherwise minus the mean of the squared outlier residuals
        (so the value is never positive and larger is better).
    """
    # Compare strictly by position. Subtracting two pandas objects aligns them
    # by index label, which turns mismatched indexes into all-NaN residuals
    # (and then into a misleading score of 0); flattening also prevents an
    # (n, 1) vs (n,) pair from broadcasting into an (n, n) matrix.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    delta = predicted - actual

    # No samples means no outliers; same result as the no-outlier branch below.
    if delta.size == 0:
        return 0.0

    quant_75 = np.quantile(delta, 0.75)
    quant_25 = np.quantile(delta, 0.25)
    iqr = quant_75 - quant_25
    mask = (delta < quant_25 - 1.5 * iqr) | (delta > quant_75 + 1.5 * iqr)

    if not mask.any():
        return 0.0

    squared_outliers = delta[mask] ** 2
    return -1 / len(squared_outliers) * np.sum(squared_outliers)

# Wrap irq_mse as a scikit-learn scorer. irq_mse already returns a
# non-positive value, so a higher score is better and no sign flip is needed.
score_irq_mse = make_scorer(irq_mse, greater_is_better=True)
score_irq_mse

make_scorer(irq_mse, response_method='predict')

In [10]:
def classify_columns(df):
    """Split the column labels of ``df`` into numeric and non-numeric groups.

    A column counts as numeric when ``pd.api.types.is_numeric_dtype`` says so;
    every other column is reported as categorical. Column order is preserved
    within each group.

    Returns
    -------
    tuple[list, list]
        ``(numeric_columns, categorical_columns)``.
    """
    flagged = [
        (name, pd.api.types.is_numeric_dtype(df[name]))
        for name in df.columns
    ]
    numeric_columns = [name for name, is_numeric in flagged if is_numeric]
    categorical_columns = [name for name, is_numeric in flagged if not is_numeric]
    return numeric_columns, categorical_columns

# Group the cleaned frame's columns by dtype and report both groups.
# The target column `price` is still part of `df` here, so it is listed too.
column_groups = classify_columns(df)
numeric_columns, categorical_columns = column_groups

print("Числовые колонки:", numeric_columns)
print("Категориальные колонки:", categorical_columns)

Числовые колонки: ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']
Категориальные колонки: ['cut', 'color', 'clarity']


In [11]:
# df = df.drop(['depth', 'table'], axis=1)

In [12]:
# Separate the target (`price`) from the features, then hold out 33% of the
# rows for evaluation with a fixed seed.
y = df['price']
X = df.drop(columns='price')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=43,
)

## Select K-best

**Фич без обработки num всего 34**

In [15]:
# cat_features = ['cut', 'color', 'clarity']
# axis_features = ['x', 'y', 'z', 'carat']
# num_features = ['depth', 'table']

# axis_transformer = Pipeline(steps=[
#     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
#     ('polynom', PolynomialFeatures(2, include_bias=False)),
#     ('scaler', RobustScaler())
# ])

# # num = Pipeline(steps=[
# #     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
# #     ('scaler', RobustScaler())
# # ])

# cat = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
# ])

# CT = ColumnTransformer([
#     ("axis_transformer", axis_transformer, axis_features),
#     # ("num", num, num_features),
#     ("cat", cat, cat_features)
# ]).set_output(transform='pandas')

# pipeline = Pipeline(steps=[
#     ('preproc', CT),
#     ('feature_selection', SelectKBest(score_func=f_regression, k=34)), 
#     ('estimator', SVR())
# ])

# model_target = TransformedTargetRegressor(
#     regressor=pipeline,
#     transformer=PowerTransformer(method='yeo-johnson')
# )

# display(model_target)

In [16]:
# model_target.fit(X_train, y_train)
# y_pred = model_target.predict(X_test)

# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error (MSE): {mse:.2f}")
# print(irq_mse(y_test, y_pred))

**Уменьшение числа признаков не улучшает метрику \
Убираем SelectKBest**

## Основная выбранная модель

In [19]:
# cat_features = ['cut', 'color', 'clarity']
# axis_features = ['x', 'y', 'z', 'carat']
# num_features = ['depth', 'table']

# axis_transformer = Pipeline(steps=[
#     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
#     ('polynom', PolynomialFeatures(2, include_bias=False)),
#     ('scaler', RobustScaler())
# ])

# num = Pipeline(steps=[
#     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
#     ('scaler', RobustScaler())
# ])

# cat = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
# ])

# CT = ColumnTransformer([
#     ("axis_transformer", axis_transformer, axis_features),
#     ("num", num, num_features),
#     ("cat", cat, cat_features)
# ]).set_output(transform='pandas')

# pipeline = Pipeline(steps=[
#     ('preproc', CT),
#     ('estimator', SVR())
# ])

# model_target = TransformedTargetRegressor(
#     regressor=pipeline,
#     transformer=PowerTransformer(method='yeo-johnson')
# )

In [20]:
# model_target.fit(X_train, y_train)
# y_pred = model_target.predict(X_test)

# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error (MSE): {mse:.2f}")
# print(irq_mse(y_test, y_pred))

## PCA 

In [22]:
# from sklearn.decomposition import PCA


In [23]:
# cat_features = ['cut', 'color', 'clarity']
# axis_features = ['x', 'y', 'z', 'carat']
# num_features = ['depth', 'table']

# axis_transformer = Pipeline(steps=[
#     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
#     ('polynom', PolynomialFeatures(2, include_bias=False)),
#     ('scaler', RobustScaler())
# ])

# # num = Pipeline(steps=[
# #     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
# #     ('scaler', RobustScaler())
# # ])

# cat = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
# ])

# CT = ColumnTransformer([
#     ("axis_transformer", axis_transformer, axis_features),
#     # ("num", num, num_features),
#     ("cat", cat, cat_features)
# ]).set_output(transform='pandas')

# pipeline = Pipeline(steps=[
#     ('preproc', CT),                    
#     ('pca', PCA(n_components=33)),       
#     ('estimator', SVR())
# ])

# model_target = TransformedTargetRegressor(
#     regressor=pipeline,
#     transformer=PowerTransformer(method='yeo-johnson')
# )

In [24]:
# model_target.fit(X_train, y_train)
# y_pred = model_target.predict(X_test)

# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error (MSE): {mse:.2f}")
# print(irq_mse(y_test, y_pred))

**Нет: PCA метрику не улучшил**

## L1-based feature selection (Lasso)

In [27]:
# cat_features = ['cut', 'color', 'clarity']
# axis_features = ['x', 'y', 'z', 'carat']
# num_features = ['depth', 'table']

# axis_transformer = Pipeline(steps=[
#     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
#     ('polynom', PolynomialFeatures(2, include_bias=False)),
#     ('scaler', RobustScaler())
# ])

# # num = Pipeline(steps=[
# #     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
# #     ('scaler', RobustScaler())
# # ])

# cat = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
# ])

# CT = ColumnTransformer([
#     ("axis_transformer", axis_transformer, axis_features),
#     # ("num", num, num_features),
#     ("cat", cat, cat_features)
# ]).set_output(transform='pandas')

# pipeline = Pipeline(steps=[
#     ('preproc', CT),
#     ('feature_selection', SelectFromModel(Lasso(alpha=0.1))),  # выбираем значимые признаки с Lasso
#     ('estimator', SVR())
# ])

# model_target = TransformedTargetRegressor(
#     regressor=pipeline,
#     transformer=PowerTransformer(method='yeo-johnson')
# )

In [28]:
# model_target.fit(X_train, y_train)
# y_pred = model_target.predict(X_test)

# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error (MSE): {mse:.2f}")
# print(irq_mse(y_test, y_pred))

**Очень плохо: с отбором признаков через Lasso результат заметно хуже**

## Mutual info

In [31]:
# cat_features = ['cut', 'color', 'clarity']
# axis_features = ['x', 'y', 'z', 'carat']
# num_features = ['depth', 'table']

# axis_transformer = Pipeline(steps=[
#     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
#     ('polynom', PolynomialFeatures(3, include_bias=False)),
#     ('scaler', RobustScaler())
# ])

# # num = Pipeline(steps=[
# #     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
# #     ('scaler', RobustScaler())
# # ])

# cat = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
# ])

# CT = ColumnTransformer([
#     ("axis_transformer", axis_transformer, axis_features),
#     # ("num", num, num_features),
#     ("cat", cat, cat_features)
# ]).set_output(transform='pandas')

# pipeline = Pipeline(steps=[
#     ('preproc', CT),
#     ('feature_selection', SelectKBest(mutual_info_regression, k=33)),  
#     ('estimator', SVR())
# ])

# model_target = TransformedTargetRegressor(
#     regressor=pipeline,
#     transformer=PowerTransformer(method='yeo-johnson')
# )

In [32]:
# model_target.fit(X_train, y_train)
# y_pred = model_target.predict(X_test)

# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error (MSE): {mse:.2f}")
# print(irq_mse(y_test, y_pred))

**Здесь проверялась гипотеза: что если построить полиномиальные признаки 3-й степени и затем отобрать лучшие из них? \
Не сработало**

In [34]:
# sns.pairplot(df)

In [35]:
# Видим что нужно еще обработать table
# и x, y, z + carat 

In [36]:
# plt.figure(figsize=(6, 4))
# sns.scatterplot(x='carat', y='y', data=df)
# plt.show()

## Доработка

**1) Здесь была идея реализовать поиск похожих строк, выбивающихся из общего тренда совокупности, и далее принимать решение, какую фичу из пары нужно изменить, чтобы строка встала в общую совокупность. \
Однако времени уже нет. \
Возможно, стоило аккуратно построить модели, которые будут предсказывать адекватные значения x, y, z, carat для таких точек.**

**2) Провести детальный разбор ошибок модели**

In [40]:
# Repeated hold-out evaluation of the selected model: for each split seed the
# data is re-split, a fresh model is fitted, and both the plain MSE and the
# outlier-only irq_mse are recorded.

# Feature groups do not depend on the split, so they are defined once.
cat_features = ['cut', 'color', 'clarity']
axis_features = ['x', 'y', 'z', 'carat']
num_features = ['depth', 'table']  # only used by the disabled "num" branch below

lst_mse = []
lst_irq_mse = []
for i in range(50, 55):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=i)

    # Size/weight features: impute, add degree-2 polynomial terms, then scale.
    axis_transformer = Pipeline(steps=[
        ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
        ('polynom', PolynomialFeatures(2, include_bias=False)),
        ('scaler', RobustScaler())
    ])

    # Disabled on purpose: per the notes at the end of the notebook, the
    # metrics were worse with this branch enabled.
    # num = Pipeline(steps=[
    #     ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
    #     ('scaler', RobustScaler())
    # ])

    # Categorical features: fill gaps with the mode, then one-hot encode.
    cat = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
    ])

    CT = ColumnTransformer([
        ("axis_transformer", axis_transformer, axis_features),
        # ("num", num, num_features),
        ("cat", cat, cat_features)
    ]).set_output(transform='pandas')

    pipeline = Pipeline(steps=[
        ('preproc', CT),
        ('estimator', SVR())
    ])

    # The regressor is trained on a Yeo-Johnson-transformed target.
    model_target = TransformedTargetRegressor(
        regressor=pipeline,
        transformer=PowerTransformer(method='yeo-johnson')
    )

    model_target.fit(X_train, y_train)
    y_pred = model_target.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    lst_mse.append(mse)
    print(f"Mean Squared Error (MSE): {mse:.2f}")

    # Compute the outlier metric once and reuse it for both the log and the list.
    irq = irq_mse(y_test, y_pred)
    lst_irq_mse.append(irq)
    print(irq)

 



Mean Squared Error (MSE): 354617.86
-1938410.7911721717




Mean Squared Error (MSE): 379495.36
-2087163.9604078943




Mean Squared Error (MSE): 324109.58
-1740690.2367330913




Mean Squared Error (MSE): 378038.54
-2091621.372954791




Mean Squared Error (MSE): 346113.40
-1877837.265484561


In [41]:
# Average hold-out MSE over the split seeds collected in lst_mse.
np.mean(lst_mse)

356474.95066289336

In [42]:
# Average outlier-only score (irq_mse, non-positive) over the same split seeds.
np.mean(lst_irq_mse)

-1947144.7253505017

In [43]:
# Итоговая модель (с закомментированным num), то есть признаки num будут удалены:
# -1826684.715572456
# 331143.7409430638

# Без комментария у num:
# хуже на две сотки где-то