# итоговая моделька

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

from sklearn.experimental import enable_iterative_imputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (MinMaxScaler, StandardScaler, RobustScaler, Normalizer, PolynomialFeatures, 
                                   OneHotEncoder, OrdinalEncoder, LabelEncoder, QuantileTransformer, PowerTransformer)
from sklearn.model_selection import (train_test_split, cross_val_score, GridSearchCV, KFold, StratifiedKFold, 
                                     StratifiedShuffleSplit, RepeatedStratifiedKFold, validation_curve, 
                                     cross_validate, RandomizedSearchCV)
from sklearn.linear_model import (LogisticRegression, Ridge, Lasso, ElasticNet, BayesianRidge, LinearRegression)
from sklearn.neighbors import (KNeighborsClassifier, KNeighborsRegressor)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
from sklearn.impute import (SimpleImputer, IterativeImputer, KNNImputer)
from sklearn.compose import (ColumnTransformer, make_column_transformer, TransformedTargetRegressor)
from sklearn.metrics import (accuracy_score, precision_score, recall_score, make_scorer, f1_score, 
                             classification_report, confusion_matrix, roc_curve, precision_recall_curve, 
                             RocCurveDisplay, PrecisionRecallDisplay, ConfusionMatrixDisplay, 
                             balanced_accuracy_score, average_precision_score, mean_absolute_percentage_error, 
                             mean_squared_error, r2_score)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.feature_selection import (SelectKBest, f_regression)
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.feature_selection import SelectFromModel


In [3]:
# Location of the training data, relative to the notebook's working directory.
path_train = 'train.csv'

# Read the whole training table into memory.
df = pd.read_csv(filepath_or_buffer=path_train)

In [4]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [5]:
# Treat non-positive values in these columns as missing (NaN).
for column in ('x', 'y', 'z', 'carat', 'depth', 'table'):
    invalid = df[column] <= 0
    df.loc[invalid, column] = np.nan

In [6]:
# Axis values above 20 are divided by 10.
for axis in ('x', 'y', 'z'):
    oversized = df[axis] > 20
    df.loc[oversized, axis] = df.loc[oversized, axis] / 10

# Recompute depth from the (possibly rescaled) axes: 2*z / (x + y) as a
# percentage, rounded to one decimal place.
df['depth'] = (2 * df['z'] / (df['x'] + df['y']) * 100).round(1)

In [7]:
def irq_mse(y_true, y_pred):
    """Negated mean squared error over the IQR-outlier residuals only.

    Residuals ``y_pred - y_true`` that fall outside the fences
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are selected, and the negated mean of
    their squares is returned, so a larger (closer to zero) value is better.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted targets, compared element by element in
        positional order.

    Returns
    -------
    float
        ``-mean(residual**2)`` over the outlying residuals, or ``0.0`` when
        no residual lies outside the fences.
    """
    # Work on flat NumPy arrays: subtracting two pandas objects aligns them on
    # index labels, which produces NaNs (and a silent score of 0) when the
    # indexes differ, and plain Python lists do not support subtraction.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    delta = predicted - actual

    quant_25, quant_75 = np.quantile(delta, [0.25, 0.75])
    iqr = quant_75 - quant_25
    is_outlier = (delta < quant_25 - 1.5 * iqr) | (delta > quant_75 + 1.5 * iqr)

    # No residual outside the fences: nothing to penalise.
    if not is_outlier.any():
        return 0.0
    return -float(np.mean(delta[is_outlier] ** 2))

# Wrap the metric as a scikit-learn scorer. irq_mse already returns a negated
# error, so the scorer's default sign convention is left unchanged.
score_irq_mse = make_scorer(score_func=irq_mse)
score_irq_mse

make_scorer(irq_mse, response_method='predict')

In [8]:
# Separate the target column from the predictors.
y = df['price']
X = df.drop(columns='price')

# Hold out 33% of the rows for evaluation; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=43,
)

In [9]:
# Column groups consumed by the two preprocessing branches. Columns that are
# not listed here are not passed to the estimator (ColumnTransformer's default
# remainder='drop').
cat_features = ['cut', 'color', 'clarity']
axis_features = ['x', 'y', 'z', 'carat']

# Numeric branch: impute missing values, expand to degree-2 polynomial terms,
# then scale.
axis_transformer = Pipeline(
    steps=[
        ('missing_num', IterativeImputer(missing_values=np.nan, max_iter=20)),
        ('polynom', PolynomialFeatures(degree=2, include_bias=False)),
        ('scaler', RobustScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ]
)

# Route each column group to its branch and emit pandas DataFrames.
CT = ColumnTransformer(
    transformers=[
        ("axis_transformer", axis_transformer, axis_features),
        ("cat", cat, cat_features),
    ]
).set_output(transform='pandas')

# Preprocessing followed by a support vector regressor with default settings.
pipeline = Pipeline(steps=[('preproc', CT), ('estimator', SVR())])

# Fit on a Yeo-Johnson-transformed target; predictions are mapped back to the
# original scale by TransformedTargetRegressor.
model_target = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=PowerTransformer(method='yeo-johnson'),
)
model_target.fit(X_train, y_train)




In [10]:
# Predict on the hold-out split.
y_pred = model_target.predict(X_test)

# Report the plain MSE, then the custom outlier-only metric.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(irq_mse(y_test, y_pred))

Mean Squared Error (MSE): 319059.73
-1754903.4020179112


In [11]:
# Read the test table from the notebook's working directory.
df_test = pd.read_csv(filepath_or_buffer='test.csv')

In [12]:
# Replace invalid (non-positive) values with NaN
for column in ('x', 'y', 'z', 'carat', 'depth', 'table'):
    invalid = df_test[column] <= 0
    df_test.loc[invalid, column] = np.nan

# Correct values that are too large
for axis in ('x', 'y', 'z'):
    oversized = df_test[axis] > 20
    df_test.loc[oversized, axis] = df_test.loc[oversized, axis] / 10

# Recompute depth
df_test['depth'] = (2 * df_test['z'] / (df_test['x'] + df_test['y']) * 100).round(1)


In [13]:
# sns.pairplot(df_test)