# CFG

In [136]:
class CFG:
    """
    Configuration class holding constants and configurations for data preprocessing.

    Attributes:
        PATH (str): Path to the dataset file.
        TEST_SPLIT (float): Ratio for splitting the dataset into training and testing sets.
        TARGET (str): The name of the target variable in the dataset.
        SELECTED_COLUMNS (list of str): List of column names to be selected for processing.
    """
    SAVE_PATH = 'ver1' # path to save
    LOAD_PATH = 'ver1' # load from path  
    SAVE_METHOD =  'pickle'
    SEED = 42 
    PATH = 'csv/data.csv'
    TEST_SIZE = 0.2
    N_JOBS = -1
    CROSS_VALID = True
    N_SPLIT = 2
    BAGGING = True
    BAGGING_ESTIMATORS = 30
    TARGET = 'стоимость_за_кв_м_ob'
    SELECTED_COLUMNS = [
        'Кадастровый_номер_ob',
        'Общая_площадь_м2_ob', 'Жилая_площадь_м2_ob',
        'Этаж_ob',
        'Этажность_ob', 'Количество_комнат_ob', 'Год_постройки_ob',
        'Материал_стен_ob', 'Район_ob', 'Микрорайон_ob', 'Улица_ob', 'Дом_ob',
        'Квартира_ob', 'стоимость_за_кв_м_ob', 'дата_ob', "flat_1_cnt",
        "flat_2_cnt", "flat_3_cnt", "flat_4_cnt",
        "total_year_date_construction", "total_material_type",
        "total_heat_type", "total_has_electric", "total_floor_count",
        "total_hot_water_system", "total_heat_system", "total_district",
        "total_lift", 'to_metre', 'to_centre'
    ]
    
    int_columns = [
        'Этажность_ob',
        'Количество_комнат_ob',
        'Год_постройки_ob',
#         'Этаж_ob',
#         'Квартира_ob',  
    ]
    float_columns = [
        'total_year_date_construction',
        'стоимость_за_кв_м_ob',
        'Общая_площадь_м2_ob',
        'Жилая_площадь_м2_ob',
        'flat_1_cnt',
        'flat_2_cnt',
        'flat_3_cnt',
        'flat_4_cnt',
    #     'abaya',
        'to_metre',
        'to_centre',
    ]

# Set SEED

In [137]:
import numpy as np
import random
import os
# Preven randomnes
def seeding(SEED):
    np.random.seed(SEED)
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    print('seeding done!!!')
    
seeding(42)

seeding done!!!


# Preprocessing

## Preprocessing libs

In [138]:
from sklearn.preprocessing import PolynomialFeatures

# Mapper

# 1

In [139]:
from category_encoders import CatBoostEncoder

from sklearn.base import BaseEstimator, TransformerMixin

class GenerateFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self  # Nothing to do here

    def transform(self, X):
        df = X.copy()
        
        new_features = pd.DataFrame(index=df.index)
        numerical_cols = df.select_dtypes(include=['number']).columns
        
        for col in numerical_cols:
    #         new_features[f"{col}_global_mean"] = [df[col].mean()]* len(df)
    #         new_features[f"{col}_global_std"] = [df[col].std()]* len(df)
    #         new_features[f"{col}_global_median"] = [df[col].median()]* len(df)
    #         new_features[f"{col}_global_diff"] = [df[col].diff()]* len(df)
            # Basic statistics
            new_features[f'{col}_min'] = [df[col].min()] * len(df)
            new_features[f'{col}_max'] = [df[col].max()] * len(df)
            new_features[f'{col}_range'] = [df[col].max() - df[col].min()] * len(df)
            new_features[f'{col}_sum'] = [df[col].sum()] * len(df)
            new_features[f'{col}_var'] = [df[col].var()] * len(df)
            new_features[f'{col}_coef_var'] = [df[col].std() / df[col].mean() if df[col].mean() != 0 else 0] * len(df)
            new_features[f'{col}_skew'] = [df[col].skew()] * len(df)
            new_features[f'{col}_kurt'] = [df[col].kurt()] * len(df)
    
            # Quantiles
            new_features[f'{col}_25%'] = [df[col].quantile(0.25)] * len(df)
            new_features[f'{col}_50%'] = [df[col].quantile(0.5)] * len(df)
            new_features[f'{col}_75%'] = [df[col].quantile(0.75)] * len(df)
    
            # Cumulative statistics
            new_features[f'{col}_cumsum'] = df[col].cumsum()
            new_features[f'{col}_cummax'] = df[col].cummax()
            new_features[f'{col}_cummin'] = df[col].cummin()
    
            # Exponential Moving Average
            new_features[f'{col}_ema'] = df[col].ewm(span=10, adjust=False).mean()
    
        # Concatenate the new features with the original DataFrame
        df = pd.concat([df, new_features], axis=1)
        return X

In [140]:
from feature_engine.selection import DropHighPSIFeatures, DropDuplicateFeatures, DropCorrelatedFeatures, DropConstantFeatures

In [141]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from category_encoders import CatBoostEncoder
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from math import sqrt
from tqdm import tqdm

# CustomTransformer

In [142]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, roc_auc_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.ensemble import HistGradientBoostingRegressor
import numpy as np
import pandas as pd

class CustomTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.cont_cols = None
        self.target = None
        self.best_transformations = {}

    def fit(self, X, y=None):
        
        X_copy = X.copy()
        self.cont_cols = X_copy.columns
        self.target = y
        self.best_transformations = {}
        
        for col in self.cont_cols:
            best_auc = -1
            best_transformation = None

            # Log Transformation
            X_copy['log_' + col] = np.log1p(X_copy[col])
            auc_log = self._evaluate_transformation(X_copy[['log_' + col]], y)
            if auc_log > best_auc:
                best_auc = auc_log
                best_transformation = 'log'

            # Square Root Transformation
            X_copy['sqrt_' + col] = np.sqrt(X_copy[col])
            auc_sqrt = self._evaluate_transformation(X_copy[['sqrt_' + col]], y)
            if auc_sqrt > best_auc:
                best_auc = auc_sqrt
                best_transformation = 'sqrt'

            # Box-Cox Transformation
            X_copy['boxcox_' + col] = PowerTransformer(method='box-cox').fit_transform(abs(X_copy[[col]])+ 1e-5)
            auc_boxcox = self._evaluate_transformation(X_copy[['boxcox_' + col]], y)
            if auc_boxcox > best_auc:
                best_auc = auc_boxcox
                best_transformation = 'boxcox'

            self.best_transformations[col] = best_transformation
#             X[[col]] = X[self.best_transformations[col]]

        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        for col in self.cont_cols:
            if self.best_transformations[col] == 'log':
                X_transformed[col] = np.log1p(X_transformed[col])
            elif self.best_transformations[col] == 'sqrt':
                X_transformed[col] = np.sqrt(X_transformed[col])
            elif self.best_transformations[col] == 'boxcox':
                X_transformed[col] = PowerTransformer(method='box-cox').fit_transform(abs(X_transformed[[col]]) + 1e-5)
        mapper_pipe = self.mapper()
        X_transformed = mapper_pipe.fit_transform(X_transformed, y)
        return X_transformed

    def _evaluate_transformation(self, X, y):
        
        model = HistGradientBoostingRegressor(random_state=42, warm_start=True, max_iter=20)
        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        auc_scores = cross_val_score(model, X, y, cv=cv, scoring=make_scorer(mean_absolute_percentage_error))
#         print(auc_scores)

        return np.mean(auc_scores)
    def get_feature_names_out(self):
        pass
    def mapper(self):
        self.cat_imputer =  SimpleImputer(strategy='most_frequent')
        # Scale and encoding
        self.scaler = RobustScaler(quantile_range=(10.0, 90.0))
        self.encoder =  CatBoostEncoder(random_state = 42, drop_invariant=True)
        num_cols = make_column_selector(dtype_include=np.number)
        cat_cols = make_column_selector(dtype_include=object)
        categorical_imputer = Pipeline([
            ('Imputer', self.cat_imputer),
            ('Encoder', self.encoder)  # Adding encoding for categorical data
        ])
        imput = ColumnTransformer([
            ('categorical_imputer', categorical_imputer, cat_cols),
            
        ],
            remainder='passthrough' # remainder='passthrough' to keep columns not specified
        )  
    
        pipe = Pipeline([
            
            ('Imputer', imput),
            
            ('DropDuplicateFeatures', DropDuplicateFeatures()),
            ('DropConstantFeatures', DropConstantFeatures()),
            ('DropCorrelatedFeatures', DropCorrelatedFeatures(threshold=0.95)),
            ('scaler', self.scaler),  # Applies scaling to numerical features only
        ])

        pipe.set_output(transform="pandas")
        return pipe

In [143]:
# prep = Preprocessing(CFG)
# X, y = prep.forward()

In [144]:
# mapper = Mapper()

In [145]:
# mapper = mapper(X)

In [146]:
# df = mapper.fit_transform(X,y)

In [147]:
# trans = CustomTransformer()
# trans.fit_transform(df,y)


In [148]:
# trans.transform(df)

In [149]:
# trans.best_transformations

# IterativeImputer

In [150]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

class IterativeImputer(BaseEstimator, TransformerMixin):

    def __init__(self, max_iterations=3, model_params={}):
        self.max_iterations = max_iterations
        self.model_params = model_params
        self.features = None
        self.mapper_ = self.mapper()
        self.rows_miss = None
        self.error_minimize = None
        self.lgb_params = {
            'n_estimators': 100,
            'max_depth': 5,
            "num_leaves": 16,
            'learning_rate': 0.05,
            'subsample': 0.7,
            'colsample_bytree': 0.8,
            #'reg_alpha': 0.25,
            'reg_lambda': 5e-07,
            'objective': 'regression_l2',
            'metric': 'mean_squared_error',
#             'boosting_type': 'gbdt',
#             'random_state': 42,
        }
    def fit(self, X, y=None):
        """Fit the imputer to the data and identify features with missing values."""
        self.features = [f for f in X.columns if X[f].isna().sum() > 0]
#         print(self.features)
        return self

    def transform(self, X):
        X_temp=X.copy()
        """Impute missing values using iterative imputation."""
        
        missing_rows = self.store_missing_rows(X_temp, self.features)
        for f in self.features:
            X_temp[f]=X_temp[f].fillna(X_temp[f].mean())
        cat_features = [f for f in X_temp.columns if not pd.api.types.is_numeric_dtype(X_temp[f])]
        dictionary = {feature: [] for feature in self.features}
        if len(self.features)>0:
            for iteration in tqdm(range(self.max_iterations), desc="Iterations"):
    #             print(1)
                for feature in self.features:
            #                 # Skip features with no missing values
                    self.rows_miss =  missing_rows[feature].index
                    missing_temp = X_temp.loc[self.rows_miss].copy()
                    non_missing_temp = X_temp.drop(index=self.rows_miss).copy()
                    y_pred_prev=missing_temp[feature]
                    missing_temp = missing_temp.drop(columns=[feature])
                    # Step 3: Use the remaining features to predict missing values using Random Forests
                    X_train = non_missing_temp.drop(columns=[feature])
                    y_train = non_missing_temp[[feature]]
                    mapper_pipe = self.mapper_
                    
                    X_train = mapper_pipe.fit_transform(X_train, y_train)
                    model= lgb.LGBMRegressor(**self.lgb_params,random_state=42,boosting_type='dart')
                    model.fit(X_train, y_train)
                    # Step 4: Predict missing values for the feature and update all N features
                    y_pred = model.predict(mapper_pipe.transform(missing_temp))
                    X_temp.loc[self.rows_miss, feature] = y_pred
                    self.error_minimize=self.rmse(y_pred,y_pred_prev)
                    dictionary[feature].append(self.error_minimize)  # Append the error_minimize value
#                     print(self.error_minimize)
    #                 print(2)
            X_temp[self.features] = np.array(X_temp.iloc[:X_temp.shape[0]][self.features])
            X_temp = X_temp.drop(columns=cat_features)    
            return X_temp
        return X
    def store_missing_rows(self, df, features):
        """Function stores where missing values are located for given set of features."""
        missing_rows = {}

        for feature in features:
            missing_rows[feature] = df[df[feature].isnull()]
        
        return missing_rows
    def get_feature_names_out(self):
        pass
    def rmse(self, y1, y2):
        from sklearn.metrics import mean_squared_error
        """RMSE Evaluator"""
        return (np.sqrt(mean_squared_error(np.array(y1), np.array(y2))))

        
    def mapper(self):
        self.cat_imputer =  SimpleImputer(strategy='most_frequent')
        # Scale and encoding
        self.scaler = RobustScaler(quantile_range=(10.0, 90.0))
        self.encoder =  CatBoostEncoder(random_state = 42, drop_invariant=True)
        num_cols = make_column_selector(dtype_include=np.number)
        cat_cols = make_column_selector(dtype_include=object)
        categorical_imputer = Pipeline([
            ('Imputer', self.cat_imputer),
            ('Encoder', self.encoder)  # Adding encoding for categorical data
        ])
        imput = ColumnTransformer([
            ('categorical_imputer', categorical_imputer, cat_cols),
            
        ],
            remainder='passthrough' # remainder='passthrough' to keep columns not specified
        )  
    
        pipe = Pipeline([
            
            ('Imputer', imput),
            ('DropDuplicateFeatures', DropDuplicateFeatures()),
            ('DropConstantFeatures', DropConstantFeatures()),
            ('DropCorrelatedFeatures', DropCorrelatedFeatures(threshold=0.95)),
            ('scaler', self.scaler),  # Applies scaling to numerical features only
        ])

        pipe.set_output(transform="pandas")
        return pipe

# iter_imp=IterativeImputer()

# X, y = prep.forward()

# X_temp = iter_imp.fit_transform(X,y)




# mapper

In [151]:
from sklearn.compose import ColumnTransformer, make_column_selector
class Mapper:
    # Pipeline for transforming data
    # input X
    # out transformed X
    def __init__(self):
        # imputers
        self.iter_imputer = True
        self.num_imputer = IterativeImputer()
        self.cat_imputer =  SimpleImputer(strategy='most_frequent')
        # Scale and encoding
        self.scaler = RobustScaler(quantile_range=(10.0, 90.0))
        self.encoder =  CatBoostEncoder(random_state = 42, drop_invariant=True)
    def numerical_imputer(self, num_imputer):
        return Pipeline([('Imputer', num_imputer)])
    
    def categorical_imputer(self, cat_imputer):
        return Pipeline([('Imputer', cat_imputer)])
    

    
    def get_type(self, df):
        if self.iter_imputer is True:
            num_cols = make_column_selector()
        else:
            num_cols = make_column_selector(dtype_include=np.number)
            
        cat_cols = make_column_selector(dtype_include=object)

        return num_cols, cat_cols
    
    def pipeline(self, num_cols, cat_cols):
        
        numerical_imputer = self.numerical_imputer(self.num_imputer)
        categorical_imputer = Pipeline([
            ('Imputer', self.categorical_imputer(self.cat_imputer)),
            ('Encoder', self.encoder)  # Adding encoding for categorical data
        ])
        imput = ColumnTransformer([
            ('numerical_imputer', numerical_imputer, num_cols),
            ('categorical_imputer', categorical_imputer, cat_cols),
            
        ],
#             remainder='passthrough' # remainder='passthrough' to keep columns not specified
        )  
    
        pipe = Pipeline([
            ('Imputer', imput),
            ('DropDuplicateFeatures', DropDuplicateFeatures()),
            ('DropConstantFeatures', DropConstantFeatures()),
            ('DropCorrelatedFeatures', DropCorrelatedFeatures(threshold=0.95)),
            ('scaler', self.scaler),  # Applies scaling to numerical features only
        ])

        return pipe 
    def __call__(self, df):
        num_cols, cat_cols = self.get_type(df)
        pipe = self.pipeline(num_cols, cat_cols)
        pipe.set_output(transform="pandas")
        return pipe

In [152]:
import numpy as np
import pandas as pd
from typing import List
from collections import Counter

def detect_outliers(df: pd.DataFrame) -> List[int]:
    """
    Detects outliers in a DataFrame using the Interquartile Range (IQR) method.

    Outliers are determined based on the IQR, calculated as the difference
    between the 85th and 15th percentile of the data. An outlier is defined as
    a data point that lies outside the 1.5 * IQR range from the quartiles.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing numerical data.

    Returns
    -------
    multiple_outliers : List[int]
        List of index positions in the DataFrame where outliers occur more than 3 times.
    """
    # List to hold the indices of outliers
    outlier_indices = []

    # Identifying numerical features in the DataFrame
    numerical_features = df.select_dtypes(include=['int16', 'float16', 'int32', 'float32', 'int64', 'float64']).columns

    for c in numerical_features:
        # 1st quartile (15th percentile)
        Q1 = np.percentile(df[c], 15)
        # 3rd quartile (85th percentile)
        Q3 = np.percentile(df[c], 85)
        # Interquartile Range (IQR)
        IQR = Q3 - Q1
        # Outlier step
        outlier_step = IQR * 1.5

        # Detecting outliers and their indices
        outlier_list_col = df[(df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)].index

        # Storing indices of outliers
        outlier_indices.extend(outlier_list_col)

    # Counting occurrences of each index
    outlier_indices = Counter(outlier_indices)
    # Identifying indices with more than 3 occurrences as multiple outliers
    multiple_outliers = list(i for i, v in outlier_indices.items() if v > 3)

    return multiple_outliers


In [153]:
def outliers(X, y):
    
    Outliers_to_drop = detect_outliers(X)
    
    X = X.drop(Outliers_to_drop, axis=0)
    y = y.drop(Outliers_to_drop, axis=0)
    return X, y

In [154]:
# Split
def generate_comprehensive_features(df):
    """
    Generate comprehensive statistical features for all numerical columns in a DataFrame 
    and return the DataFrame with the new features included.

    :param df: The DataFrame with the original data.
    :return: The DataFrame with new statistical feature columns for each numerical column.
    """
    new_features = pd.DataFrame(index=df.index)
    numerical_cols = df.select_dtypes(include=['number']).columns
    
    for col in numerical_cols:
#         new_features[f"{col}_global_mean"] = [df[col].mean()]* len(df)
#         new_features[f"{col}_global_std"] = [df[col].std()]* len(df)
#         new_features[f"{col}_global_median"] = [df[col].median()]* len(df)
#         new_features[f"{col}_global_diff"] = [df[col].diff()]* len(df)
        # Basic statistics
        new_features[f'{col}_min'] = [df[col].min()] * len(df)
        new_features[f'{col}_max'] = [df[col].max()] * len(df)
        new_features[f'{col}_range'] = [df[col].max() - df[col].min()] * len(df)
        new_features[f'{col}_sum'] = [df[col].sum()] * len(df)
        new_features[f'{col}_var'] = [df[col].var()] * len(df)
        new_features[f'{col}_coef_var'] = [df[col].std() / df[col].mean() if df[col].mean() != 0 else 0] * len(df)
        new_features[f'{col}_skew'] = [df[col].skew()] * len(df)
        new_features[f'{col}_kurt'] = [df[col].kurt()] * len(df)

        # Quantiles
        new_features[f'{col}_25%'] = [df[col].quantile(0.25)] * len(df)
        new_features[f'{col}_50%'] = [df[col].quantile(0.5)] * len(df)
        new_features[f'{col}_75%'] = [df[col].quantile(0.75)] * len(df)

        # Cumulative statistics
        new_features[f'{col}_cumsum'] = df[col].cumsum()
        new_features[f'{col}_cummax'] = df[col].cummax()
        new_features[f'{col}_cummin'] = df[col].cummin()

        # Exponential Moving Average
        new_features[f'{col}_ema'] = df[col].ewm(span=10, adjust=False).mean()

    # Concatenate the new features with the original DataFrame
    df = pd.concat([df, new_features], axis=1)
    
    return df        

# Split

In [155]:
from typing import Tuple, List
from pandas import DataFrame
from sklearn.model_selection import train_test_split, KFold

class TrainTestValidSplit:
    """
    This class provides methods for splitting datasets into training, testing,
    and validation sets for machine learning models.

    Methods
    -------
    train_test_split(X, y, test_size, seed):
        Splits the dataset into training and validation sets.

    cross_valid(X, y, n_splits=5):
        Performs cross-validation and returns the split datasets.
    """

    def train_test_split(self, X: DataFrame, y: DataFrame, test_size: float, seed: int) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame]:
        """
        Splits the dataset into training and validation sets based on the specified test size and random seed.

        Parameters
        ----------
        X : DataFrame
            Feature dataset.
        y : DataFrame
            Target dataset.
        test_size : float
            Proportion of the dataset to include in the test split.
        seed : int
            Random seed for reproducibility.

        Returns
        -------
        X_train, X_val, y_train, y_val : Tuple[DataFrame, DataFrame, DataFrame, DataFrame]
            Training and validation sets of features and targets.
        """
        # Splitting the dataset into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size,
                                                          random_state=seed,
                                                          shuffle=True)
        return X_train, X_val, y_train, y_val

    def cross_valid(self, X: DataFrame, y: DataFrame, n_splits: int) -> List[Tuple[DataFrame, DataFrame, DataFrame, DataFrame]]:
        """
        Performs K-Fold cross-validation on the dataset.

        Parameters
        ----------
        X : DataFrame
            Feature dataset.
        y : DataFrame
            Target dataset.
        n_splits : int, optional
            Number of folds. Default is 5.

        Returns
        -------
        cross_valid : List[Tuple[DataFrame, DataFrame, DataFrame, DataFrame]]
            List of tuples containing the train-test split for each fold.
        """
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        cross_valid = []
        for train_index, test_index in kf.split(X):
            # Splitting the dataset for each fold
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            cross_valid.append((X_train, X_test, y_train, y_test))
        
        return cross_valid


In [156]:
import pandas as pd
from typing import List
from sklearn.impute import SimpleImputer, KNNImputer 
from sklearn.preprocessing import RobustScaler, OrdinalEncoder

class Preprocessing:
    """
    Class for preprocessing data for machine learning.

    Methods:
        read_data: Reads data from a CSV file.
        set_type: Converts all columns in the dataframe to numeric types.
        select_columns: Filters the dataframe to include only selected columns.
        forward: Runs the full preprocessing pipeline.
    """

    def __init__(self, CFG=None):
        """
        Initializes the Preprocessing class with configuration.

        Args:
            CFG: Configuration object containing settings like file path and selected columns.
        """
        self.path: str = CFG.PATH
        self.selected_columns = CFG.SELECTED_COLUMNS
        self.target = CFG.TARGET
        self.test_size = CFG.TEST_SIZE
        self.seed = CFG.SEED
        self.cross_valid = CFG.CROSS_VALID
        if self.cross_valid is True:
            self.n_splits = CFG.N_SPLIT

        
        # after optimize set_type remove
        self.int_columns = CFG.int_columns
        self.float_columns = CFG.float_columns
        
        self.split = TrainTestValidSplit()
        
    def read_data(self, path: str) -> pd.DataFrame:
        """
        Reads data from the given CSV file path.

        Args:
            path: The file path of the CSV data.

        Returns:
            DataFrame containing the read data.
        """
        try:
            return pd.read_csv(path, low_memory=False, dtype='str')
        except FileNotFoundError:
            raise FileNotFoundError(f"CSV file not found at the specified path: {path}")
        except IOError as e:
            raise IOError(f"Error occurred while reading the CSV file at {path}: {e}")
    
    def clinning(self, df):
        for column in df.columns:
            df[column] = df[column].astype(str).str.strip().str.replace(' ', '').copy()
            df[column] = df[column].astype(str).str.strip().str.replace(',', '.').copy()
        return df
    
    def find_type(self, df):
        for column in df.columns:
            df[column] = df[column].astype(str).str.strip().replace(' ', '').replace(',', '.')
        for column in self.int_columns:
            df[column] = df[column].astype(int)
        for column in self.float_columns:
            df[column] = df[column].astype(float)
        return df
    
    def select_columns(self, df: pd.DataFrame, selected_columns: List[str]) -> pd.DataFrame:
        """
        Filters the dataframe to include only selected columns.

        Args:
            df: The dataframe to be filtered.
            selected_columns: The list of column names to be included.

        Returns:
            Filtered DataFrame.
        """
        return df[selected_columns]
    
    
    def handle_mixed_types(self, df):
        for column in df.columns:
            if df[column].apply(type).nunique() > 1:
                df[column + '_num'] = pd.to_numeric(df[column], errors='coerce')
                df[column + '_cat'] = df[column].where(df[column + '_num'].isna())
        return df
    
    def get_X_y(self, df, target):
        
        df = df.drop('Кадастровый_номер_ob', axis=1).copy()
        X = df.drop(target, axis=1).copy()
        y = df[target].copy()
        return X, y
    
    def etl(self) -> pd.DataFrame:
        """
        Runs the full preprocessing pipeline.

        Returns:
            Preprocessed DataFrame ready for machine learning tasks.
        """
        df = self.read_data(self.path)
        df = self.select_columns(df, self.selected_columns)
        df = df.drop_duplicates(subset = ['Кадастровый_номер_ob', 'дата_ob', self.target])
        df = self.clinning(df)
        df = self.find_type(df)
        df = self.handle_mixed_types(df.copy())
        
        return df
    def forward(self) -> pd.DataFrame:
        df = self.etl()
        X, y = self.get_X_y(df, self.target)
        
        return X, y
    
        

In [157]:
# prep = Preprocessing(CFG, Mapper)

In [158]:
# prep.forward()

In [159]:
prep = Preprocessing(CFG)

# Model

In [160]:
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [161]:
from sklearn.isotonic import IsotonicRegression

In [162]:
from sklego.linear_model import LADRegression

In [163]:
from typing import Any
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, StackingRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor
from sklearn.linear_model import BayesianRidge
class Model:
    def __init__(self, CFG: Any) -> None:
        """
        Initialize the Model with configuration parameters.

        Parameters:
        CFG (Any): Configuration object containing settings like random seed and number of jobs.

        Attributes:
        random_state (int): Random state seed for reproducibility.
        n_jobs (int): Number of parallel jobs to run.
        tree (ExtraTreesRegressor): Extra Trees regressor model.
        grad (GradientBoostingRegressor): Gradient Boosting regressor model.
        xgb (XGBRegressor): XGBoost regressor model.
        lgbm (LGBMRegressor): LightGBM regressor model.
        hist (HistGradientBoostingRegressor): Histogram-based Gradient Boosting Regression Tree model.
        model (NoneType): Placeholder for the final model, to be defined later.
        pipeline (Pipeline): The pipeline that will contain preprocessing and the stacking regressor.
        """
        self.random_state: int = CFG.SEED
        self.n_jobs: int = CFG.N_JOBS
#         self.tree: ExtraTreesRegressor = HistGradientBoostingRegressor(random_state=self.random_state,warm_start=True, max_iter=50)
        self.tree: ExtraTreesRegressor = LADRegression()
        self.grad: GradientBoostingRegressor = GradientBoostingRegressor(random_state=self.random_state)
        self.xgb: XGBRegressor = XGBRegressor(random_state=self.random_state, n_jobs=self.n_jobs)
        self.lgbm: LGBMRegressor = LGBMRegressor(
            random_state=self.random_state,
            n_jobs=self.n_jobs,
            verbose=0,
            force_row_wise=True,
            objective='mae'
        )
        if CFG.BAGGING is True:
            self.hist: HistGradientBoostingRegressor = BaggingRegressor(estimator=HistGradientBoostingRegressor(random_state=self.random_state, warm_start=True, max_iter=50),
                                                                   random_state=CFG.SEED, n_jobs=CFG.N_JOBS, n_estimators=CFG.BAGGING_ESTIMATORS)
        else:
            self.hist: HistGradientBoostingRegressor = HistGradientBoostingRegressor(random_state=self.random_state, warm_start=True, max_iter=50)
        self.model = None
        self.initialize_model()

    def initialize_model(self) -> Pipeline:
        """
        Initialize the stacking regressor model.

        Returns:
        Pipeline: A pipeline object that includes the stacking regressor.
        """
        base_estimators = [
            ('ExtraTree', self.tree),  # Uncomment if you want to include ExtraTreesRegressor in your stacking
            ('GradientBoost', self.grad),
            ('XGBoost', self.xgb),
            ('LightGBM', self.lgbm),
        ]
        
        stack_reg: StackingRegressor = StackingRegressor(estimators=base_estimators, final_estimator=self.hist, n_jobs=self.n_jobs)
        self.pipeline: Pipeline = Pipeline([
            ('stacking_regressor', stack_reg)
        ])

        return self.pipeline

In [164]:
Preprocessing(CFG)
split = TrainTestValidSplit()
model = Model(CFG)

In [165]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnRenamer(BaseEstimator, TransformerMixin):
    def __init__(self, prefixes_to_remove):
        self.prefixes_to_remove = prefixes_to_remove

    def fit(self, X, y=None):
        return self  # Nothing to do here

    def transform(self, X):
        X = X.copy()
        for prefix in self.prefixes_to_remove:
            X.columns = [col.replace(prefix, '') for col in X.columns]
        return X

# Run

In [166]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, median_absolute_error
from sklearn.pipeline import Pipeline
from utils.TrainTestValidSplit import TrainTestValidSplit
from preprocessing.Preprocessing import Preprocessing
from utils.detect_outliers import outliers
from utils.ColumnRenamer import ColumnRenamer

import pickle
import json
import numpy as np
import pandas as pd
from typing import Optional


class Run:
    """
    A class for managing machine learning pipelines, including preprocessing, training,
    cross-validation, evaluation, and model serialization.

    Attributes
    ----------
    CFG : Any
        Configuration object containing settings for the run.
    mapper : Any
        Object responsible for mapping data transformations.
    model : Any
        Machine learning model to be used.

    Methods
    -------
    initialize_pipeline(X_train):
        Initializes the processing pipeline with training data.

    make_weight_tune():
        Performs weight tuning on the model.

    run():
        Executes the pipeline, including data splitting, preprocessing, training, and evaluation.

    metrics():
        Computes and prints various evaluation metrics.

    get_model():
        Returns the trained model pipeline.

    save_model():
        Saves the model to a file.

    load_model(path=None):
        Loads a model from a file.
    """
    def __init__(self, CFG, mapper, model) -> None:
        self.save_path = CFG.SAVE_PATH
        self.load_path = CFG.LOAD_PATH
        self.save_method = CFG.SAVE_METHOD
        self.test_size = CFG.TEST_SIZE
        self.seed = CFG.SEED
        self.target = CFG.TARGET
        self.cross_valid = CFG.CROSS_VALID

        if self.cross_valid is True:
            self.n_splits = CFG.N_SPLIT
        self.mapper = mapper
        self.split = TrainTestValidSplit()
        self.model = model

        self.prep = Preprocessing(CFG)
        self.column_renamer = ColumnRenamer(prefixes_to_remove=[
            'numerical_imputer__', 'categorical_imputer__'
        ])
        self.pipe = None
        self.df1 = pd.DataFrame()

    def initialize_pipeline(self, X_train) -> None:
        mapper = self.mapper(X_train)

        self.pipe = Pipeline([
            ('mapper', mapper),
            ('Column_renamer', self.column_renamer),
#             ('CustomTransformer',CustomTransformer()),
#             ('GenerateFeatures', GenerateFeatures()),
            ('model', self.model.pipeline)])
    def make_weight_tune(self, X_train, y_train, base_weight=0.01, lower=0.001, upper=0.06):
        """
        Performs weight tuning on the model.
        """
        self.pipe.fit(X_train, y_train)
        y_train_pred = self.pipe.predict(X_train)
        y_train_error = np.abs(y_train - y_train_pred)
        sample_weights = base_weight + ((lower < y_train_error) & (y_train_error < upper)).astype(float)

        # Correctly pass sample_weight to the specific model step
        fit_params = {'model__stacking_regressor__sample_weight': sample_weights}
        self.pipe.fit(X_train, y_train, **fit_params)



    def run(self) -> Optional[Pipeline]:
#         split (think about it maybe delete from here)
#         transform
#         feature selection and etc
#         model
        if self.cross_valid is False:
            X, y = self.prep.forward()

            X_train, X_val, y_train, y_val = self.split.train_test_split(
                X, y, self.test_size, self.seed)
            self.X_val, self.y_val = X_val, y_val
            X_train, y_train = outliers(X_train, y_train)
            # initialize pipeline
            self.initialize_pipeline(X_train)

            self.pipe.fit(X_train, y_train)

            return self.pipe
        else:
            X, y = self.prep.forward()
            X_train, X_val, y_train, y_val = self.split.train_test_split(
                    X, y, self.test_size, self.seed)

            self.X_val, self.y_val = X_val, y_val
            X_train, y_train = outliers(X_train, y_train)
            cross_valid_splits = self.split.cross_valid(
                X_train, y_train, self.n_splits)
            n_splits = len(cross_valid_splits)  # Number of splits
            # initialize pipeline
            self.initialize_pipeline(X_train)

#             score_without, score_with = 0, 0
            for X_train, X_test, y_train, y_test in cross_valid_splits:
                
#                 self.pipe.fit(X_train, y_train)
                # Performs weight tuning on the model.
                self.make_weight_tune(X_train, y_train)

    def metrics(self, model) -> None:
        y_pred = model.predict(self.X_val)
        print('MAE', mean_absolute_error(self.y_val, y_pred))
        print('MAPE', mean_absolute_percentage_error(self.y_val, y_pred))
        print('R2_score', r2_score(self.y_val, y_pred))
        print('mean', y_pred.mean())
        print('std', y_pred.std())

    def get_model(self) -> Pipeline:
        return self.pipe

    def save_model(self) -> None:
        if self.save_method == 'pickle':
            import pickle
            pickle.dump(self.pipe, open(f"saved_models/{self.save_path}.pickle", 'wb'))

    def load_model(self, path = None):
        if self.save_method == 'pickle':
            import pickle
            self.pipe = pickle.load(open(f"saved_models/{self.load_path}.pickle", 'rb'))
            print('model loaded')
            self.initialize_predict()

    def initialize_predict(self):
        df1 = pd.DataFrame()
        df = self.prep.etl()
        df1  = df[['Кадастровый_номер_ob', self.target]].copy()
        X, y = self.prep.get_X_y(df, self.target)
        df1[self.target] = self.pipe.predict(X)
        self.df1 = df1

    def for_api(self, kadastr,df=None):
        res_df = self.df1[self.df1['Кадастровый_номер_ob'] == kadastr]
        res = res_df.to_json(orient="records")
        parsed = json.loads(res)
        return parsed

In [167]:
import pickle

In [168]:
# import skops.io as sio
# obj = sio.dump(model, "saved_models/model.skops")

In [169]:
# k = pickle.load(open(f"saved_models/{CFG.LOAD_PATH}.pickle", 'rb'))

# Metrics

In [None]:
pipe = Run(CFG, Mapper(), Model(CFG))

pipe.run()
model = pipe.get_model()
pipe.metrics(model)

Iterations: 100%|██████████| 3/3 [00:07<00:00,  2.34s/it]
Iterations: 100%|██████████| 3/3 [00:07<00:00,  2.45s/it]
Iterations:  33%|███▎      | 1/3 [00:02<00:04,  2.05s/it]

In [None]:
pipe.metrics(model)

In [None]:
# MAE 55974.17974005252
# MAPE 0.10650260600282552
# R2_score 0.5877333226486814
# mean 543468.5212345918
# std 92553.38307449795

In [None]:
# MAE 57455.71822476894
# MAPE 0.10879462453450428
# R2_score 0.5540224099434927
# mean 542924.7683698069
# std 90117.77702643277

In [None]:
# MAE 58559.68338508163
# MAPE 0.11048791721865418
# R2_score 0.5394781775327182
# mean 543792.2710455992
# std 89703.40279469808

In [None]:
# df = pipe.for_api()

In [None]:
# df['Кадастровый_номер_ob']

In [None]:
pipe.save_model()

In [None]:
pipe.load_model()

In [None]:
def for_api(self, df=None):
    
    df1 = pd.DataFrame()
    df = self.prep.etl()
    df1  = df[['Кадастровый_номер_ob', self.target]].copy()
    df = self.prep.etl()
    X, y = self.prep.get_X_y(df, self.target)
    df1[self.target] = self.pipe.predict(X)
    return df


In [None]:
model = pipe.get_model()

In [None]:
model

In [None]:

import pickle
# # let's save the model
# model_path = "example.pkl"
# local_repo = "my-awesome-model"
# with open(model_path, mode="bw") as f:
#     pickle.dump(pipe, file=f)
# # we will now initialize a local repository
# hub_utils.init(
#     model=model_path,
#     requirements=[f"scikit-learn={sklearn.__version__}"],
#     dst=local_repo,
#     task="tabular-classification",
#     data=X_test,
# )

In [None]:
X, y = prep.forward()

In [None]:
# X = X.drop_duplicates(subset=['дата_ob', ])

In [None]:
def metrics(X,y,model) -> None:
        y_pred = model.predict(X)
        print('MAE', mean_absolute_error(y, y_pred))
        print('MAPE', mean_absolute_percentage_error(y, y_pred))
        print('R2_score', r2_score(y, y_pred))
        print('mean', y_pred.mean())
        print('std', y_pred.std())

In [None]:
metrics(X,y,model)

In [None]:
# MAE 47959.73187672039
# MAPE 0.09088530212124311
# R2_score 0.6566025139314876
# mean 541942.2512662262
# std 90440.25246178386

# Result