In [2]:
import pandas as pd
import numpy as np
import pickle
import json
import config

In [3]:
# Read the raw train/test tables from the locations listed in the project
# config, then report their shapes as a quick sanity check.
paths_cfg = config.CONFIG['paths']
train = pd.read_csv(paths_cfg['train'])
test = pd.read_csv(paths_cfg['test'])
for frame in (train, test):
    print(frame.shape)

(1460, 81)
(1459, 80)


In [4]:
# Collect every preprocessing option from the project config in one place.
# The id and target column names are mandatory; the rest fall back to defaults.
prep_cfg = config.CONFIG['preprocessing']
id_col = prep_cfg['id_column']
target_col = prep_cfg['target_column']
transform_target = prep_cfg.get('target_transform')
encode_method = prep_cfg.get('encode_method', 'onehot')
scale_numeric = prep_cfg.get('scale_numeric', True)
num_impute = prep_cfg.get('numeric_imputer', 'median')
cat_impute = prep_cfg.get('categorical_imputer', 'most_frequent')

# Split the train columns by dtype; the id and the target are not features,
# so they are excluded from the numeric list.
service_cols = (id_col, target_col)
numeric_cols = [
    name
    for name in train.select_dtypes(include=[np.number]).columns
    if name not in service_cols
]
categorical_cols = list(train.select_dtypes(include=['object']).columns)

for caption, cols in (('Числовые колонки', numeric_cols), ('Категориальных', categorical_cols)):
    print(caption, len(cols))

Числовые колонки 36
Категориальных 43


Логарифмируем целевую переменную (log1p)

In [5]:
# Work on copies so the raw frames stay untouched and this cell can be re-run.
train_imp, test_imp = train.copy(), test.copy()

# Apply log(1 + y) to the target only when the config asks for it and the
# target column is actually present in the training frame.
wants_log1p = transform_target == 'log1p'
if wants_log1p and target_col in train_imp.columns:
    train_imp[target_col] = np.log1p(train_imp[target_col])

Заполняем пропуски

In [8]:
# Fill gaps in numeric columns with a statistic taken from the TRAIN column
# (median when num_impute == 'median', otherwise the mean). Train is handled
# first, so the value used for test comes from the already-filled train column.
for num_col in numeric_cols:
    for frame in (train_imp, test_imp):
        if num_col not in frame.columns:
            continue
        reference = train_imp[num_col]
        if num_impute == 'median':
            fill_stat = reference.median()
        else:
            fill_stat = reference.mean()
        frame[num_col] = frame[num_col].fillna(fill_stat)

In [9]:
# Fill gaps in categorical columns with the most frequent TRAIN value, falling
# back to 'Unknown' when the train column has no mode at all.
# NOTE(review): the `cat_impute` setting is not consulted here; the mode is
# always used regardless of its value.
for cat_col in categorical_cols:
    for frame in (train_imp, test_imp):
        if cat_col not in frame.columns:
            continue
        train_modes = train_imp[cat_col].mode()
        replacement = 'Unknown' if len(train_modes) == 0 else train_modes[0]
        frame[cat_col] = frame[cat_col].fillna(replacement)

In [10]:
# Total count of remaining missing cells in each imputed frame.
for caption, frame in (('Пропуски в train:', train_imp), ('Пропуски в test:', test_imp)):
    print(caption, frame.isnull().sum().sum())

Пропуски в train: 0
Пропуски в test: 0


Кодируем категориальные фичи

In [12]:
from sklearn.preprocessing import LabelEncoder

# Always restart from the imputed frames so re-running this cell never
# encodes already-encoded data.
encoders = {}
train_enc = train_imp.copy()
test_enc = test_imp.copy()

if encode_method == 'label':
    for col in categorical_cols:
        if col not in train_enc.columns:
            continue

        train_labels = train_enc[col].astype(str)
        test_labels = test_enc[col].astype(str)

        # The encoder is fitted on the union of train and test labels so that
        # every value seen in either frame receives a code.
        le = LabelEncoder()
        combined = pd.concat([train_labels, test_labels], ignore_index=True)
        le.fit(combined.unique())

        # Build the label -> code table once per column. The previous version
        # called le.transform() separately for every test row, which is very
        # slow; a dict lookup yields the same codes (position in le.classes_).
        code_by_label = {label: code for code, label in enumerate(le.classes_)}

        train_enc[col] = le.transform(train_labels)
        # -1 marks a label the encoder does not know (kept as a safety net).
        test_enc[col] = test_labels.map(lambda label: code_by_label.get(label, -1))
        encoders[col] = le

In [13]:
# One-hot encode the categorical columns of both frames, then force the test
# frame to carry exactly the train columns (minus the target): dummies missing
# from test are added filled with 0, dummies absent from train are dropped.
if encode_method == 'onehot':
    dummy_options = {'columns': categorical_cols, 'dtype': float}
    train_enc = pd.get_dummies(train_imp, **dummy_options)
    test_enc = pd.get_dummies(test_imp, **dummy_options)

    desired_cols = []
    for name in train_enc.columns:
        if name != target_col:
            desired_cols.append(name)
    test_enc = test_enc.reindex(columns=desired_cols, fill_value=0)

In [15]:
# Everything in the encoded train frame except the id and the target is a feature.
non_feature_cols = (id_col, target_col)
feature_cols = [name for name in train_enc.columns if name not in non_feature_cols]
print('Признаков после кодирования:', len(feature_cols))

Признаков после кодирования: 287


Масштабирование признаков

In [16]:
from sklearn.preprocessing import StandardScaler

# Only the original numeric columns that survived encoding are standardised.
scale_cols = [name for name in numeric_cols if name in train_enc.columns]
scaler_params = {}

if scale_numeric and scale_cols:
    # Fit on train only, then apply the same transform to test.
    scaler = StandardScaler()
    train_enc[scale_cols] = scaler.fit_transform(train_enc[scale_cols])
    test_enc[scale_cols] = scaler.transform(test_enc[scale_cols])

    # Record per-column mean and standard deviation (square root of the fitted
    # variance) as plain floats so they can be serialised later.
    scaler_params = {
        name: {'mean': float(mean_value), 'std': float(np.sqrt(variance))}
        for name, mean_value, variance in zip(scale_cols, scaler.mean_, scaler.var_)
    }

In [17]:
# Final frames to be written to disk: copies of the encoded/scaled data.
df_train_out = train_enc.copy()
df_test_out = test_enc.copy()

# The test frame must not carry the target column. The keyword was previously
# misspelled ("colunbs"), which raises TypeError as soon as this branch runs.
if target_col in df_test_out.columns:
    df_test_out = df_test_out.drop(columns=[target_col], errors='ignore')

In [18]:
# Destinations for the preprocessed tables come from the project config.
paths_cfg = config.CONFIG['paths']
preprocessed_path = paths_cfg['train_preprocessed']
test_preprocessed_path = paths_cfg['test_preprocessed']

# Write both frames first (without the pandas index), then report what was saved.
outputs = (
    ('Train сохранён:', preprocessed_path, df_train_out),
    ('Test сохранён:', test_preprocessed_path, df_test_out),
)
for _, destination, frame in outputs:
    frame.to_csv(destination, index=False)
for caption, destination, frame in outputs:
    print(caption, destination, frame.shape)

Train сохранён: C:\newTry2\classicMLpractice\ProjectKaggle\HousePrices\checkpoints\train_preprocessed.csv (1460, 289)
Test сохранён: C:\newTry2\classicMLpractice\ProjectKaggle\HousePrices\checkpoints\test_preprocessed.csv (1459, 288)


In [19]:
# Describe how the data was preprocessed so later stages can reproduce or
# invert the steps. Built from plain Python containers so it can be dumped
# to JSON as well as pickled.

# The options that drove this run, exactly as read from the config.
preprocessing_config = {
    'target_transform': transform_target,
    'encode_method': encode_method,
    'scale_numeric': scale_numeric,
    'numeric_imputer': num_impute,
    'categorical_imputer': cat_impute,
}

# Shapes before/after and the column typing used throughout the notebook.
dataset_info = {
    'original_shape': [*train.shape],
    'preprocessed_shape': [*df_train_out.shape],
    'numeric_columns': numeric_cols,
    'categorical_columns': categorical_cols,
}

# Encoder names are listed only when label encoding was the chosen method.
encoded_columns = list(encoders) if encode_method == 'label' else []

preprocessing_metadata = {
    'encoders': encoded_columns,
    'scaler_params': scaler_params,
    'final_columns': list(df_train_out.columns),
    'target_column': target_col,
    'id_column': id_col,
    'preprocessing_config': preprocessing_config,
    'dataset_info': dataset_info,
}

In [20]:
# Persist the metadata twice: human-readable JSON (non-ASCII kept as-is) and
# a pickle of the same dictionary.
paths_cfg = config.CONFIG['paths']

with open(paths_cfg['metadata_json'], 'w', encoding='utf-8') as json_file:
    json.dump(preprocessing_metadata, json_file, indent=2, ensure_ascii=False)

with open(paths_cfg['metadata_pickle'], 'wb') as pickle_file:
    pickle.dump(preprocessing_metadata, pickle_file)

# The fitted encoders go to their own pickle inside the checkpoint directory.
# checkpoint_dir is joined with `/`, so it must support that operator
# (e.g. a pathlib.Path).
encoders_path = paths_cfg['checkpoint_dir'] / 'encoders.pkl'
with open(encoders_path, 'wb') as encoders_file:
    pickle.dump(encoders, encoders_file)

print('Метаданные и энкодеры сохранены в', paths_cfg['checkpoint_dir'])

Метаданные и энкодеры сохранены в C:\newTry2\classicMLpractice\ProjectKaggle\HousePrices\checkpoints
