# Pipeline

In [179]:
import pandas as pd
import numpy as np
from scipy.special import boxcox1p
import joblib

In [180]:
# Read the input records that the rest of the notebook transforms and scores.
data = pd.read_csv(filepath_or_buffer='../../data/raw_data/new_data_csv.csv')

In [181]:
# The label column must not be part of the feature frame if the input file already has it.
if 'target' in data.columns:
    data = data.drop(columns=['target'])

# Rows are deliberately NOT de-duplicated: every input row needs a prediction,
# and the scoring cell writes the predictions back onto the full input file,
# so the number of rows has to be preserved.

data = data.drop('ID', axis=1)

data['sex'] = data['sex'].fillna('M')

data['VIP'] = data['VIP'].fillna('Обычный клиент')

# Rows cannot be removed, so -100 in delta_kbm (presumably a placeholder for an
# invalid value) is replaced with the column's most frequent value.
# The mode is taken over the remaining values only; otherwise -100 could be its
# own replacement whenever it is the most common entry.
kbm_mode = data.loc[data['delta_kbm'] != -100, 'delta_kbm'].mode()
if not kbm_mode.empty:
    # mode() is empty when no usable value exists; the column is then left as is.
    data['delta_kbm'] = data['delta_kbm'].replace(-100, kbm_mode.iloc[0])

In [182]:
# Stored Box-Cox lambdas: one row per feature, turned into a {feature: lambda} lookup.
lambda_df = pd.read_csv('../../data/data_after_EDA/lambda_values_box-cox.csv')
lambda_dict = lambda_df.set_index('feature')['lambda'].to_dict()

# Box-Cox transform (boxcox1p) of every numeric column that has a stored lambda;
# numeric columns without one are left untouched.
boxcox_cols = [
    col
    for col in data.select_dtypes(include=np.number).columns
    if col in lambda_dict
]
for col in boxcox_cols:
    data[col] = boxcox1p(data[col], lambda_dict[col])

In [183]:
# Encode categorical features with a smoothed target mean computed on the
# training file:
#   (count * mean + alpha * global_mean) / (count + alpha)
alpha = 1
raw_data = pd.read_csv('../../data/raw_data/full_train_data_csv.csv')
global_mean = raw_data['target'].mean()

# Every non-numeric column of the frame being scored is treated as categorical.
cat_cols = list(data.select_dtypes(exclude=np.number).columns)

# NOTE(review): raw_data is used exactly as read from disk, without the fills
# applied to `data` earlier in this notebook — confirm this matches how the
# encodings were computed at training time.
for col in cat_cols:
    category_stats = raw_data.groupby(col)['target'].agg(['count', 'mean'])
    weighted_total = category_stats['count'] * category_stats['mean'] + alpha * global_mean
    category_stats['smoothed_value'] = weighted_total / (category_stats['count'] + alpha)
    smoothing_map = category_stats['smoothed_value'].to_dict()
    # Categories absent from the training file fall back to the global target mean.
    data[col] = data[col].map(smoothing_map).fillna(global_mean)

In [184]:
# Normalise column names in one pass: strip spaces, turn '-' into '_', and
# append the '_encoded' suffix to every column (numeric ones included).
data.columns = [
    str(col).replace(' ', '').replace('-', '_') + '_encoded'
    for col in data.columns
]

In [185]:
# Build the engineered features.
# Each template has the form "<left>_<operation>_<right>", where <operation> is
# one of the keys of `operations` below.
feature_templates = [
    'married_encoded_addition_Filial_encoded',
    'deduct_encoded_addition_GAP_encoded',
    'passport_region_encoded_multiplication_price_prc',
    'Model_encoded_div_Age',
    'Filial_encoded_subtraction_product_encoded',
    'product_encoded_subtraction_Bank_encoded',
    'Yr_production_div_channel_map_encoded',
    'KBM_multiplication_Bank_encoded',
    'product_encoded_addition_passport_region_encoded',
    'married_encoded_div_Yr_production',
    'GAP_encoded_multiplication_deduct_encoded',
    'product_encoded_div_Bank_encoded',
    'Bank_encoded_div_KBM',
    'sex_encoded_multiplication_Model_encoded',
    'product_encoded_subtraction_category_encoded',
    'Bank_encoded_addition_category_encoded'
]

operations = {
    'addition': lambda a, b: a + b,
    'subtraction': lambda a, b: a - b,
    'multiplication': lambda a, b: a * b,
    # Zero divisors are turned into NaN so the quotient never becomes +/-inf;
    # every NaN in the result is then stored as 0.
    'div': lambda a, b: (a / b.replace(0, np.nan)).fillna(0),
}


def _resolve_column(name, columns):
    """Return `name` if it is in `columns`, else `name + '_encoded'` if that is; otherwise None."""
    for candidate in (name, f"{name}_encoded"):
        if candidate in columns:
            return candidate
    return None


for feature_name in feature_templates:
    # The first operation (in dict order) whose "_<op>_" token occurs in the name is used.
    op = next((name for name in operations if f"_{name}_" in feature_name), None)
    if op is None:
        continue

    # partition() splits on the first occurrence only, so a name that happens to
    # contain the token twice cannot break the two-way unpacking.
    left_part, _, right_part = feature_name.partition(f"_{op}_")

    left_col = _resolve_column(left_part, data.columns)
    right_col = _resolve_column(right_part, data.columns)

    if left_col is not None and right_col is not None:
        data[feature_name] = operations[op](data[left_col], data[right_col])

# Keep only the engineered features, in template order.
existing_features = [f for f in feature_templates if f in data.columns]
missing_features = [f for f in feature_templates if f not in data.columns]
if missing_features:
    # Make skipped features visible instead of dropping them silently.
    print(f"WARNING: could not build features (source column not found): {missing_features}")
data = data[existing_features]

In [186]:
# Score the engineered features with the persisted model.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model artifacts from a trusted location.
model = joblib.load('../../models/03_final_model/LogisticRegression.pkl')
predictions = model.predict(data)

# Attach the predictions to a fresh copy of the input file and export it.
new_data = pd.read_csv('../../data/raw_data/new_data_csv.csv')
# Assign by row label, not by position: `data` still carries the row labels of
# the input file, so each prediction lands on the row it was computed from.
# If an earlier step removed rows, those rows get NaN here instead of the
# assignment failing with a length-mismatch error.
new_data['target'] = pd.Series(predictions, index=data.index)
new_data.to_csv('../../data/new_data_preds/new_data_with_targets.csv', index=False)