In [500]:
import sys
from pathlib import Path

# Get the folder where this notebook is running
notebook_path = Path().resolve()

# Assume 'src' is at the project root, one level up from the notebook folder
project_root = notebook_path.parent

# Construct the full path to src
src_path = project_root / "src"

# Add src to sys.path if not already there
if str(src_path) not in sys.path:
    sys.path.insert(1, str(src_path))

print(f"Added {src_path} to sys.path")

Added /home/rodolfo/Documents/Rodolfo/Python_Projects/projetos_sem_espacos/churn-powerco/powerco-churn/src to sys.path


In [501]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import matplotlib.gridspec as gridspec
import numpy as np
from scipy.stats import linregress
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split
from sklearn import set_config
from functools import reduce
import warnings
from datetime import datetime
import logging

from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

from powerco_churn.EDA.basic_data_wrangling import basic_wrangling
from powerco_churn.EDA.univariate_statistics import univariate_statistics, plot_histograms_countplots
from powerco_churn.EDA.bivariate_statistics import bivariate_stats
from powerco_churn.EDA.visualizing_bivariate_statistics import generate_bar_plot
from powerco_churn.EDA.outliers import calculate_outlier_threshold
from powerco_churn.EDA.skewness import correct_skew
from powerco_churn.EDA.date_utils import parse_and_format_dates
from powerco_churn.utils.logger_utils import configure_logging


# Configuration

In [502]:
# plot will similiar to ggplot
plt.style.use('ggplot')

random_seed = 42

#set output of skllearn to be a pandas dataframe
set_config(transform_output = "pandas")


# Load Data

Two dataframes:

- client_data containing information about the clients (train and test)
- price_data containing the prices of power and energy during 2015

In [503]:
#train data
client_data_train = pd.read_csv('../data/raw/train/train_client_data.csv')

#test data
client_data_test  = pd.read_csv('../data/raw/test/test_client_data.csv')


In [504]:
#train data
price_data_train = pd.read_csv('../data/raw/train/train_price_data.csv')

#test data    
price_data_test  = pd.read_csv('../data/raw/test/test_price_data.csv')

In [505]:
raw_client_data_train = client_data_train.copy()
raw_client_data_test = client_data_test.copy()

raw_price_data_train = price_data_train.copy()
raw_price_data_test = price_data_test.copy()

#  Feature Engineering

## Client Data

### Date Features

- There is already a feature that that indicates how long the client has been with the energy company;
- Creating a new feature that indicates the length of the active contract;
- How long will it take for the contract to end?
- How long since the last modification?

The date features will be used to create three new features and then will be removed

In [506]:
date_features = ['date_activ', 'date_end', 'date_modif_prod', 'date_renewal']

In [507]:
for feature in date_features:
    client_data_train[feature] = client_data_train[feature].apply(parse_and_format_dates)
    client_data_test[feature] = client_data_test[feature].apply(parse_and_format_dates)

In [508]:
client_data_train['contract_length'] = (pd.to_datetime(client_data_train['date_end'])
                                    - pd.to_datetime(client_data_train['date_activ'])).dt.days

client_data_test['contract_length'] = (pd.to_datetime(client_data_test['date_end'])
                                    - pd.to_datetime(client_data_test['date_activ'])).dt.days


# a random date selected to be more recent than the last date in the dataset
reference_date = pd.to_datetime('2020-01-01')

client_data_train['days_until_end'] = (reference_date - pd.to_datetime(client_data_train['date_end'])).dt.days
client_data_train['days_since_modification'] = (reference_date - pd.to_datetime(client_data_train['date_modif_prod'])).dt.days

client_data_test['days_until_end'] = (reference_date - pd.to_datetime(client_data_test['date_end'])).dt.days
client_data_test['days_since_modification'] = (reference_date - pd.to_datetime(client_data_test['date_modif_prod'])).dt.days

In [509]:
client_data_train.drop(columns = date_features, inplace = True)
client_data_test.drop(columns = date_features, inplace = True)

## Price Data

Creating new features from the time series of the prices in the price_data dataframe. These new features are aggregations of the prices charged the clients during 2015

- mean prices;
- last price;
- the difference between the first and the last prices;
- the slope of the linear regression attached to the prices;
- standard deviation of the prices;

In [510]:
numeric_columns_price = price_data_train.select_dtypes(include='number').columns.to_list()
numeric_columns_price

['price_off_peak_var',
 'price_peak_var',
 'price_mid_peak_var',
 'price_off_peak_fix',
 'price_peak_fix',
 'price_mid_peak_fix']

### Mean Prices

Mean prices for each client

In [511]:
mean_price_train = price_data_train.groupby('id')[numeric_columns_price].mean().reset_index()
mean_price_test  = price_data_test.groupby('id')[numeric_columns_price].mean().reset_index()


mean_price_columns = ['id'] + [f'mean_{col}' for col in mean_price_train.columns[1:]]
mean_price_train.columns = mean_price_columns
mean_price_test.columns = mean_price_columns



Creating four new features:

- Mean of the energy price in the three periods (off-peak, mid-peak and peak);
- Mean of the potency price in the three periods;
- Difference between the energy prices in the peak period and off-peak periods;
- Difference between the potency price in the peak and off-peak periods.


In [512]:
def create_new_energy_potency_mean_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create new features based on energy and potency prices.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame.

    Returns
    -------
    pd.DataFrame
        DataFrame with new features.
    """

    energy_features  = [feature for feature in df.columns if 'var' in feature]
    potency_features = [feature for feature in df.columns if 'fix' in feature]

    df['mean_energy_price']  = df[energy_features].mean(axis = 1)
    df['mean_potency_price'] = df[potency_features].mean(axis = 1)

    df['energy_peak_minus_offpeak']  = df['mean_price_peak_var'] - df['mean_price_off_peak_var']
    df['potency_peak_minus_offpeak'] = df['mean_price_peak_fix'] - df['mean_price_off_peak_fix']

    return df



In [513]:
mean_price_train  = create_new_energy_potency_mean_features(mean_price_train)
mean_price_test   = create_new_energy_potency_mean_features(mean_price_test)

In [514]:
mean_price_train.head()

Unnamed: 0,id,mean_price_off_peak_var,mean_price_peak_var,mean_price_mid_peak_var,mean_price_off_peak_fix,mean_price_peak_fix,mean_price_mid_peak_fix,mean_energy_price,mean_potency_price,energy_peak_minus_offpeak,potency_peak_minus_offpeak
0,0002203ffbb812588b632b9e628cc38d,0.124338,0.103794,0.07316,40.701732,24.421038,16.280694,0.100431,27.134488,-0.020545,-16.280694
1,0010bcc39e42b3c2131ed2ce55246e3c,0.181558,0.0,0.0,45.31971,0.0,0.0,0.060519,15.10657,-0.181558,-45.31971
2,00114d74e963e47177db89bc70108537,0.147926,0.0,0.0,44.26693,0.0,0.0,0.049309,14.755643,-0.147926,-44.26693
3,00184e957277eeef733a7b563fdabd06,0.147637,0.0,0.0,44.26693,0.0,0.0,0.049212,14.755643,-0.147637,-44.26693
4,001987ed9dbdab4efa274a9c7233e1f4,0.122756,0.10229,0.07303,40.647427,24.388455,16.258972,0.099359,27.098285,-0.020465,-16.258972


### Last Prices

In [515]:
def create_last_price_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create new features based on the last values of the time series.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame.

    Returns
    -------
    pd.DataFrame
        DataFrame with new features.
    """    

    last_price = df.loc[df.groupby('id')['price_date'].idxmax()]
    last_price = last_price.reset_index(drop=True)
    last_price = last_price.drop(columns = ['price_date'])

    last_price.columns = ['id'] + [f'last_{col}' for col in last_price.columns[1:]] 

    return last_price

In [516]:
last_price_train = create_last_price_features(price_data_train)
last_price_test  = create_last_price_features(price_data_test)

### Difference between first and last prices

In [517]:
def calculating_difference_last_first_prices(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create  new features that represents the difference between the last and first prices.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame.

    Returns
    -------
    pd.DataFrame
        DataFrame with new feature.
    """

    last = df.loc[df.groupby('id')['price_date'].idxmax()].set_index('id', drop = True)
    last = last.drop(columns = ['price_date'])
    first = df.loc[df.groupby('id')['price_date'].idxmin()].set_index('id', drop = True)
    first = first.drop(columns = ['price_date'])

    difference = last - first
    difference = difference.reset_index(drop = False)
    difference.columns = ['id'] + [f'difference_{col}' for col in difference.columns[1:]]


    return difference

In [518]:
difference_last_first_prices_train  = calculating_difference_last_first_prices(price_data_train)
difference_last_first_prices_test  = calculating_difference_last_first_prices(price_data_test)


### Slope of the linear regression

In [519]:
def calculating_slopes_linear_regression(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create  new features that represents the slope of the lienar regressin
    adjusted to the prices

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame.

    Returns
    -------
    pd.DataFrame
        DataFrame with new feature.
    """
    numeric_columns_price = [feature for feature in df.columns if pd.api.types.is_numeric_dtype(df[feature])]

    agg_dict = {
        f'{col}_slope' : (col, lambda x: linregress(np.arange(len(x)), x).slope)
        for col in numeric_columns_price
    }

    slopes_regression = df.groupby('id').agg(**agg_dict)
    slopes_regression = slopes_regression.reset_index()
    slopes_regression.head()

    return slopes_regression

In [520]:
slope_prices_train = calculating_slopes_linear_regression(price_data_train)
slopes_prices_test = calculating_slopes_linear_regression(price_data_test)

### Standard Deviation

In [521]:
std_price_train = price_data_train.groupby('id')[numeric_columns_price].std()
std_price_test  = price_data_test.groupby('id')[numeric_columns_price].std()


std_price_train = std_price_train.reset_index()
std_price_test  = std_price_test.reset_index()

std_price_train.columns = ['id'] + [f'std_{col}' for col in std_price_train.columns[1:]]
std_price_test.columns = ['id'] + [f'std_{col}' for col in std_price_test.columns[1:]]  


### Concatenating All Dataframes

In [522]:
df_list_train = [client_data_train, mean_price_train, last_price_train, difference_last_first_prices_train, slope_prices_train, std_price_train]
df_list_test  = [client_data_test, mean_price_test, last_price_test, difference_last_first_prices_test, slopes_prices_test, std_price_test]

In [523]:
client_price_train = reduce(lambda left, right: pd.merge(left, right, on='id', how='inner'), df_list_train)
client_price_test  = reduce(lambda left, right: pd.merge(left, right, on='id', how='inner'), df_list_test)


In [524]:
y_train = client_data_train['churn']
x_train = client_price_train.drop(columns = ['churn'])

y_test = client_data_test['churn']
x_test = client_price_test.drop(columns = ['churn'])

del(client_data_train, client_data_test, client_price_train, client_price_test)

## Data Preprocessing

In [525]:
univariate_stats_train_data = univariate_statistics(x_train)
univariate_stats_train_data


Unnamed: 0_level_0,type,count,missing,unique,mode,min_value,q_1,median,q_3,max_value,mean,std,skew,kurtosis
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
id,object,11684,0,11684,0002203ffbb812588b632b9e628cc38d,-,-,-,-,-,-,-,-,-
channel_sales,object,11684,0,8,foosdfpfkusacimwkcsosbicdxkicaua,-,-,-,-,-,-,-,-,-
cons_12m,int64,11684,0,9219,0,0,5609.0,14139.0,41121.5,6207104,159429.231,571752.758,6.029,43.573
cons_gas_12m,int64,11684,0,1716,0,0,0.0,0.0,0.0,4154590,27843.623,163052.758,10.009,139.96
cons_last_month,int64,11684,0,4225,0,0,0.0,813.0,3435.0,771203,16149.339,64088.349,6.378,47.921
forecast_cons_12m,float64,11684,0,11222,0.0,0.0,488.34,1099.43,2404.1825,82902.83,1868.707,2419.608,7.775,169.078
forecast_cons_year,int64,11684,0,3739,0,0,0.0,321.0,1769.0,175375,1426.479,3414.982,17.364,665.216
forecast_discount_energy,float64,11684,0,12,0.0,0.0,0.0,0.0,0.0,30.0,0.957,5.075,5.179,25.116
forecast_meter_rent_12m,float64,11684,0,3145,0.0,0.0,16.18,18.725,131.02,599.31,63.06,65.964,1.438,3.802
forecast_price_energy_off_peak,float64,11684,0,479,0.145711,0.0,0.11634,0.143166,0.146348,0.273963,0.137,0.025,-0.171,8.281


### Missing Values

In [526]:
#Trere are not missing values
x_train.isna().sum()

id                                0
channel_sales                     0
cons_12m                          0
cons_gas_12m                      0
cons_last_month                   0
forecast_cons_12m                 0
forecast_cons_year                0
forecast_discount_energy          0
forecast_meter_rent_12m           0
forecast_price_energy_off_peak    0
forecast_price_energy_peak        0
forecast_price_pow_off_peak       0
has_gas                           0
imp_cons                          0
margin_gross_pow_ele              0
margin_net_pow_ele                0
nb_prod_act                       0
net_margin                        0
num_years_antig                   0
origin_up                         0
pow_max                           0
contract_length                   0
days_until_end                    0
days_since_modification           0
mean_price_off_peak_var           0
mean_price_peak_var               0
mean_price_mid_peak_var           0
mean_price_off_peak_fix     

### Basic Wrangling

In [527]:
x_train = basic_wrangling(x_train)
x_test  = basic_wrangling(x_test)

2025-06-12 12:50:12,269 - INFO - Starting data wrangling...
2025-06-12 12:50:12,279 - INFO - Feature id removed. The proportion of unique values in the feature of type object is 100.00%
2025-06-12 12:50:12,304 - INFO - Starting data wrangling...
2025-06-12 12:50:12,308 - INFO - Feature id removed. The proportion of unique values in the feature of type object is 100.00%


### Outliers

In [528]:
feature_outliers_dict = {}

for feature in x_train.columns:
    if pd.api.types.is_numeric_dtype(x_train[feature]):
        min_thresh, max_thresh = calculate_outlier_threshold(x_train, feature)
        feature_outliers_dict[feature] = {'max_thresh': max_thresh, 'min_thresh': min_thresh}
        

In [529]:
def replace_outliers(df: pd.DataFrame, features_thresholds: dict, verbose = False) -> pd.DataFrame:
    """
    Replace outliers in a dataframe based on a dictionary of feature thresholds.

    Args:
        df (pd.DataFrame): a pandas DataFrame
        features_thresholds (dict): a dictionary of feature thresholds.
            For each feature, the dictionary contains the maximum and minimum thresholds for outlier detection.
        verbose (bool, optional): Whether to print verbose information. Defaults to False.

    Returns:
        pd.DataFrame: _
    """
    df = df.copy()
    for feature in features_thresholds:
        if verbose:
            num_outliers = (df[feature] > features_thresholds[feature]['max_thresh']).sum()
            num_outliers += (df[feature] < features_thresholds[feature]['min_thresh']).sum()
            print(f'Number of outliers in {feature}: {num_outliers}')  
        df[feature] = df[feature].mask(df[feature] > features_thresholds[feature]['max_thresh'], 
                                        features_thresholds[feature]['max_thresh'])
        df[feature] = df[feature].mask(df[feature] < features_thresholds[feature]['min_thresh'], 
                                        features_thresholds[feature]['min_thresh'])
        
    return df



In [530]:
#Number of outliers in each feature
num_outliers = {}
for feature in feature_outliers_dict:
    n_out = (x_train[feature] > feature_outliers_dict[feature]['max_thresh']).sum()
    n_out += (x_train[feature] < feature_outliers_dict[feature]['min_thresh']).sum()
    num_outliers[feature] = n_out
num_outliers

{'cons_12m': np.int64(1646),
 'cons_gas_12m': np.int64(2095),
 'cons_last_month': np.int64(1641),
 'forecast_cons_12m': np.int64(840),
 'forecast_cons_year': np.int64(1034),
 'forecast_discount_energy': np.int64(407),
 'forecast_meter_rent_12m': np.int64(124),
 'forecast_price_energy_off_peak': np.int64(179),
 'forecast_price_energy_peak': np.int64(0),
 'forecast_price_pow_off_peak': np.int64(289),
 'imp_cons': np.int64(974),
 'margin_gross_pow_ele': np.int64(532),
 'margin_net_pow_ele': np.int64(532),
 'nb_prod_act': np.int64(2546),
 'net_margin': np.int64(810),
 'num_years_antig': np.int64(309),
 'pow_max': np.int64(1243),
 'contract_length': np.int64(296),
 'days_until_end': np.int64(0),
 'days_since_modification': np.int64(112),
 'mean_price_off_peak_var': np.int64(167),
 'mean_price_peak_var': np.int64(0),
 'mean_price_mid_peak_var': np.int64(0),
 'mean_price_off_peak_fix': np.int64(311),
 'mean_price_peak_fix': np.int64(0),
 'mean_price_mid_peak_fix': np.int64(0),
 'mean_energy_p

In [531]:
x_train_no_outliers = replace_outliers(x_train, feature_outliers_dict, verbose = True)
x_test_no_outliers  = replace_outliers(x_test, feature_outliers_dict, verbose = False)

Number of outliers in cons_12m: 1646
Number of outliers in cons_gas_12m: 2095
Number of outliers in cons_last_month: 1641
Number of outliers in forecast_cons_12m: 840
Number of outliers in forecast_cons_year: 1034
Number of outliers in forecast_discount_energy: 407
Number of outliers in forecast_meter_rent_12m: 124
Number of outliers in forecast_price_energy_off_peak: 179
Number of outliers in forecast_price_energy_peak: 0
Number of outliers in forecast_price_pow_off_peak: 289
Number of outliers in imp_cons: 974
Number of outliers in margin_gross_pow_ele: 532
Number of outliers in margin_net_pow_ele: 532
Number of outliers in nb_prod_act: 2546
Number of outliers in net_margin: 810
Number of outliers in num_years_antig: 309
Number of outliers in pow_max: 1243
Number of outliers in contract_length: 296
Number of outliers in days_until_end: 0
Number of outliers in days_since_modification: 112
Number of outliers in mean_price_off_peak_var: 167
Number of outliers in mean_price_peak_var: 0
N

In [532]:
#Plot comparisson
#Plotting the original distribution and afeter the removal of the outliers


def plot_comparisson_raw_cleaned(df_raw, df_cleaned, features_list):
    """
    Plot subplots of the original distribution and after the application
    of some tyoe of transformation. The raw feature and the clened feature
    will be plotted side by side in a subplot

    Args:
        df_raw (pd.DataFrame): dataframe with raw data
        df_cleaned (pd.DataFramwe): dataframe with cleaned data
        features_list (list): list of features to plot
    """
    for feature in features_list:
        fig, axs = plt.subplots(nrows = 1, ncols = 2, figsize = (15, 5))
        sns.histplot(df_raw[feature], ax = axs[0], stat = 'frequency')
        axs[0].set_title(f'Raw {feature}')
        sns.histplot(df_cleaned[feature], ax = axs[1], stat = 'frequency')
        axs[1].set_title(f'Cleaned {feature}')

In [533]:
#feature_outliers_list = [feature for feature in num_outliers.keys() if num_outliers[feature] > 0] #list of features with at least 1 outlier
#plot_comparisson_raw_cleaned(x_train, x_train_no_outliers, feature_outliers_list)

In [534]:
# The removal of outliers resulted in only one unique value in three features:
#cons_gas_12m
#forecast_discount_energy
#np_prod_act
# Therefore the outlier treatment will not be applied to these features
del feature_outliers_dict['cons_gas_12m']
del feature_outliers_dict['forecast_discount_energy']
del feature_outliers_dict['nb_prod_act']
feature_outliers_dict

{'cons_12m': {'max_thresh': np.float64(94390.25),
  'min_thresh': np.float64(-47659.75)},
 'cons_last_month': {'max_thresh': np.float64(8587.5),
  'min_thresh': np.float64(-5152.5)},
 'forecast_cons_12m': {'max_thresh': np.float64(5277.94625),
  'min_thresh': np.float64(-2385.42375)},
 'forecast_cons_year': {'max_thresh': np.float64(4422.5),
  'min_thresh': np.float64(-2653.5)},
 'forecast_meter_rent_12m': {'max_thresh': np.float64(303.28),
  'min_thresh': np.float64(-156.07999999999998)},
 'forecast_price_energy_off_peak': {'max_thresh': np.float64(0.21129141503340987),
  'min_thresh': np.float64(0.06323930348764449)},
 'forecast_price_energy_peak': {'max_thresh': np.float64(0.1975548186640039),
  'min_thresh': np.float64(-0.09625389637711587)},
 'forecast_price_pow_off_peak': {'max_thresh': np.float64(49.8683934),
  'min_thresh': np.float64(35.04968556)},
 'imp_cons': {'max_thresh': np.float64(489.9875),
  'min_thresh': np.float64(-293.9925)},
 'margin_gross_pow_ele': {'max_thresh': 

In [535]:
x_train_no_outliers = replace_outliers(x_train, feature_outliers_dict, verbose = True)
x_test_no_outliers  = replace_outliers(x_test, feature_outliers_dict, verbose = False)

Number of outliers in cons_12m: 1646
Number of outliers in cons_last_month: 1641
Number of outliers in forecast_cons_12m: 840
Number of outliers in forecast_cons_year: 1034
Number of outliers in forecast_meter_rent_12m: 124
Number of outliers in forecast_price_energy_off_peak: 179
Number of outliers in forecast_price_energy_peak: 0
Number of outliers in forecast_price_pow_off_peak: 289


Number of outliers in imp_cons: 974
Number of outliers in margin_gross_pow_ele: 532
Number of outliers in margin_net_pow_ele: 532
Number of outliers in net_margin: 810
Number of outliers in num_years_antig: 309
Number of outliers in pow_max: 1243
Number of outliers in contract_length: 296
Number of outliers in days_until_end: 0
Number of outliers in days_since_modification: 112
Number of outliers in mean_price_off_peak_var: 167
Number of outliers in mean_price_peak_var: 0
Number of outliers in mean_price_mid_peak_var: 0
Number of outliers in mean_price_off_peak_fix: 311
Number of outliers in mean_price_peak_fix: 0
Number of outliers in mean_price_mid_peak_fix: 0
Number of outliers in mean_energy_price: 17
Number of outliers in mean_potency_price: 84
Number of outliers in energy_peak_minus_offpeak: 14
Number of outliers in potency_peak_minus_offpeak: 0
Number of outliers in last_price_off_peak_var: 193
Number of outliers in last_price_peak_var: 0
Number of outliers in last_price_mid_pea

In [536]:
#plot_comparisson_raw_cleaned(x_train, x_train_no_outliers, feature_outliers_dict)

### Skewness


In [537]:
univariate_stats_train_data_no_outliers = univariate_statistics(x_train_no_outliers)
univariate_stats_train_data_no_outliers['skew'] = univariate_stats_train_data_no_outliers['skew'].mask(univariate_stats_train_data_no_outliers['skew'] == '-', np.nan)
univariate_stats_train_data_no_outliers.sort_values(by = 'skew', ascending = False)

Unnamed: 0_level_0,type,count,missing,unique,mode,min_value,q_1,median,q_3,max_value,mean,std,skew,kurtosis
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cons_gas_12m,int64,11684,0,1716,0,0,0.0,0.0,0.0,4154590,27843.623,163052.758,10.009,139.96
nb_prod_act,int64,11684,0,10,1,1,1.0,1.0,1.0,32,1.293,0.721,9.622,299.854
forecast_discount_energy,float64,11684,0,12,0.0,0.0,0.0,0.0,0.0,30.0,0.957,5.075,5.179,25.116
difference_price_peak_var,float64,11684,0,977,0.0,-0.034,-0.002904,0.0,0.0,0.03209,-0.001,0.006,2.134,23.732
difference_price_mid_peak_fix,float64,11684,0,10,0.0,-0.098,0.0,0.0,0.065166,0.162916,0.021,0.04,1.65,3.72
difference_price_peak_fix,float64,11684,0,12,0.0,-0.147,0.0,0.0,0.097749,0.244373,0.031,0.06,1.615,3.507
std_price_mid_peak_fix,float64,11684,0,96,0.0,0.0,0.0,0.0,0.025366,0.063415,0.011,0.018,1.579,1.483
std_price_peak_fix,float64,11684,0,99,0.0,0.0,0.0,0.0,0.038049,0.095124,0.016,0.028,1.553,1.379
std_price_mid_peak_var,float64,11684,0,1188,0.0,0.0,0.0,0.0,0.000845,0.002113,0.0,0.001,1.27,0.175
forecast_cons_year,float64,11684,0,2788,0.0,0.0,0.0,321.0,1769.0,4422.5,1091.727,1475.321,1.239,0.154


In [538]:
#The transformations will be applied to features with skew > 0.5 or skew < -0.5
skewed_features = univariate_stats_train_data_no_outliers.loc[abs(univariate_stats_train_data_no_outliers['skew']) > 0.5].index
skewed_features

Index(['cons_12m', 'cons_gas_12m', 'cons_last_month', 'forecast_cons_12m',
       'forecast_cons_year', 'forecast_discount_energy',
       'forecast_meter_rent_12m', 'imp_cons', 'margin_gross_pow_ele',
       'margin_net_pow_ele', 'nb_prod_act', 'net_margin', 'num_years_antig',
       'pow_max', 'contract_length', 'days_since_modification',
       'mean_price_peak_fix', 'mean_price_mid_peak_fix',
       'last_price_mid_peak_var', 'last_price_peak_fix',
       'last_price_mid_peak_fix', 'difference_price_peak_var',
       'difference_price_mid_peak_var', 'difference_price_off_peak_fix',
       'difference_price_peak_fix', 'difference_price_mid_peak_fix',
       'price_peak_var_slope', 'price_mid_peak_var_slope',
       'price_off_peak_fix_slope', 'price_peak_fix_slope',
       'price_mid_peak_fix_slope', 'std_price_off_peak_var',
       'std_price_peak_var', 'std_price_mid_peak_var',
       'std_price_off_peak_fix', 'std_price_peak_fix',
       'std_price_mid_peak_fix'],
      dtype='ob

In [539]:
#Identify the transformation that reduces the skewness the most
applied_transformations = {}
for feature in skewed_features:
    x_train_cleaned, _, applied_transformations[feature] = correct_skew(x_train_no_outliers, feature)


2025-06-12 12:50:13,080 - INFO - Removed 0 rows with NaN in 'cons_12m
2025-06-12 12:50:13,088 - INFO - Subsampled to 5000 rows from 11684
2025-06-12 12:50:13,090 - INFO - Initial Skewness: 1.1713
2025-06-12 12:50:13,130 - INFO - Best transformation: yeo
2025-06-12 12:50:13,131 - INFO - Skew after transformation: -0.02061988763443861
2025-06-12 12:50:13,131 - INFO - Transformation successful
2025-06-12 12:50:13,160 - INFO - Removed 0 rows with NaN in 'cons_gas_12m
2025-06-12 12:50:13,163 - INFO - Subsampled to 5000 rows from 11684
2025-06-12 12:50:13,163 - INFO - Initial Skewness: 10.0092
2025-06-12 12:50:13,194 - INFO - Best transformation: yeo
2025-06-12 12:50:13,194 - INFO - Skew after transformation: 1.7100335464388428
2025-06-12 12:50:13,195 - INFO - Transformation failed. Converting to binary
2025-06-12 12:50:13,195 - INFO - Could not reduce skewness below threshold. Converting 'cons_gas_12m' to binary.
2025-06-12 12:50:13,203 - INFO - Removed 0 rows with NaN in 'cons_last_month
2

In [540]:
applied_transformations

{'cons_12m': 'yeo',
 'cons_gas_12m': 'Converted to binary',
 'cons_last_month': '0.287',
 'forecast_cons_12m': 'yeo',
 'forecast_cons_year': '0.2235',
 'forecast_discount_energy': 'Converted to binary',
 'forecast_meter_rent_12m': 'yeo',
 'imp_cons': '0.2513',
 'margin_gross_pow_ele': 'yeo',
 'margin_net_pow_ele': 'yeo',
 'nb_prod_act': 'Converted to binary',
 'net_margin': 'yeo',
 'num_years_antig': '0.126',
 'pow_max': 'yeo',
 'contract_length': '0.1439',
 'days_since_modification': 'yeo',
 'mean_price_peak_fix': 'Converted to binary',
 'mean_price_mid_peak_fix': 'Converted to binary',
 'last_price_mid_peak_var': 'Converted to binary',
 'last_price_peak_fix': 'Converted to binary',
 'last_price_mid_peak_fix': 'Converted to binary',
 'difference_price_peak_var': 'Converted to binary',
 'difference_price_mid_peak_var': 'yeo',
 'difference_price_off_peak_fix': 'yeo',
 'difference_price_peak_fix': 'Converted to binary',
 'difference_price_mid_peak_fix': 'Converted to binary',
 'price_pea

In [541]:
# if feature has negative values and the transformation is raising to 1/n, apply Yeo-Johnson
for feature in applied_transformations:
    if applied_transformations[feature] != 'Converted to binary' and applied_transformations[feature] != 'yeo':
        if x_train_no_outliers[feature].min() < 0:
            applied_transformations[feature] = 'yeo'


In [542]:
#Features to be transformed using the Yeo-Johnson transformation.
features_yeo = []
for feature in applied_transformations:
    if applied_transformations[feature] == 'yeo':
        features_yeo.append(feature)
features_yeo


['cons_12m',
 'forecast_cons_12m',
 'forecast_meter_rent_12m',
 'margin_gross_pow_ele',
 'margin_net_pow_ele',
 'net_margin',
 'pow_max',
 'days_since_modification',
 'difference_price_mid_peak_var',
 'difference_price_off_peak_fix',
 'price_peak_var_slope',
 'price_mid_peak_var_slope',
 'price_off_peak_fix_slope',
 'price_peak_fix_slope',
 'price_mid_peak_fix_slope']

In [543]:
#Features that will be converted to binary
features_convert_binary = []
for feature in applied_transformations:
    if applied_transformations[feature] == 'Converted to binary':
        features_convert_binary.append(feature)
features_convert_binary

['cons_gas_12m',
 'forecast_discount_energy',
 'nb_prod_act',
 'mean_price_peak_fix',
 'mean_price_mid_peak_fix',
 'last_price_mid_peak_var',
 'last_price_peak_fix',
 'last_price_mid_peak_fix',
 'difference_price_peak_var',
 'difference_price_peak_fix',
 'difference_price_mid_peak_fix',
 'std_price_mid_peak_var',
 'std_price_peak_fix',
 'std_price_mid_peak_fix']

In [544]:
#features that will undergo a power transformation
features_power_transformation_dict = {}
for feature in applied_transformations:
    if applied_transformations[feature] != 'Converted to binary' and applied_transformations[feature] != 'yeo':
        features_power_transformation_dict[feature] = applied_transformations[feature]
features_power_transformation_list = list(features_power_transformation_dict.keys())
features_power_transformation_dict


{'cons_last_month': '0.287',
 'forecast_cons_year': '0.2235',
 'imp_cons': '0.2513',
 'num_years_antig': '0.126',
 'contract_length': '0.1439',
 'std_price_off_peak_var': '0.5001',
 'std_price_peak_var': '0.3345',
 'std_price_off_peak_fix': '0.6645'}

In [545]:
class FeaturePowerTransformer(BaseEstimator, TransformerMixin):

    """
    Applies feature-wise power transformations to specified columns.

    Each feature in the input DataFrame is raised to the power specified
    in the `power_dict`. This is useful for reducing skewness or applying
    manual nonlinear transformations based on prior optimization.

    Parameters
    ----------
    power_dict : dict
        A dictionary mapping feature names (str) to power values (float).
        Each feature in the dictionary will be transformed as:
        X[feature] = X[feature] ** power.

    Attributes
    ----------
    power_dict_ : dict
        Copy of the power dictionary used for transformation.

    Examples
    --------
    >>> power_dict = {'feature1': 0.5, 'feature2': -1}
    >>> transformer = FeaturePowerTransformer(power_dict=power_dict)
    >>> X_transformed = transformer.fit_transform(X)
    """
        
    
    
    def __init__(self, power_dict):
        self.power_dict = power_dict

    def fit(self, X, y=None):
        """No fitting necessary; returns self."""
        return self

    def transform(self, X):
        """
        Apply the power transformation to the specified features.

        Parameters
        ----------
        X : pandas.DataFrame
            Input DataFrame containing the features to transform.

        Returns
        -------
        X_transformed : pandas.DataFrame
            Transformed DataFrame with specified features raised to their powers.
        """
        X_transformed = X.copy()
        for feature, power in self.power_dict.items():
            power = float(power)          
            X_transformed[feature] = np.power(X_transformed[feature], power)
        return X_transformed

In [546]:
class convert_to_binary(BaseEstimator, TransformerMixin):

    """
    Converts multiple features into binary format based on their skewness.

    For each feature:
    - If skew > 0: values are 1 if != min, else 0
    - If skew <= 0: values are 1 if == max, else 0

    Parameters
    ----------
    features : list of str
        List of feature names to transform.

    Attributes
    ----------
    min_values_ : dict
        Minimum values for each feature.

    max_values_ : dict
        Maximum values for each feature.
    """

    def __init__(self, features):
        self.features = features

    def fit(self, X, y=None):
        
        if not isinstance(X, pd.DataFrame):
           raise ValueError("X must be a pandas DataFrame.")
        missing_features = [feature for feature in self.features if feature not in X.columns]
        if len(missing_features) > 0:
            raise ValueError(f"Missing features: {', '.join(missing_features)}")

        self.min_values_ = {feature: X[feature].min() for feature in self.features}
        self.max_values_ = {feature: X[feature].max() for feature in self.features}
        return self
        

    def transform(self, X):
        check_is_fitted(self) # looks for learned attributes (with trailing _)
        X_transformed = X.copy()
        for f in self.features:
            skew = X_transformed[f].skew()
            if skew > 0 :
                X_transformed[f] = (X_transformed[f] != self.min_values_[f]).astype(int)
            else:
                X_transformed[f] = (X_transformed[f] != self.max_values_[f]).astype(int)

        return X_transformed

       

In [547]:
skew_transformers = ColumnTransformer(
    transformers=[
    ('yeo', PowerTransformer(method = 'yeo-johnson'), features_yeo),
    ('convert_binary', convert_to_binary(features_convert_binary), features_convert_binary),
    ('power', FeaturePowerTransformer(features_power_transformation_dict), features_power_transformation_list)],
    remainder='passthrough',
    verbose_feature_names_out = False
)

In [548]:
# Apply transformer

x_train_cleaned = skew_transformers.fit_transform(x_train_no_outliers)
x_test_cleaned = skew_transformers.transform(x_test_no_outliers)


In [549]:
'''
plot_comparisson_raw_cleaned(x_train_no_outliers, x_train_cleaned, 
                        features_yeo + features_power_transformation_list)
'''

'\nplot_comparisson_raw_cleaned(x_train_no_outliers, x_train_cleaned, \n                        features_yeo + features_power_transformation_list)\n'

In [550]:
univariate_stats_train_data_skew_correction = univariate_statistics(x_train_cleaned)
univariate_stats_train_data_skew_correction['skew'] = univariate_stats_train_data_skew_correction['skew'].mask(univariate_stats_train_data_skew_correction['skew'] == '-', np.nan)
univariate_stats_train_data_skew_correction.sort_values(by = 'skew', ascending = False)

Unnamed: 0_level_0,type,count,missing,unique,mode,min_value,q_1,median,q_3,max_value,mean,std,skew,kurtosis
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
forecast_discount_energy,int64,11684,0,2,0.0,0.0,0.0,0.0,0.0,1.0,0.035,0.183,5.074,23.754
cons_gas_12m,int64,11684,0,2,0.0,0.0,0.0,0.0,0.0,1.0,0.179,0.384,1.672,0.796
nb_prod_act,int64,11684,0,2,0.0,0.0,0.0,0.0,0.0,1.0,0.218,0.413,1.367,-0.132
std_price_peak_fix,int64,11684,0,2,0.0,0.0,0.0,0.0,1.0,1.0,0.336,0.472,0.694,-1.519
std_price_mid_peak_fix,int64,11684,0,2,0.0,0.0,0.0,0.0,1.0,1.0,0.336,0.472,0.694,-1.519
mean_price_mid_peak_var,float64,11684,0,1633,0.0,0.0,0.0,0.0,0.072783,0.102951,0.028,0.036,0.495,-1.722
last_price_mid_peak_fix,int64,11684,0,2,0.0,0.0,0.0,0.0,1.0,1.0,0.38,0.485,0.494,-1.757
last_price_peak_fix,int64,11684,0,2,0.0,0.0,0.0,0.0,1.0,1.0,0.38,0.485,0.494,-1.757
potency_peak_minus_offpeak,float64,11684,0,659,-16.291555,-59.286,-44.35582,-44.26693,-16.275263,0.0,-33.483,13.81,0.491,-1.616
last_price_mid_peak_var,int64,11684,0,2,0.0,0.0,0.0,0.0,1.0,1.0,0.381,0.486,0.49,-1.761


### Scaling

In [551]:
scaler = StandardScaler()


In [552]:
scaled_features = x_train_cleaned.select_dtypes(include = ['number']).columns.tolist()
scaled_features = [feature for feature in scaled_features if feature not in features_convert_binary]

In [553]:
scaler.fit(x_train_cleaned[scaled_features])

In [554]:
x_train_scaled = scaler.transform(x_train_cleaned[scaled_features])
x_test_scaled = scaler.transform(x_test_cleaned[scaled_features])

In [555]:
x_train_cleaned[scaled_features] = x_train_scaled
x_test_cleaned[scaled_features] = x_test_scaled

### Encoding Binary and Categorical Features

In [556]:
#The features converted to binary and the categorical features will be encoded

encoded_features = x_train_cleaned.select_dtypes(include = ['object', 'category']).columns.tolist()
encoded_features += features_convert_binary
encoded_features

['channel_sales',
 'has_gas',
 'origin_up',
 'cons_gas_12m',
 'forecast_discount_energy',
 'nb_prod_act',
 'mean_price_peak_fix',
 'mean_price_mid_peak_fix',
 'last_price_mid_peak_var',
 'last_price_peak_fix',
 'last_price_mid_peak_fix',
 'difference_price_peak_var',
 'difference_price_peak_fix',
 'difference_price_mid_peak_fix',
 'std_price_mid_peak_var',
 'std_price_peak_fix',
 'std_price_mid_peak_fix']

In [557]:
encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
x_train_encoded = encoder.fit_transform(x_train_cleaned[encoded_features])
x_test_encoded = encoder.transform(x_test_cleaned[encoded_features])
x_train_cleaned = x_train_cleaned.drop(columns = encoded_features)
x_test_cleaned = x_test_cleaned.drop(columns = encoded_features)
x_train_cleaned = pd.concat([x_train_cleaned, pd.DataFrame(x_train_encoded, columns = encoder.get_feature_names_out())], axis = 1)
x_test_cleaned = pd.concat([x_test_cleaned, pd.DataFrame(x_test_encoded, columns = encoder.get_feature_names_out())], axis = 1)

### Saving DataFrames

In [558]:
x_train_cleaned.to_csv('../data/cleaned/train/x_train_cleaned.csv', index = False)
y_train.to_csv('../data/cleaned/train/y_train.csv', index = False)

In [559]:
x_test_cleaned.to_csv('../data/cleaned/test/x_test_cleaned.csv', index = False)
y_test.to_csv('../data/cleaned/test/y_test.csv', index = False)

# Pipeline

In [560]:
class LoggingConfigurator(BaseEstimator, TransformerMixin):
    def __init__(self, log_file_name = 'pipeline.log', level = logging.INFO):
        self.log_file_name = log_file_name
        self.level = level
        
    def fit(self, X, y=None):
        configure_logging(log_file_name=self.log_file_name, level=self.level)
        return self

    def transform(self, X):
        return X  # Do nothing, just pass the data through

In [561]:
class DropMissing(BaseEstimator, TransformerMixin):
    def __init__(self, axis = 0):
        self.axis = axis

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        logging.info("Dropping missing values...")
        n_rows_initial, n_cols_initial = X.shape[0], X.shape[1]
        X.dropna(axis = self.axis)
        n_rows_final, n_cols_final = X.shape[0], X.shape[1]

        if n_rows_initial != n_rows_final:
            print(f"Removed {n_rows_initial - n_rows_final} rows with missing values.")
        
        if n_cols_initial != n_cols_final:
            print(f"Removed {n_cols_initial - n_cols_final} columns with missing values.")

        if n_rows_initial == n_rows_final and n_cols_initial == n_cols_final:
            logging.info("No missing values found.")

        return X 

In [562]:
class DateParserTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, date_columns, standard_format = '%Y-%m-%d'):
        self.date_columns = date_columns
        self.standard_format = standard_format  

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        logging.info("Parsing and formatting dates...")

        def parse_and_format(date_string):
            formats = ["%Y-%m-%d", "%d-%m-%Y", "%m/%d/%Y", "%d %b %Y", "%B %d, %Y"]
            if not isinstance(date_string, str):
                return pd.Nat
            for fmt in formats:
                try:
                    return datetime.strptime(date_string, fmt).strftime(self.standard_format)
                except ValueError:
                    continue
            return pd.NaT

        for col in self.date_columns:
            X_copy[col] = X_copy[col].apply(parse_and_format)
        return X_copy       

    

In [563]:
class CreatingDateFeatures(BaseEstimator, TransformerMixin):
    
    def __init__(self, 
                date_columns, 
                reference_date = '2020-01-01', 
                start_date_feature = 'date_activ', 
                final_date_feature = 'date_end', 
                renewal_date_feature = 'date_renewal',
                modification_date_feature = 'date_modif_prod',
                drop_original_date_features = True):

        self.date_columns = date_columns
        self.reference_date = pd.to_datetime(reference_date)
        self.start_date_feature = start_date_feature
        self.final_date_feature = final_date_feature
        self.modification_date_feature = modification_date_feature
        self.renewal_date_feature = renewal_date_feature
        self.drop_original_date_features = drop_original_date_features

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        X_copy = X.copy()

        logging.info("Creating date features...")
        # Ensure datetime conversion
        for col in [self.start_date_feature, 
                    self.final_date_feature, 
                    self.modification_date_feature,
                    self.renewal_date_feature]:
            X_copy[col] = pd.to_datetime(X_copy[col], errors='coerce')


        X_copy['contract_length'] = (X_copy[self.final_date_feature] - X_copy[self.start_date_feature]).dt.days
        X_copy['days_until_end'] = (self.reference_date - X_copy[self.final_date_feature]).dt.days
        X_copy['days_until_renewal'] = (self.reference_date - X_copy[self.renewal_date_feature]).dt.days
        X_copy['days_since_modification'] = (self.reference_date - X_copy[self.modification_date_feature]).dt.days

        if self.drop_original_date_features:
            X_copy = X_copy.drop(columns = self.date_columns)

        if 'contract_length' in X_copy.columns:
            logging.info("contract_length feature created.")

        if 'days_until_end' in X_copy.columns:
            logging.info("days_until_end feature created.")

        if 'days_until_renewal' in X_copy.columns:
            logging.info("days_until_renewal feature created.")

        if 'days_since_modification' in X_copy.columns:
            logging.info("days_since_modification feature created.")
        

        return X_copy
        


In [564]:
numeric_columns_price = ['price_off_peak_var',
                        'price_peak_var',
                        'price_mid_peak_var',
                        'price_off_peak_fix',
                        'price_peak_fix',
                        'price_mid_peak_fix']

In [565]:
class TransformPricesTemporalSeries(BaseEstimator, TransformerMixin):
    def __init__(self, 
                price_columns,
                id_column = 'id',
                date_column = 'price_date'):
        self.price_columns = price_columns
        self.id_column = id_column
        self.date_column = date_column

    def fit(self, X, y = None):
        return self

    def calculating_mean_prices(self, X):
        X_mean = X.groupby(self.id_column)[self.price_columns].mean().reset_index()
        X_mean_columns = [self.id_column] + [f'mean_{col}' for col in X_mean.columns[1:]]
        X_mean.columns = X_mean_columns

        energy_features = [feature for feature in X_mean_columns if 'var' in feature]
        potency_features = [feature for feature in X_mean_columns if 'fix' in feature]

        X_mean['mean_energy_price'] = X_mean[energy_features].mean(axis = 1)
        X_mean['mean_potency_price'] = X_mean[potency_features].mean(axis = 1)

        X_mean['energy_peak_minus_offpeak']  = X_mean['mean_price_peak_var'] - X_mean['mean_price_off_peak_var']
        X_mean['potency_peak_minus_offpeak'] = X_mean['mean_price_peak_fix'] - X_mean['mean_price_off_peak_fix']

        return X_mean

    def last_prices(self, X):

        last_price = X.loc[X.groupby(self.id_column)[self.date_column].idxmax()]
        last_price = last_price.reset_index(drop=True)
        last_price = last_price.drop(columns = [self.date_column])
        last_price.columns = ['id'] + [f'last_{col}' for col in last_price.columns[1:]] 
        
        return last_price

    def difference_last_first_prices(self, X):

        last = X.loc[X.groupby(self.id_column)[self.date_column].idxmax()].set_index(self.id_column, drop = True)
        last = last.drop(columns = [self.date_column])
        first = X.loc[X.groupby(self.id_column)[self.date_column].idxmin()].set_index(self.id_column, drop = True)
        first = first.drop(columns = [self.date_column])

        difference = last - first
        difference = difference.reset_index(drop = False)
        difference.columns = ['id'] + [f'difference_{col}' for col in difference.columns[1:]]

        return difference


    def calculating_slopes_linear_regression(self, X):

        agg_dict = {
        f'{col}_slope' : (col, lambda x: linregress(np.arange(len(x)), x).slope)
        for col in self.price_columns
        }

        slopes_regression = X.groupby(self.id_column).agg(**agg_dict)
        slopes_regression = slopes_regression.reset_index()
        slopes_regression.head()

        return slopes_regression

    def calculating_variance(self, X):

        std_prices = X.groupby(self.id_column)[self.price_columns].std()
        std_prices = std_prices.reset_index()
        std_columns = ['id'] + [f'std_{col}' for col in std_prices.columns[1:]]
        std_prices.columns = std_columns

        return std_prices


    def transform(self, X):
        X_copy = X.copy()

        message = "Calculating mean prices, last prices, "
        message += "difference between last and first price "
        message +=  "slopes and standard deviation of prices..."
        logging.info(message)

        X_mean = self.calculating_mean_prices(X_copy)
        X_last = self.last_prices(X_copy)
        X_difference = self.difference_last_first_prices(X_copy)
        X_slope = self.calculating_slopes_linear_regression(X_copy)
        X_std = self.calculating_variance(X_copy)

        df_list = [X_mean, X_last, X_difference, X_slope, X_std]
        
        prices_df = reduce(lambda left, right: pd.merge(left, right, on='id', how='inner'), df_list)

        return prices_df


 

In [566]:
pipeline_client_data = Pipeline(
    [('parse_dates', DateParserTransformer(date_columns = ['date_activ', 'date_end', 'date_modif_prod', 'date_renewal'])),
     ('remove_missing', DropMissing()),
     ('create_date_features', CreatingDateFeatures(date_columns = ['date_activ', 'date_end', 'date_modif_prod', 'date_renewal']))]
)

In [567]:
pipeline_price_series = Pipeline(
    [('remove_missing', DropMissing()),
     ('transform_prices', TransformPricesTemporalSeries(price_columns = ['price_off_peak_var',
                                                                        'price_peak_var',
                                                                        'price_mid_peak_var',
                                                                        'price_off_peak_fix',
                                                                        'price_peak_fix',
                                                                     'price_mid_peak_fix']))])

In [568]:
def merge_data(X):

    df1, df2 = X
    df1_preprocessed = pipeline_client_data.fit_transform(df1)
    df2_preprocessed = pipeline_price_series.fit_transform(df2)

    #Convert to dataframe if necessary
    if not isinstance(df1_preprocessed, pd.DataFrame):
        df1_preprocessed = pd.DataFrame(df1_preprocessed, index = df1.index)

    if not isinstance(df2_preprocessed, pd.DataFrame):
        df2_preprocessed = pd.DataFrame(df2_preprocessed, index = df2.index)
    
    logging.info("Merging dataframes...")
    merged = pd.merge(
        df1_preprocessed,
        df2_preprocessed,
        on='id',
        how='inner'
    )

    return merged


In [577]:
class ReplaceOutliers(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.feature_outliers_dict_ = {}

        
    def fit(self, X, y = None):
        for feature in X.columns:
            if pd.api.types.is_numeric_dtype(X[feature]):
                min_thresh, max_thresh = calculate_outlier_threshold(x_train, feature)
                self.feature_outliers_dict_[feature] = {'max_thresh': max_thresh, 'min_thresh': min_thresh}
        return self

    def transform(self, X):
        X_copy = X.copy()
        logging.info("Replacing outliers...")
        for feature in self.feature_outliers_dict_:
            num_outliers = (X_copy[feature] > self.feature_outliers_dict_[feature]['max_thresh']).sum()
            num_outliers += (X_copy[feature] < self.feature_outliers_dict_[feature]['min_thresh']).sum()
            logging.info(f'Number of outliers in {feature}: {num_outliers}') 

            X_copy[feature] = X_copy[feature].mask(X_copy[feature] > self.feature_outliers_dict_[feature]['max_thresh'], 
                                        self.feature_outliers_dict_[feature]['max_thresh'])
            X_copy[feature] = X_copy[feature].mask(X_copy[feature] < self.feature_outliers_dict_[feature]['min_thresh'], 
                                        self.feature_outliers_dict_[feature]['min_thresh'])
        
        return X_copy


In [None]:
class ReduceSkew(BaseEstimator, TransformerMixin):
    def __init__(self, skew_threshold = 0.5):
        self.best_transformation_ = {}
        self.skew_threshold = skew_threshold
        self.min_max_values = {}
        self.yeo_transformer = PowerTransformer(method = 'yeo-johnson')
        
    def fit(self, X, y = None):
        X_copy = X.copy()
        self.yeo_features = []
        for feature in X_copy.columns:
            if pd.api.types.is_numeric_dtype(X_copy[feature]):
                skew = X_copy[feature].skew()
                if abs(skew) > self.skew_threshold:
                    _, _, self.best_transformation_[feature] = correct_skew(X_copy, feature)
                    if self.best_transformation_[feature] != 'Converted to binary' and self.best_transformation_[feature] != 'yeo':
                        if X_copy[feature].min() < 0:
                            self.best_transformation_[feature] = 'yeo'
                    if self.best_transformation_[feature] == 'Converted to binary':
                            self.min_max_values[feature]['max_value'] = X_copy.max()
                            self.min_max_values[feature]['min_value'] = X_copy.min()
                    if self.best_transformation_[feature] != 'yeo':
                        self.yeo_features.append(feature)
        
        self.yeo_transformer.fit(X_copy[self.yeo_features])

        return self

    def power_tranformer(self, X, feature, power):
        power = float(power)
        X[feature] = np.power(X[feature], power)
        return X

    def convert_binary(self, X, feature):
        skew = X[feature].skew()
        min_value = self.min_max_values[feature]['min_value']
        max_Value = self.min_max_values[feature]['max_value'] 
    
        if skew > 0:
            X[feature] = (X[feature] != min_value).astype(int)
        else:
            X[feature] = (X[feature] != max_value).astype(int)
        
        return X

    def yeo_transformer(self, X):

        X[self.yeo_features] = self.yeo_transformer.transform(X[self.yeo_features])
        return X
        



    def transform(self, X):
        for feature in self.best_transformation_:
            if self.self.best_transformation_[feature] == 'Converted to binary':
                X = self.convert_binary(X, feature)

            if self.best_transformation_[feature] != 'Converted to binary' and self.best_transformation_[feature] != 'yeo':
                X= self.power_tranformer(X, feature, self.best_transformation_[feature])

            X = self.yeo_transformer(X)
        
        return X_copy

In [578]:
pipeline = Pipeline([
    ("logging", LoggingConfigurator(log_file_name="my_pipeline.log")),
    ('merge', FunctionTransformer(merge_data, validate = False)),
    ('basic wrangling', FunctionTransformer(basic_wrangling)),
    ('replacing outliers',ReplaceOutliers())
])

In [579]:
y_train = raw_client_data_train['churn']
raw_client_data_train = raw_client_data.drop(columns = ['churn'])

y_test = raw_client_data_test['churn']
raw_client_data_test = raw_client_data_test.drop(columns = ['churn'])

KeyError: 'churn'

In [580]:
x_train = pipeline.fit_transform((raw_client_data_train, raw_price_data_train))


2025-06-12 12:52:12,334 - INFO - Parsing and formatting dates...
2025-06-12 12:52:12,773 - INFO - Dropping missing values...
2025-06-12 12:52:12,781 - INFO - No missing values found.
2025-06-12 12:52:12,783 - INFO - Creating date features...
2025-06-12 12:52:12,803 - INFO - contract_length feature created.
2025-06-12 12:52:12,804 - INFO - days_until_end feature created.
2025-06-12 12:52:12,805 - INFO - days_until_renewal feature created.
2025-06-12 12:52:12,806 - INFO - days_since_modification feature created.
2025-06-12 12:52:12,811 - INFO - Dropping missing values...
2025-06-12 12:52:12,829 - INFO - No missing values found.
2025-06-12 12:52:12,831 - INFO - Calculating mean prices, last prices, difference between last and first price slopes and standard deviation of prices...
2025-06-12 12:52:23,043 - INFO - Merging dataframes...
2025-06-12 12:52:23,062 - INFO - Starting data wrangling...
2025-06-12 12:52:23,066 - INFO - Feature id removed. The proportion of unique values in the featu

In [None]:
'''
class DataFrameMerger(BaseEstimator, TransformerMixin):
    
    def __init__(self, pipeline1, pipeline2, merge_on = 'id', how_to_merge = 'inner'):
        self.pipeline1 = pipeline1
        self.pipeline2 = pipeline2
        self.merge_on = merge_on
        self.how_to_merge = how_to_merge
        

    def fit(self, X, y=None):
        df1, df2 = X
        self.pipeline1.fit(df1)
        self.pipeline2.fit(df2)
        return self

    def transform(self, X):
        df1, df2 = X
        df1_processed = self.pipeline1.transform(df1)
        df2_processed = self.pipeline2.transform(df2)

        #Convert to dataframe if necessary
        if not isinstance(df1_processed, pd.DataFrame):
            df1_processed = pd.DataFrame(df1_processed, index = df1.index)

        if not isinstance(df2_processed, pd.DataFrame):
            df2_processed = pd.DataFrame(df2_processed, index = df2.index)

        if self.merge_on not in df1.columns or self.merge_on not in df2.columns:
            raise ValueError(f"Merge column {self.merge_on} not found in either df1 or df2.")
            
        # Reattach the 'on' column if pipelines removed it
        if self.merge_on not in df1_processed.columns:
            df1_processed[self.merge_on] = df1[self.merge_on]

        if self.merge_on not in df2_processed.columns:
            df2_processed[self.merge_on] = df2[self.merge_on]
        
        merged_df = pd.merge(df1_processed, 
                    df2_processed, 
                    on = self.merge_on, 
                    how = self.how_to_merge)

        return merged_df
'''

In [180]:
pipeline = Pipeline([('merge', DataFrameMerger(pipeline_client_data, pipeline_price_series))])

In [182]:
pipeline.fit((raw_client_data, raw_price_data))

In [183]:
teste = pipeline.transform((raw_client_data, raw_price_data))

