In [23]:
import sys
from pathlib import Path

# Make the project's `src` directory importable from this notebook.
# Path().resolve() is the current working directory, so this relies on the
# kernel having been started from the notebook's own folder.
notebook_path = Path().resolve()

# Assume 'src' is at the project root, one level up from the notebook folder
project_root = notebook_path.parent

# Construct the full path to src
src_path = project_root / "src"

# Insert at index 1 (not 0) so the existing first entry of sys.path keeps its
# position. Only report "Added" when an insert really happened, so re-running
# the cell does not print a misleading message.
if str(src_path) not in sys.path:
    sys.path.insert(1, str(src_path))
    print(f"Added {src_path} to sys.path")
else:
    print(f"{src_path} is already on sys.path")

Added /home/rodolfo/Documents/Rodolfo/Python_Projects/Projetos com repositorio/churn-powerco/powerco-churn/src to sys.path


In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import matplotlib.gridspec as gridspec
import numpy as np
from scipy.stats import linregress
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split
from sklearn import set_config
from functools import reduce
import warnings


from powerco_churn.EDA.basic_data_wrangling import basic_wrangling
from powerco_churn.EDA.univariate_statistics import univariate_statistics, plot_histograms_countplots
from powerco_churn.EDA.bivariate_statistics import bivariate_stats
from powerco_churn.EDA.visualizing_bivariate_statistics import generate_bar_plot
from powerco_churn.EDA.outliers import calculate_outlier_threshold
from powerco_churn.EDA.skewness import correct_skew
from powerco_churn.EDA.date_utils import parse_and_format_dates

# Configuration

In [25]:
# Plots will look similar to ggplot
plt.style.use('ggplot')

# Seed value; presumably passed to stochastic steps (e.g. train_test_split)
# later in the notebook — verify where it is consumed.
random_seed = 42

# NOTE(review): "default" keeps each scikit-learn transformer's default output
# container. The previous comment said the output should be a pandas DataFrame,
# which would require transform_output="pandas" — confirm which is intended.
set_config(transform_output = "default")


# Load Data

Two dataframes:

- client_data containing information about the clients (train and test)
- price_data containing the prices of power and energy during 2015

In [26]:
# Locations of the raw client tables (relative to the notebook folder)
train_client_csv = '../data/raw/train/train_client_data.csv'
test_client_csv = '../data/raw/test/test_client_data.csv'

# Read the train and test splits into separate DataFrames
train_client_data = pd.read_csv(train_client_csv)
test_client_data = pd.read_csv(test_client_csv)


In [27]:
# Price table (described in the notebook text as power and energy prices during 2015)
price_data = pd.read_csv('../data/raw/price_data.csv')

# Client Data

## Feature Engineering

### Date Features

- There is already a feature that indicates how long the client has been with the energy company;
- Creating a new feature that indicates the length of the active contract;
- How long will it take for the contract to end?
- How long since the last modification?

The date features will be used to create three new features (`contract_length`, `days_until_end` and `days_since_modification`) and will then be removed.

In [28]:
# Raw date columns of the client tables
date_features = [
    'date_activ',
    'date_end',
    'date_modif_prod',
    'date_renewal',
]

In [29]:
# Run every raw date column of both splits through parse_and_format_dates
for frame in (train_client_data, test_client_data):
    for column in date_features:
        frame[column] = frame[column].apply(parse_and_format_dates)

In [30]:
def add_contract_date_features(client_data, reference_date):
    """Add day-count features derived from the contract date columns, in place.

    Parameters
    ----------
    client_data : pd.DataFrame
        Frame holding the 'date_activ', 'date_end' and 'date_modif_prod'
        columns in a format pd.to_datetime can parse.
    reference_date : pd.Timestamp
        Fixed date the elapsed-day features are measured against.

    Returns
    -------
    pd.DataFrame
        The same `client_data` object, with three new integer-day columns:
        'contract_length', 'days_until_end' and 'days_since_modification'.
    """
    # Parse each date column once instead of once per derived feature
    date_activ = pd.to_datetime(client_data['date_activ'])
    date_end = pd.to_datetime(client_data['date_end'])
    date_modif_prod = pd.to_datetime(client_data['date_modif_prod'])

    client_data['contract_length'] = (date_end - date_activ).dt.days
    # Computed as reference_date - date_end: the value is the number of days
    # elapsed between the contract end and the reference date (positive when
    # the contract ends before the reference date), despite the column name.
    client_data['days_until_end'] = (reference_date - date_end).dt.days
    client_data['days_since_modification'] = (reference_date - date_modif_prod).dt.days
    return client_data


# a random date selected to be more recent than the last date in the dataset
reference_date = pd.to_datetime('2020-01-01')

# Apply the identical transformation to both splits
for split_frame in (train_client_data, test_client_data):
    add_contract_date_features(split_frame, reference_date)

In [31]:
# The raw date columns are removed from both splits, mutating each frame in place
for frame in (train_client_data, test_client_data):
    frame.drop(columns = date_features, inplace = True)

## Data Preprocessing

In [36]:
# Per-feature summary of the training client data; the displayed table lists
# type, count, missing, unique, mode, quartiles, mean, std, skew and kurtosis.
uni_stats_client_data = univariate_statistics(train_client_data)
uni_stats_client_data


Unnamed: 0_level_0,type,count,missing,unique,mode,min_value,q_1,median,q_3,max_value,mean,std,skew,kurtosis
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
id,object,11684,0,11684,0002203ffbb812588b632b9e628cc38d,-,-,-,-,-,-,-,-,-
channel_sales,object,11684,0,8,foosdfpfkusacimwkcsosbicdxkicaua,-,-,-,-,-,-,-,-,-
cons_12m,int64,11684,0,9219,0,0,5609.0,14139.0,41121.5,6207104,159429.231,571752.758,6.029,43.573
cons_gas_12m,int64,11684,0,1716,0,0,0.0,0.0,0.0,4154590,27843.623,163052.758,10.009,139.96
cons_last_month,int64,11684,0,4225,0,0,0.0,813.0,3435.0,771203,16149.339,64088.349,6.378,47.921
forecast_cons_12m,float64,11684,0,11222,0.0,0.0,488.34,1099.43,2404.1825,82902.83,1868.707,2419.608,7.775,169.078
forecast_cons_year,int64,11684,0,3739,0,0,0.0,321.0,1769.0,175375,1426.479,3414.982,17.364,665.216
forecast_discount_energy,float64,11684,0,12,0.0,0.0,0.0,0.0,0.0,30.0,0.957,5.075,5.179,25.116
forecast_meter_rent_12m,float64,11684,0,3145,0.0,0.0,16.18,18.725,131.02,599.31,63.06,65.964,1.438,3.802
forecast_price_energy_off_peak,float64,11684,0,479,0.145711,0.0,0.11634,0.143166,0.146348,0.273963,0.137,0.025,-0.171,8.281


### Missing Values

In [None]:
# There are no missing values: count NaNs per column (all displayed counts are 0)
train_client_data.isna().sum()

Unnamed: 0                        0
id                                0
channel_sales                     0
cons_12m                          0
cons_gas_12m                      0
cons_last_month                   0
date_activ                        0
date_end                          0
date_modif_prod                   0
date_renewal                      0
forecast_cons_12m                 0
forecast_cons_year                0
forecast_discount_energy          0
forecast_meter_rent_12m           0
forecast_price_energy_off_peak    0
forecast_price_energy_peak        0
forecast_price_pow_off_peak       0
has_gas                           0
imp_cons                          0
margin_gross_pow_ele              0
margin_net_pow_ele                0
nb_prod_act                       0
net_margin                        0
num_years_antig                   0
origin_up                         0
pow_max                           0
churn                             0
contract_length             

### Outliers