In [4]:
import pandas as pd

# Load the dataset.
# The path is defined once and reused, so editing it cannot leave the variable
# and the read_csv call pointing at different files.
file_path = r"C:\Users\krish\Downloads\clean_data_after_eda.csv"  # Update this if needed
df = pd.read_csv(file_path)

# Display basic info.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()  # Check column types and missing values
print(df.head())  # Preview first few rows


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14606 entries, 0 to 14605
Data columns (total 44 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              14606 non-null  object 
 1   channel_sales                   14606 non-null  object 
 2   cons_12m                        14606 non-null  int64  
 3   cons_gas_12m                    14606 non-null  int64  
 4   cons_last_month                 14606 non-null  int64  
 5   date_activ                      14606 non-null  object 
 6   date_end                        14606 non-null  object 
 7   date_modif_prod                 14606 non-null  object 
 8   date_renewal                    14606 non-null  object 
 9   forecast_cons_12m               14606 non-null  float64
 10  forecast_cons_year              14606 non-null  int64  
 11  forecast_discount_energy        14606 non-null  float64
 12  forecast_meter_rent_12m         

In [8]:
# Report the number of missing values in each column.
print(df.isnull().sum())

# Columns removed before feature engineering.
# NOTE(review): 'date_activ' is dropped here, so the later
# `if 'date_activ' in df.columns` tenure step is skipped — confirm that is intended.
columns_to_drop = ['date_activ', 'date_end', 'date_modif_prod', 'date_renewal', 'origin_up']

# `columns=` already selects the axis, so no `axis` argument is passed.
# errors="ignore" together with reassignment (instead of inplace=True) keeps
# this cell safe to re-run: columns that are already gone are skipped rather
# than raising KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")
print("Columns after dropping irrelevant ones:",df.columns)

id                                0
channel_sales                     0
cons_12m                          0
cons_gas_12m                      0
cons_last_month                   0
date_activ                        0
date_end                          0
date_modif_prod                   0
date_renewal                      0
forecast_cons_12m                 0
forecast_cons_year                0
forecast_discount_energy          0
forecast_meter_rent_12m           0
forecast_price_energy_off_peak    0
forecast_price_energy_peak        0
forecast_price_pow_off_peak       0
has_gas                           0
imp_cons                          0
margin_gross_pow_ele              0
margin_net_pow_ele                0
nb_prod_act                       0
net_margin                        0
num_years_antig                   0
origin_up                         0
pow_max                           0
var_year_price_off_peak_var       0
var_year_price_peak_var           0
var_year_price_mid_peak_var 

In [11]:
# Derive customer tenure (days since activation) when the activation date is present.
# NOTE(review): an earlier cell drops 'date_activ'; in that case this branch is
# skipped and 'customer_tenure' is never created — confirm that is intended.
if 'date_activ' in df.columns:
    df['date_activ'] = pd.to_datetime(df['date_activ'])
    # "today" makes the resulting value depend on the day the cell is run.
    df['customer_tenure'] = (pd.to_datetime("today") - df['date_activ']).dt.days

# Consumption ratio (last month vs last 12 months).
# Zero yearly consumption is swapped for 1 in the denominator only, which
# prevents division by zero without overwriting the real 'cons_12m' values.
df['cons_ratio'] = df['cons_last_month'] / df['cons_12m'].replace(0, 1)

# Forecast error: signed difference between forecasted and actual consumption.
df['forecast_error'] = df['forecast_cons_12m'] - df['cons_12m']

# Write to a separate file instead of overwriting the CSV this notebook loads:
# overwriting the input would permanently remove the dropped columns from it
# and make a fresh run start from already-modified data.
df.to_csv(r"C:\Users\krish\Downloads\clean_data_processed.csv", index=False)
print("Cleaned dataset saved successfully!")



Cleaned dataset saved successfully!


In [15]:

# 1. Average monthly consumption: yearly consumption spread evenly over 12 months.
df['avg_monthly_cons'] = df['cons_12m'] / 12

# 2. Ratio of last month's consumption to yearly consumption.
# Replaces any 'cons_ratio' column computed earlier. The +1 avoids division by
# zero, but it also shifts the denominator for every row, not only zero rows.
df['cons_ratio'] = df['cons_last_month'] / (df['cons_12m'] + 1)

# 3. Profitability per product.
# NOTE(review): the +1 guards against a zero product count, but it also changes
# the ratio for every non-zero count (one product divides by 2) — confirm intent.
df['profit_per_product'] = df['net_margin'] / (df['nb_prod_act'] + 1)

# 4. Forecasting error: absolute gap between forecasted and actual consumption.
# Replaces any 'forecast_error' column computed earlier.
df['forecast_error'] = (df['forecast_cons_12m'] - df['cons_12m']).abs()

# 5. Binary encoding for 'has_gas'.
# Besides 'Yes'/'No', the map accepts 't'/'f' labels and already-encoded 1/0
# values, so re-running this cell does not turn the column into NaN.
# NOTE(review): 't'/'f' is an assumption about how the CSV stores this flag —
# verify against the data; any label not listed here still becomes NaN.
has_gas_codes = {'Yes': 1, 'No': 0, 't': 1, 'f': 0, 1: 1, 0: 0}
df['has_gas'] = df['has_gas'].map(has_gas_codes)

# 6. Price stability: row-wise standard deviation across the three
# `var_6m_price_*_var` columns (off-peak, peak, mid-peak).
df['price_stability'] = df[['var_6m_price_off_peak_var', 'var_6m_price_peak_var', 'var_6m_price_mid_peak_var']].std(axis=1)

# 7. Customer loyalty flag: 1 when 'num_years_antig' is greater than 5, else 0.
df['is_loyal'] = (df['num_years_antig'] > 5).astype(int)

# 8. Energy discount impact: forecast discount relative to the forecast peak price.
# NOTE(review): the +1 avoids division by zero but shifts the denominator for
# every row — confirm this scaling is what the feature should express.
df['discount_impact'] = df['forecast_discount_energy'] / (df['forecast_price_energy_peak'] + 1)


In [17]:
# Persist the feature-engineered frame as CSV, leaving the row index out of the file.
df.to_csv(
    path_or_buf=r"C:\Users\krish\Downloads\feature_engineered_data.csv",
    index=False,
)
print("Feature engineering completed. Dataset saved!")


Feature engineering completed. Dataset saved!
