In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the dataset
df = pd.read_csv("C:/Users/Admin/Downloads/clean_data_after_eda.csv")

# Convert date columns to datetime
date_cols = ['date_activ', 'date_end']
df[date_cols] = df[date_cols].apply(pd.to_datetime, format='%Y-%m-%d')

# Create tenure feature
df['customer_tenure_days'] = (df['date_end'] - df['date_activ']).dt.days

# Drop unnecessary columns
drop_cols = ['id', 'date_modif_prod', 'date_renewal', 'date_activ', 'date_end']
df.drop(columns=drop_cols, inplace=True)

# Feature Engineering
# Consumption trends
df['consumption_ratio'] = df['cons_last_month'] / (df['cons_12m'] + 1)
df['forecast_deviation'] = df['forecast_cons_12m'] - df['cons_12m']

# Price sensitivity
df['price_peak_vs_offpeak'] = df['forecast_price_energy_peak'] / (df['forecast_price_energy_off_peak'] + 1)
df['avg_yearly_price_var'] = df[['var_year_price_off_peak', 'var_year_price_peak', 'var_year_price_mid_peak']].mean(axis=1)
df['avg_6m_price_var'] = df[['var_6m_price_off_peak', 'var_6m_price_peak', 'var_6m_price_mid_peak']].mean(axis=1)

# Profitability metrics
df['profitability_ratio'] = df['margin_net_pow_ele'] / (df['margin_gross_pow_ele'] + 1)
df['power_demand_per_product'] = df['pow_max'] * df['nb_prod_act']

# Handling missing values
df.fillna(df.median(numeric_only=True), inplace=True)
df.loc[:, 'channel_sales'] = df['channel_sales'].fillna('Unknown')
df.loc[:, 'origin_up'] = df['origin_up'].fillna('Unknown')

# Define features and target
X = df.drop(columns=['churn'])
y = df['churn']

# Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print("Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Save the processed dataset
df.to_csv("C:/Users/Admin/Downloads/processed_data.csv", index=False)

print("Feature engineering and model training completed. Processed data saved.")


Model Performance:
Accuracy: 0.8997
Precision: 0.8750
Recall: 0.0459
F1 Score: 0.0872

Classification Report:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95      2617
           1       0.88      0.05      0.09       305

    accuracy                           0.90      2922
   macro avg       0.89      0.52      0.52      2922
weighted avg       0.90      0.90      0.86      2922

Feature engineering and model training completed. Processed data saved.
