In [None]:
import pandas as pd

# Load the raw client and price tables; both CSVs must sit in the working directory.
client_df, price_df = (
    pd.read_csv(csv_name) for csv_name in ('client_data.csv', 'price_data.csv')
)

# Preview the first rows of the client table.
client_df.head()


In [None]:
# Missing values per column, worst offenders first.
client_df.isna().sum().sort_values(ascending=False)


In [None]:
# Columns that hold dates and need converting to datetime.
date_cols = ['date_activ', 'date_end', 'date_modif_prod', 'date_renewal']

# Convert each column in turn; unparseable values raise here (no coercion).
for date_col in date_cols:
    client_df[date_col] = pd.to_datetime(client_df[date_col])

# Confirm the resulting dtypes.
client_df.dtypes[date_cols]


In [None]:
# A notebook cell only renders its final bare expression, so the counts for
# each categorical column are printed explicitly; otherwise only the last
# column's counts would be shown.
for cat_col in ['channel_sales', 'origin_up', 'activity_new']:
    print(client_df[cat_col].value_counts(), end="\n\n")


In [None]:
# Summary statistics for the client table (pandas' default column selection).
summary_stats = client_df.describe()
summary_stats


In [None]:
# Persist the client table (with parsed dates) without writing the row index.
client_df.to_csv(path_or_buf='cleaned_client_data.csv', index=False)



In [None]:
# Build the feature-engineering frame `df` from the client table. Deriving it
# from `client_df` (rather than from `df` itself) means the cell works on a
# fresh kernel and can be re-run without error.
# Drop 'id' since it's just a unique label and doesn't help prediction
df = client_df.drop(columns=['id'])

# Find columns with only one unique value (NaN is not counted by nunique)
one_value_cols = df.columns[df.nunique() == 1].tolist()
print("Columns with only one unique value:", one_value_cols)

# Drop those columns too: a constant column carries no signal
df = df.drop(columns=one_value_cols)

# Check new shape of the data
print("Shape after dropping useless columns:", df.shape)


In [None]:
# Date columns to parse and expand into calendar components.
date_cols = ['date_activ', 'date_end', 'date_renewal', 'date_modif_prod']

for col in date_cols:
    # Unparseable values become NaT instead of raising.
    parsed = pd.to_datetime(df[col], errors='coerce')
    df[col] = parsed

for col in date_cols:
    # Year / month / day of each date as separate numeric features.
    calendar = df[col].dt
    df[f"{col}_year"] = calendar.year
    df[f"{col}_month"] = calendar.month
    df[f"{col}_day"] = calendar.day

# Contract length: whole days from activation to contract end.
activation_to_end = df['date_end'] - df['date_activ']
df['contract_duration_days'] = activation_to_end.dt.days

# Whole days from the last product modification to the renewal date.
# NOTE(review): despite the name, this is not measured from a "today"
# reference date — confirm this is the intended definition.
modification_to_renewal = df['date_renewal'] - df['date_modif_prod']
df['days_until_renewal'] = modification_to_renewal.dt.days

# Preview the two duration features.
df[['contract_duration_days', 'days_until_renewal']].head()


In [None]:
# Mean monthly consumption over the last 12 months.
monthly_mean = df['cons_12m'] / 12
df['avg_monthly_consumption'] = monthly_mean

# How far last month's consumption sits from that monthly mean.
df['consumption_diff'] = df['cons_last_month'] - monthly_mean

# Net margin per active product; a product count of 0 is treated as 1 so the
# division never hits zero.
safe_product_count = df['nb_prod_act'].replace(0, 1)
df['margin_per_product'] = df['net_margin'] / safe_product_count

# Simplified total forecast cost: the four forecast columns added left to
# right, so a NaN in any component propagates to the total.
forecast_components = [
    'forecast_meter_rent_12m',
    'forecast_price_energy_off_peak',
    'forecast_price_energy_peak',
    'forecast_price_pow_off_peak',
]
running_total = df[forecast_components[0]]
for component in forecast_components[1:]:
    running_total = running_total + df[component]
df['forecast_total_cost'] = running_total

# Preview the new features.
df[['avg_monthly_consumption', 'consumption_diff', 'margin_per_product', 'forecast_total_cost']].head()


In [None]:
# The raw datetime columns are removed now that their year/month/day parts and
# the duration features exist as separate columns.
raw_date_columns = ['date_activ', 'date_end', 'date_modif_prod', 'date_renewal']
df = df.drop(columns=raw_date_columns)

# Report the final dimensions.
print("✅ Final shape:", df.shape)

# Write the feature-engineered table to disk without the row index.
output_csv = "final_feature_engineered_data.csv"
df.to_csv(output_csv, index=False)
print("✅ File saved as: final_feature_engineered_data.csv")


In [None]:
import pandas as pd

# Read the modelling dataset into `df`, replacing any earlier `df`.
# NOTE(review): this is not the file written by the feature-engineering step
# ("final_feature_engineered_data.csv") — confirm that
# "data_for_predictions.csv" is the intended modelling input.
df = pd.read_csv(filepath_or_buffer="data_for_predictions.csv")

# Preview the first five rows.
df.head(5)


In [None]:
# Separate features and target variable
y = df['churn']  # Target column

# Features: every column except the target. 'id' is also removed when present,
# because it is only a unique row label and carries no predictive signal;
# errors='ignore' makes this a no-op when the loaded file has no 'id' column.
X = df.drop(columns=['churn']).drop(columns=['id'], errors='ignore')

# NOTE(review): the schema of data_for_predictions.csv is not visible here —
# confirm every remaining feature column is numeric before fitting.

# Optional: check class balance
print("Churn value counts:\n", y.value_counts())


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on the target
# so both parts keep the same churn ratio, and seeded for reproducibility.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed so results are reproducible.
forest_params = {"n_estimators": 100, "random_state": 42}
rf = RandomForestClassifier(**forest_params)

# Train on the training split; the fitted estimator is the cell's output.
rf.fit(X_train, y_train)


In [None]:
# Predicted churn labels for the held-out test rows.
y_pred = rf.predict(X=X_test)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the test-set predictions with each metric, in the order reported below.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every score to four decimal places.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


In [None]:
### Why These Metrics?

We evaluated the model with Accuracy, Precision, Recall, and F1 Score:
- **Accuracy** is the fraction of all predictions that were correct; on its own it can be misleading when one class dominates.
- **Precision** is the fraction of predicted churners who actually churned.
- **Recall** is the fraction of actual churners that the model detected.
- **F1 Score** is the harmonic mean of Precision and Recall — useful when classes are imbalanced.

### Is the Model Good?

Whether the model's performance is good enough depends on the use case. For churn prediction:
- **High Recall** is important: we want to catch as many customers at risk of churning as possible.
- Our model achieves [insert scores] (TODO: fill in the values printed by the evaluation cell) — which shows it’s a decent starting point.
- We can improve it further with hyperparameter tuning, feature selection, etc.
