In [4]:
#acquisition
#importing data from local repo
#Data Source: https://www.kaggle.com/datasets/blastchar/telco-customer-churn

import pandas as pd
#load the dataset
#read the dataset

df = pd.read_csv('/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv')
#generate first five rows
df.head()

#basic information
df.info()

#data description
df.describe()

#shape of datasets(no. of rows and columns)
df.shape

#checking for missing 
df.isnull().sum()

#checking for duplicates
df.duplicated().sum()


#data preview of the targeted columns
df['Churn'].value_counts()

df['MonthlyCharges'].describe()

df['TotalCharges'].describe()

# Perform acquisition tasks
df.to_csv('/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/data/processed/acquired_data.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
#Cleaning

import pandas as pd
df = pd.read_csv('/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/data/processed/acquired_data.csv')

# Perform cleaning tasks
df = df.dropna() 



# Drop the customerID column as it's not needed for prediction
df.drop('customerID', axis=1, inplace=True)

# Convert 'TotalCharges' to numeric, forcing errors to NaN (i.e., invalid data)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Drop rows where 'TotalCharges' is NaN (invalid values after conversion)
df.dropna(subset=['TotalCharges'], inplace=True)

# Replace any remaining spaces with NaN
df.replace(" ", pd.NA, inplace=True)

# Drop any duplicate rows
df.drop_duplicates(inplace=True)

# Reset the index after cleaning
df.reset_index(drop=True, inplace=True)

# Show the cleaned data
df.head()

#save the cleaned file
df.to_csv('/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/data/processed/cleaned_data.csv', index=False)
df.to_csv("/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/data/processed/final_dataset_with_clv.csv", index=False)



In [6]:
#feature Engineering
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Load the cleaned dataset (assuming it's already cleaned and saved in 'processed')
df = pd.read_csv('/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/data/processed/cleaned_data.csv')

print(df.columns)

#Interaction terms(mmonthly charges and Tenure to capture how the cost and duration of the customers contract affect churn)
df['monthly_charges_tenure'] = df['MonthlyCharges'] * df['tenure']

#new features 'polynomila features generated by taking the existing features and raising them to a power or multiplying together,  to see non linear relationships

df['monthly_charges_squared'] = df['MonthlyCharges'] ** 2
df['total_charges_squared'] = df['TotalCharges'] ** 2

#categorical encoding(one hot encodings) to convert into numerical formats
df = pd.get_dummies(df, columns=['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 
                                 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                                 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 
                                 'PaperlessBilling', 'PaymentMethod'], drop_first=True)


#handling missing values
# Fill missing numerical columns with the median value
df['TotalCharges'] = df['TotalCharges'].replace(' ', np.nan)  # Replace empty strings with NaN
df['TotalCharges'] = df['TotalCharges'].astype(float)

#filled by median
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())


# Fill missing categorical columns with the mode
df['SeniorCitizen'] = df['SeniorCitizen'].fillna(df['SeniorCitizen'].mode()[0])

print(df.columns.tolist())

#feature grouping to group related features for easy analysis
#grouping streaming services and streamingMovies into single feature
df['has_streaming_services'] = ((df['StreamingTV_Yes'] == 1) | (df['StreamingMovies_Yes'] == 1)).astype(int)


#grouping for online securities and device protections
df['has_online_security_or_protection'] = (
    (df['OnlineSecurity_Yes'] == 1) | (df['DeviceProtection_Yes'] == 1)
).astype(int)


#scaling features
#ensure that no single feature dominates due to scale differences
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['MonthlyCharges', 'TotalCharges', 'tenure']] = scaler.fit_transform(df[['MonthlyCharges', 'TotalCharges', 'tenure']])

#Feature Engineering for customer segmentation
#use target column to divide the dataset into churned and non-churned customers.
df['tenure_segment'] = pd.cut(df['tenure'], bins=[0, 12, 24, 36, 48, np.inf], labels=['0-1 year', '1-2 years', '2-3 years', '3-4 years', '4+ years'])

df['monthly_charge_segment'] = pd.cut(df['MonthlyCharges'], bins=[0, 30, 60, 90, np.inf], labels=['Low', 'Medium', 'High', 'Very High'])


#creating new target features for analysis
df['churn_contract'] = (df['Churn'] == 'Yes') & \
                       (df['Contract_One year'] == 0) & \
                       (df['Contract_Two year'] == 0)
df['churn_contract'] = df['churn_contract'].astype(int)



def create_clv_target(df, monetary_column='TotalCharges', time_column='tenure'):
    """
    Creates a simple CLV target variable using:
    CLV = Average Monthly Spend * Tenure
    """
    df['CLV'] = df[monetary_column] / df[time_column].replace(0, 1) * df[time_column]
    return df
df = create_clv_target(df)

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')
['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn', 'monthly_charges_tenure', 'monthly_charges_squared', 'total_charges_squared', 'gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes', 'OnlineBackup_No internet service', 'OnlineBackup_Yes', 'DeviceProtection_No internet service', 'DeviceProtection_Yes', 'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes', 'StreamingMovies_No 

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score


# Load your processed dataset (if you're reading from file)
# df = pd.read_csv('data/processed/your_file.csv')


df = pd.read_csv('/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/data/processed/cleaned_data.csv')

# 1. Feature-target split
X = df.drop('Churn', axis=1)
y = df['Churn']




# 2. Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# Save test sets
X_test.to_csv("/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/data/processed/X_test.csv", index=False)
y_test.to_csv("/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/data/processed/y_test.csv", index=False)

#check for object(non- numeric column)
print(X_train.dtypes[X_train.dtypes == 'object'])

#convert to numeric columns
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Align test set columns with train
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Save the training columns (feature names) so you can align test data later
pd.Series(X_train.columns).to_csv(
    '/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/data/processed/X_train_columns.csv',
    index=False,
    header=False
)




# 3. Scaling (Optional but good for models like Logistic Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Model Training

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)  # RF doesn't need scaling




# 5. Evaluation
print("=== Logistic Regression ===")
y_pred_lr = log_reg.predict(X_test_scaled)
print(classification_report(y_test, y_pred_lr))
print("ROC-AUC:", roc_auc_score(y_test, log_reg.predict_proba(X_test_scaled)[:, 1]))

print("\n=== Random Forest ===")
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

# 6. Save best model
joblib.dump(rf, '/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/models/random_forest_model.pkl')
joblib.dump(log_reg, '/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/models/logistic_model.pkl')
joblib.dump(scaler, '/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/models/scaler.pkl')



def train_clv_model(X_train, y_train, model_type='random_forest'):
    if model_type == 'random_forest':
        model = RandomForestRegressor(n_estimators=100, random_state=42)
    elif model_type == 'linear':
        model = LinearRegression()
    else:
        raise ValueError("Unsupported model type")
    
    model.fit(X_train, y_train)
    return model

#overfitting and underfitting

#Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Predictions
train_preds = lr_model.predict(X_train)
test_preds = lr_model.predict(X_test)

# Accuracy and AUC
train_acc = accuracy_score(y_train, train_preds)
test_acc = accuracy_score(y_test, test_preds)

train_auc = roc_auc_score(y_train, lr_model.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, lr_model.predict_proba(X_test)[:, 1])

# Output
print("Logistic Regression Performance:")
print(f"Train Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}")
print(f"Train ROC-AUC: {train_auc:.4f}, Test ROC-AUC: {test_auc:.4f}")

#  for Random Forest 



# 1. Define and train the model
rf_model = RandomForestClassifier(
    n_estimators=100,        # Number of trees
    max_depth=10,            # Limit tree depth
    min_samples_split=10,    # Minimum samples to split a node
    min_samples_leaf=4,      # Minimum samples at a leaf node
    max_features='sqrt',     # Random subset of features at each split
    class_weight='balanced', # Handles imbalance
    random_state=42
)


rf_model.fit(X_train, y_train)

# 2. Evaluate for overfitting or underfitting
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)
y_train_proba = rf_model.predict_proba(X_train)[:, 1]
y_test_proba = rf_model.predict_proba(X_test)[:, 1]

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
train_auc = roc_auc_score(y_train, y_train_proba)
test_auc = roc_auc_score(y_test, y_test_proba)

print(f"Train Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}")
print(f"Train ROC-AUC:  {train_auc:.4f}, Test ROC-AUC:  {test_auc:.4f}")

# Diagnose
if train_auc - test_auc > 0.05:
    print("\n⚠️ Likely Overfitting: High gap between train and test ROC-AUC.")
elif test_auc - train_auc > 0.05:
    print("\n⚠️ Possible Underfitting: Model generalizes better than it fits training data.")
else:
    print("\n✅ Good Fit: Train and test scores are close.")


#cross validation
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Cross-validated ROC-AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")



gender              object
Partner             object
Dependents          object
PhoneService        object
MultipleLines       object
InternetService     object
OnlineSecurity      object
OnlineBackup        object
DeviceProtection    object
TechSupport         object
StreamingTV         object
StreamingMovies     object
Contract            object
PaperlessBilling    object
PaymentMethod       object
dtype: object
=== Logistic Regression ===
              precision    recall  f1-score   support

          No       0.88      0.90      0.89      1081
         Yes       0.62      0.58      0.60       321

    accuracy                           0.82      1402
   macro avg       0.75      0.74      0.74      1402
weighted avg       0.82      0.82      0.82      1402

ROC-AUC: 0.8518102253307627

=== Random Forest ===
              precision    recall  f1-score   support

          No       0.85      0.88      0.87      1081
         Yes       0.55      0.48      0.51       321

    accurac

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Performance:
Train Accuracy: 0.8021, Test Accuracy: 0.8210
Train ROC-AUC: 0.8470, Test ROC-AUC: 0.8517
Train Accuracy: 0.8217, Test Accuracy: 0.7675
Train ROC-AUC:  0.9180, Test ROC-AUC:  0.8477

⚠️ Likely Overfitting: High gap between train and test ROC-AUC.
Cross-validated ROC-AUC: 0.8423 ± 0.0112


In [8]:
#evaluation

def ensure_dir(path):
    if not os.path.exists(path):
        os.makedirs(path)

def preprocess_for_rf(X_raw, train_columns):
    """
    Preprocess test data for Random Forest:
    - One-hot encode categorical columns
    - Align columns to training columns (add missing cols with zeros)
    """
    X = pd.get_dummies(X_raw)
    # Reindex to train columns, fill missing columns with 0
    X = X.reindex(columns=train_columns, fill_value=0)
    return X




def preprocess_for_logreg(X_raw, train_columns, scaler):
    """
    Preprocess test data for Logistic Regression:
    - One-hot encode categorical columns
    - Align columns to training columns
    - Scale using pre-fitted scaler
    """
    X = pd.get_dummies(X_raw)
    X = X.reindex(columns=train_columns, fill_value=0)
    X_scaled = scaler.transform(X)
    return X_scaled




def evaluate_model(model, X_test, y_test, model_name="model"):
    """
    Evaluates the model, prints metrics, plots confusion matrix and ROC curve,
    saves classification report CSV and plots to outputs folder.
    """
    # Ensure output dirs
    report_dir = "/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/reports"
    figures_dir = "/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/figures"
    ensure_dir(report_dir)
    ensure_dir(figures_dir)

    # Predictions
    y_pred = model.predict(X_test)
    y_pred = pd.Series(y_pred).map({'No': 0, 'Yes': 1}).values

    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_prob = None

    # Print classification report
    print(f"\n=== Evaluation Report: {model_name} ===")
    print(classification_report(y_test, y_pred))
    if y_prob is not None:
        roc_auc = roc_auc_score(y_test, y_prob)
        print(f"ROC-AUC Score: {roc_auc:.4f}")
    else:
        roc_auc = None
        print("ROC-AUC Score: Not available (no predict_proba method)")

    # Save classification report as CSV
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    df_report = pd.DataFrame(report_dict).transpose()
    report_path = os.path.join(report_dir, f"{model_name.lower().replace(' ', '_')}_classification_report.csv")
    df_report.to_csv(report_path)
    print(f"Saved classification report to {report_path}")

    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{model_name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    cm_path = os.path.join(figures_dir, f"{model_name.lower().replace(' ', '_')}_confusion_matrix.png")
    plt.savefig(cm_path)
    plt.close()
    print(f"Saved confusion matrix plot to {cm_path}")

    # Plot ROC curve 
    if y_prob is not None:
        from sklearn.metrics import roc_curve, auc
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)
        plt.figure(figsize=(6,5))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'{model_name} Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        roc_path = os.path.join(figures_dir, f"{model_name.lower().replace(' ', '_')}_roc_curve.png")
        plt.savefig(roc_path)
        plt.close()
        print(f"Saved ROC curve plot to {roc_path}")

#clv_model


def evaluate_clv_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²: {r2:.2f}")

    # Save figure
    figures_dir = "/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/figures"
    reports_dir = "/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/reports"
    os.makedirs(figures_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)

    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=y_test, y=y_pred, alpha=0.5)
    plt.xlabel("Actual CLV")
    plt.ylabel("Predicted CLV")
    plt.title("Actual vs Predicted CLV")
    plt.savefig(f"{figures_dir}/clv_actual_vs_predicted.png")
    plt.close()


    # Save evaluation report as CSV
    metrics_df = pd.DataFrame({
        'Metric': ['MAE', 'RMSE', 'R2'],
        'Value': [mae, rmse, r2]
    })
    csv_path = f"{reports_dir}/clv_regression_report.csv"
    metrics_df.to_csv(csv_path, index=False)
    print(f"Saved CLV regression report to {csv_path}")

In [9]:
#script evaluation
import os
import pandas as pd
import joblib
import matplotlib as plt
import matplotlib.pyplot as plt
import seaborn as sns

# from evaluation import evaluate_model, preprocess_for_rf, preprocess_for_logreg

if __name__ == "__main__":
    # Paths (update if needed)
    X_test_path = "/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/data/processed/X_test.csv"
    y_test_path = "/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/data/processed/y_test.csv"
    rf_model_path = "/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/models/random_forest_model.pkl"
    logreg_model_path = "/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/models/logistic_model.pkl"
    scaler_path = "/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/models/scaler.pkl"
    train_columns_path = "/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/data/processed/X_train_columns.csv"

    # Load test data
    X_test_raw = pd.read_csv(X_test_path)
    y_test = pd.read_csv(y_test_path).squeeze()
    y_test = y_test.map({'No': 0, 'Yes': 1})

# Convert y_test from 'No'/'Yes' to 0/1
   

    # Load models and scaler
    rf_model = joblib.load(rf_model_path)
    logreg_model = joblib.load(logreg_model_path)
    scaler = joblib.load(scaler_path)

    # Load train columns list for dummy alignment
    train_columns = pd.read_csv(train_columns_path, header=None).squeeze().tolist()

    # Preprocess test data for Random Forest and evaluate
    X_test_rf = preprocess_for_rf(X_test_raw, train_columns)
    evaluate_model(rf_model, X_test_rf, y_test, model_name="Random Forest")

    # Preprocess test data for Logistic Regression and evaluate
    X_test_logreg = preprocess_for_logreg(X_test_raw, train_columns, scaler)
    evaluate_model(logreg_model, X_test_logreg, y_test, model_name="Logistic Regression")



=== Evaluation Report: Random Forest ===
              precision    recall  f1-score   support

           0       0.85      0.88      0.87      1081
           1       0.55      0.48      0.51       321

    accuracy                           0.79      1402
   macro avg       0.70      0.68      0.69      1402
weighted avg       0.78      0.79      0.79      1402

ROC-AUC Score: 0.8190
Saved classification report to /Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/reports/random_forest_classification_report.csv
Saved confusion matrix plot to /Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/figures/random_forest_confusion_matrix.png
Saved ROC curve plot to /Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/figures/random_forest_roc_curve.png

=== Evaluation Report: Logistic Regression ===
              precision    recall  f1-score   support

           0       0.88      0.90     

In [10]:
#evaluation_clv
# Load processed data
df = pd.read_csv("../data/processed/final_dataset_with_clv.csv")

def clean_data(df):
    df = df.copy()

    # Handle missing or whitespace TotalCharges
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df.dropna(inplace=True)

    return df

def create_features(df):
    df = df.copy()

    # Encode categorical variables if not done yet
    if 'Contract' in df.columns:
        df['Contract_encoded'] = df['Contract'].astype('category').cat.codes
    if 'PaymentMethod' in df.columns:
        df['PaymentMethod_encoded'] = df['PaymentMethod'].astype('category').cat.codes

    return df

# Optional: clean again (if needed)
df = clean_data(df)
df = create_features(df)
df = create_clv_target(df)

# Select features and target



features = ['MonthlyCharges', 'tenure', 'TotalCharges', 'Contract_encoded', 'PaymentMethod_encoded']  # Add more as needed
X = df[features]
y = df['CLV']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = train_clv_model(X_train, y_train, model_type='random_forest')

# Evaluate
evaluate_clv_model(model, X_test, y_test)

# Predict values for residual plot
y_pred = model.predict(X_test)




 #saving regression model as csv   


def evaluate_clv_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²: {r2:.2f}")

    # Save figure
    figures_dir = "/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/figures"
    reports_dir = "/Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/reports"
    os.makedirs(figures_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)

    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=y_test, y=y_pred, alpha=0.5)
    plt.xlabel("Actual CLV")
    plt.ylabel("Predicted CLV")
    plt.title("Actual vs Predicted CLV")
    plt.savefig(f"{figures_dir}/clv_actual_vs_predicted.png")
    plt.close()

    # Save evaluation report as CSV
    metrics_df = pd.DataFrame({
        'Metric': ['MAE', 'RMSE', 'R2'],
        'Value': [mae, rmse, r2]
    })
    csv_path = f"{reports_dir}/clv_regression_report.csv"
    metrics_df.to_csv(csv_path, index=False)
    print(f"Saved CLV regression report to {csv_path}")



MAE: 0.97
RMSE: 1.80
R²: 1.00
Saved CLV regression report to /Users/samirsitaula/Documents/Selfpaced_Practice/projects/customer_clv_churn/outputs/reports/clv_regression_report.csv




In [11]:

#Feature Importance Plot
def plot_feature_importance(model, feature_names, output_path, model_name="model"):
    """
    Plots feature importance for tree-based models.
    """
    importances = model.feature_importances_
    feature_imp_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_imp_df)
    plt.title(f'{model_name} Feature Importance')
    plt.tight_layout()

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.savefig(output_path)
    plt.close()
    print(f"Saved feature importance plot to: {output_path}")


In [12]:
#merge churn and prediction
def merge_predictions(df, churn_preds, clv_preds):
    df = df.copy()
    df['Churn_Probability'] = churn_preds  # Probabilities (or binary)
    df['Predicted_CLV'] = clv_preds
    return df


In [13]:
#Dashboard Summary Generator

def generate_dashboard_summary(df, output_dir="outputs/figures"):
    os.makedirs(output_dir, exist_ok=True)

    # Churn Distribution
    plt.figure(figsize=(6, 4))
    sns.histplot(df['Churn_Probability'], bins=20, kde=True)
    plt.title("Churn Probability Distribution")
    plt.xlabel("Churn Probability")
    plt.savefig(f"{output_dir}/churn_distribution.png")
    plt.close()

    # CLV Distribution
    plt.figure(figsize=(6, 4))
    sns.histplot(df['Predicted_CLV'], bins=30, kde=True)
    plt.title("Predicted CLV Distribution")
    plt.xlabel("CLV")
    plt.savefig(f"{output_dir}/clv_distribution.png")
    plt.close()

    # CLV vs Churn Probability
    plt.figure(figsize=(6, 4))
    sns.scatterplot(x='Churn_Probability', y='Predicted_CLV', data=df, alpha=0.5)
    plt.title("CLV vs Churn Probability")
    plt.savefig(f"{output_dir}/clv_vs_churn.png")
    plt.close()

    print(f"Saved all summary figures to: {output_dir}")


In [14]:
df_final = merge_predictions(df, churn_preds, clv_preds)
df_final.to_csv("outputs/reports/final_dashboard_data.csv", index=False)

NameError: name 'churn_preds' is not defined