# STEP 1: 
**Generating Mock Data for Customer Churn Prediction**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Create mock customer behavior data
# Seed NumPy's legacy global RNG. The draws below are made in a fixed order;
# reordering them would change every generated column.
np.random.seed(42)

num_customers = 1000


def _draw_ints(low, high):
    """Return one uniform integer in [low, high) per customer."""
    return np.random.randint(low, high, size=num_customers)


def _draw_labels(options, p=None):
    """Return one label per customer, sampled from `options` with weights `p`."""
    return np.random.choice(options, size=num_customers, p=p)


# Demographics
customer_ids = np.arange(1, num_customers + 1)
age = _draw_ints(18, 70)
gender = _draw_labels(['Male', 'Female'])
income = _draw_ints(20000, 150000)

# Purchase history (total spend is derived from the two draws, not sampled)
avg_purchase_value = np.random.uniform(20, 500, size=num_customers)
purchase_frequency = _draw_ints(1, 20)
total_spent = avg_purchase_value * purchase_frequency

# Engagement
days_since_last_purchase = _draw_ints(0, 365)
customer_support_calls = _draw_ints(0, 10)
website_visits_last_month = _draw_ints(0, 30)

# Target: Churn (1 = churned, 0 = retained), ~30% churners.
# Note: labels are sampled independently of every feature above.
churn = _draw_labels([0, 1], p=[0.7, 0.3])

# Assemble the DataFrame; the pair order fixes the column order.
column_values = [
    ('CustomerID', customer_ids),
    ('Age', age),
    ('Gender', gender),
    ('Income', income),
    ('AvgPurchaseValue', avg_purchase_value),
    ('PurchaseFrequency', purchase_frequency),
    ('TotalSpent', total_spent),
    ('DaysSinceLastPurchase', days_since_last_purchase),
    ('CustomerSupportCalls', customer_support_calls),
    ('WebsiteVisitsLastMonth', website_visits_last_month),
    ('Churn', churn),
]
data = pd.DataFrame(dict(column_values))

# Print dataset preview
print("\nFirst 5 rows of the dataset:")
print(data.head())

# Print dataset info
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Summary:")
data.info()

# Print statistics of the dataset
print("\nDescriptive Statistics:")
print(data.describe())

# Step 2: Preprocessing

# Gender is the only categorical column: swap its labels for 0/1 codes.
gender_codes = {'Male': 1, 'Female': 0}
data['Gender'] = data['Gender'].map(gender_codes)

# Features are every column except the identifier and the label.
non_feature_columns = ['CustomerID', 'Churn']
X = data.drop(columns=non_feature_columns)
y = data['Churn']

# Hold out 30% for testing; stratifying on y keeps the churn ratio equal in both parts.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows landed in each part.
print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Step 3: Train the model
# fit() returns the estimator itself, so construction and training can be chained.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Step 4: Make predictions
y_pred = rf_classifier.predict(X_test)

# Step 5: Evaluate the model
# Every metric takes (true labels, predicted labels), so compute all three in one pass.
accuracy, class_report, conf_matrix = [
    metric(y_test, y_pred)
    for metric in (accuracy_score, classification_report, confusion_matrix)
]

# Print model performance metrics
print(f"\nModel Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", class_report)
print("\nConfusion Matrix:\n", conf_matrix)


First 5 rows of the dataset:
   CustomerID  Age  Gender  Income  AvgPurchaseValue  PurchaseFrequency  \
0           1   56    Male   65648        496.392081                 17   
1           2   69    Male  123537        464.960465                  4   
2           3   46    Male  145991        279.179423                 15   
3           4   32  Female   29516        424.175981                  9   
4           5   60    Male  132863        270.059829                 19   

    TotalSpent  DaysSinceLastPurchase  CustomerSupportCalls  \
0  8438.665378                    154                     2   
1  1859.841858                    180                     7   
2  4187.691338                    214                     7   
3  3817.583832                     87                     3   
4  5131.136753                    255                     5   

   WebsiteVisitsLastMonth  Churn  
0                       7      0  
1                      20      0  
2                      24      0  


# RESULT: The mock customer behavior dataset has been created with the following features:

# Demographics: Age, Gender, Income
# Purchase history: AvgPurchaseValue, PurchaseFrequency, TotalSpent
# Engagement: DaysSinceLastPurchase, CustomerSupportCalls, WebsiteVisitsLastMonth
# Target: Churn (1 = churned, 0 = retained)

# Step 2

# **Preprocessing Data**
# **Preprocess the data, encode categorical variables, and split it into training and testing sets.**

In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Step 2: Preprocess the data
#
# The scaler is fitted on the training split only. Fitting it on the whole
# dataset before splitting would leak test-set statistics (mean/std) into
# training. All work is done on copies so the raw `data` frame is left intact
# and this cell can be re-run safely.

numerical_features = ['Age', 'Income', 'AvgPurchaseValue', 'PurchaseFrequency',
                      'TotalSpent', 'DaysSinceLastPurchase',
                      'CustomerSupportCalls', 'WebsiteVisitsLastMonth']

# Split the data into features (X) and target (y); drop() returns a new frame
X = data.drop(columns=['CustomerID', 'Churn'])
y = data['Churn']

# Encode categorical variables
# LabelEncoder assigns codes in sorted class order, so string labels become
# Female=0, Male=1; values already encoded as 0/1 keep their codes.
le_gender = LabelEncoder()
X['Gender'] = le_gender.fit_transform(X['Gender'])

# Split into training and testing sets (before any scaling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Copy the splits so the column assignments below act on independent frames
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numerical features: learn mean/std from the training split, then apply
# the same transformation to the test split
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Check the shapes of the training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((700, 9), (300, 9), (700,), (300,))

# RESULT: The data has been preprocessed and split:

# Training set: 700 samples
# Testing set: 300 samples
# Features: 9 in total — 8 standardized numerical variables and 1 encoded categorical variable (Gender).

# STEP 3:
# Train a random forest classifier to predict customer churn and evaluate its performance.

In [2]:
# Step 3: Train the model

# Random forest with 100 trees; the fixed seed makes the fit repeatable
forest_settings = {'n_estimators': 100, 'random_state': 42}
rf_classifier = RandomForestClassifier(**forest_settings)

# Fit on the training split, then label the held-out test split
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

# Step 4: Evaluate the model

# Share of test customers labelled correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Per-class precision / recall / F1 as a formatted text table
class_report = classification_report(y_true=y_test, y_pred=y_pred)

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Last expression of the cell: shown as the cell output
accuracy, class_report, conf_matrix


(0.69,
 '              precision    recall  f1-score   support\n\n           0       0.71      0.96      0.81       211\n           1       0.36      0.06      0.10        89\n\n    accuracy                           0.69       300\n   macro avg       0.53      0.51      0.45       300\nweighted avg       0.60      0.69      0.60       300\n',
 array([[202,   9],
        [ 84,   5]]))

# The Random Forest classifier achieved the following results:

# Accuracy: 69%
# Classification Report:
# Class 0 (Retained):
# Precision: 71%
# Recall: 96%
# F1-score: 81%
# Class 1 (Churned):
# Precision: 36%
# Recall: 6%
# F1-score: 10%
# Confusion Matrix:
# [[202,   9],   # True negatives, False positives
#  [ 84,   5]]   # False negatives, True positives


# Observations:
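# The model identifies retained customers well but almost never detects churned customers (recall of 6% for class 1).
# Its 69% accuracy is no better than always predicting "retained", which would be right for 211 of the 300 test customers (about 70%).
# Class imbalance contributes, but the main cause is that the mock Churn labels were drawn at random, independently of every feature, so there is no signal for any model to learn.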

# Next Steps:
# Address Class Imbalance: Use techniques like oversampling (SMOTE) or class weighting.
# Feature Importance Analysis: Determine which features impact predictions most.
# Hyperparameter Tuning: Optimize the Random Forest model for better performance.
