In [None]:
import math , copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.model_selection import KFold

In [None]:
# Load the raw Telco customer-churn table from a CSV file in the working directory.
df = pd.read_csv("Telco-customer-churn.csv")

In [None]:
# Add the engineer_features function here
def engineer_features(df):
    """
    Derive churn-modelling features from the raw Telco customer table.

    The input frame is left untouched. A copy is returned in which
    ``TotalCharges`` is numeric (unparseable entries become 0) and the
    following columns are appended, in this order: ``tenure_month_to_month``,
    ``tenure_one_year``, ``tenure_two_year``, ``charge_per_tenure``,
    ``avg_monthly_spend``, ``recent_vs_lifetime_spend``, ``total_services``,
    ``loyalty_score``.

    Parameters:
    df (pandas.DataFrame): Customer data containing tenure, Contract,
        MonthlyCharges, TotalCharges and the nine service columns read below.

    Returns:
    pandas.DataFrame: Copy of ``df`` with the engineered columns added.
    """
    out = df.copy()

    # Entries that cannot be parsed as numbers are coerced to NaN and then zero-filled.
    out['TotalCharges'] = pd.to_numeric(out['TotalCharges'], errors='coerce').fillna(0)

    tenure = out['tenure']

    # Tenure gated by contract type: the tenure value where the contract matches, else 0.
    contract_columns = {
        'tenure_month_to_month': 'Month-to-month',
        'tenure_one_year': 'One year',
        'tenure_two_year': 'Two year',
    }
    for new_col, contract_name in contract_columns.items():
        out[new_col] = tenure * (out['Contract'] == contract_name).astype(int)

    # Ratio features; denominators are shifted by one so that a zero never divides.
    out['charge_per_tenure'] = out['MonthlyCharges'] / (tenure + 1)
    out['avg_monthly_spend'] = out['TotalCharges'] / (tenure + 1)
    out['recent_vs_lifetime_spend'] = out['MonthlyCharges'] / (out['TotalCharges'] + 1)

    # Count of subscribed services: any internet service other than 'No' counts once,
    # every other service counts when its value is exactly 'Yes'.
    service_flags = [
        out['PhoneService'] == 'Yes',
        out['MultipleLines'] == 'Yes',
        out['InternetService'] != 'No',
    ]
    for add_on in ('OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                   'TechSupport', 'StreamingTV', 'StreamingMovies'):
        service_flags.append(out[add_on] == 'Yes')
    out['total_services'] = sum(flag.astype(int) for flag in service_flags)

    # Loyalty score: tenure weighted by a per-contract factor (1, 12 or 24).
    contract_weight = out['Contract'].map({'Month-to-month': 1, 'One year': 12, 'Two year': 24})
    out['loyalty_score'] = tenure * contract_weight

    return out

In [None]:
# Read only the header row of the raw file, once, so the engineered columns can be
# listed without re-parsing the whole CSV for every column that is checked.
raw_columns = set(pd.read_csv("Telco-customer-churn.csv", nrows=0).columns)

df = engineer_features(df)
print("Data shape after feature engineering:", df.shape)
print("New features added:", [col for col in df.columns if col not in raw_columns])


In [None]:
# Summarise the dtype and the null count / percentage of every column.
print("\nData types and missing values:")
null_counts = df.isnull().sum()
missing_data = pd.DataFrame({
    'Data Type': df.dtypes,
    'Missing Values': null_counts,
    'Missing Percentage': (null_counts / len(df)) * 100
})
print(missing_data)

# Print value frequencies for text columns and for columns with fewer than 10 distinct values.
for col in df.columns:
    column = df[col]
    if column.dtype != 'object' and len(column.unique()) >= 10:
        continue  # high-cardinality non-text column: a frequency table would not be useful
    print(f"\nUnique values in {col}:")
    print(column.value_counts())

In [None]:
# Six-panel overview of churn against the main customer attributes.
# Explicit axes are used so each panel is addressed directly rather than through
# pyplot's "current axes" state.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Customer Churn Analysis', fontsize=20)

# 1. Distribution of churn
# Labels and colours are taken from the value_counts index: value_counts orders by
# frequency, so hard-coded ['No', 'Yes'] labels would be wrong whenever 'Yes' is the
# more frequent class.
ax = axes[0, 0]
churn_counts = df['Churn'].value_counts()
churn_palette = {'No': 'lightblue', 'Yes': 'coral'}
ax.pie(
    churn_counts,
    labels=list(churn_counts.index),
    autopct='%1.1f%%',
    startangle=90,
    colors=[churn_palette.get(label, 'lightgray') for label in churn_counts.index],
)
ax.set_title('Distribution of Churn')

# 2. Tenure vs Churn
ax = axes[0, 1]
sns.boxplot(x='Churn', y='tenure', data=df, ax=ax)
ax.set_title('Tenure vs Churn')

# 3. Monthly Charges vs Churn
ax = axes[0, 2]
sns.boxplot(x='Churn', y='MonthlyCharges', data=df, ax=ax)
ax.set_title('Monthly Charges vs Churn')

# 4. Contract Type vs Churn (row-normalised crosstab -> percentage of churners per contract)
ax = axes[1, 0]
contract_churn = pd.crosstab(df['Contract'], df['Churn'])
contract_churn_pct = contract_churn.div(contract_churn.sum(axis=1), axis=0) * 100
contract_churn_pct['Yes'].plot(kind='bar', color='coral', ax=ax)
ax.set_title('Churn Rate by Contract Type')
ax.set_ylabel('Churn Rate (%)')

# 5. Payment Method vs Churn
ax = axes[1, 1]
payment_churn = pd.crosstab(df['PaymentMethod'], df['Churn'])
payment_churn_pct = payment_churn.div(payment_churn.sum(axis=1), axis=0) * 100
payment_churn_pct['Yes'].plot(kind='bar', color='coral', ax=ax)
ax.set_title('Churn Rate by Payment Method')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_ylabel('Churn Rate (%)')

# 6. Correlation matrix of numerical variables
# The numeric coercion is applied to a local copy so that this plotting cell does not
# modify `df` as a side effect.
ax = axes[1, 2]
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
numeric_view = df[numerical_cols + ['SeniorCitizen']].copy()
numeric_view[numerical_cols] = numeric_view[numerical_cols].apply(pd.to_numeric, errors='coerce')
correlation = numeric_view.corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')

# Leave the top 5% of the figure free for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

In [None]:
# List every column currently present in the working frame.
print(df.columns)

In [None]:
# Preprocessing: one-hot encode the categorical columns and assemble the modelling table.

# Text (object) columns are treated as categorical. The identifier and the target are
# excluded; filtering instead of list.remove() means a missing 'customerID' column does
# not raise.
categorical_columns = [
    col for col in df.select_dtypes(include=['object']).columns
    if col not in ('customerID', 'Churn')
]

# drop='first' omits the first category of every feature from the encoded output.
full_encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_cats = full_encoder.fit_transform(df[categorical_columns])
encoded_feature_names = full_encoder.get_feature_names_out(categorical_columns)

# The encoded frame gets a fresh 0..n-1 index; the other parts are reset to match below.
encoded_df = pd.DataFrame(encoded_cats, columns=encoded_feature_names)

# Select every numeric dtype rather than only int64/float64, so integer columns that
# come out as int32 on some platforms are not silently dropped from the feature set.
numerical_columns = [
    col for col in df.select_dtypes(include='number').columns
    if col != 'customerID'
]
print(f"\nNumerical columns: {numerical_columns}")

# Copy numerical data
numerical_df = df[numerical_columns].reset_index(drop=True)

# Convert target variable to numeric (0/1)
churn_numeric = df['Churn'].map({'Yes': 1, 'No': 0}).reset_index(drop=True)

# Column-wise concatenation: numeric features, encoded categoricals, then the target.
final_df = pd.concat([numerical_df, encoded_df, churn_numeric], axis=1)

print("\nFinal dataset after one-hot encoding:")
print(f"Shape: {final_df.shape}")
print("First 5 rows:")
print(final_df.head())

In [None]:
def feature_scaling(x_train , return_params=False, columns=None):
    """
    Standardize selected columns of a DataFrame (z-score normalization).

    Each selected column is replaced by ``(value - mean) / std`` using that
    column's own mean and standard deviation (pandas' ``Series.std``). A
    column whose standard deviation is exactly 0 is set to 0.0. All other
    columns are only cast to float. The input frame is not modified.

    Parameters:
    x_train (pandas.DataFrame): Feature table; must contain every column to scale.
    return_params (bool): When True, also return the statistics that were used.
    columns (iterable of str, optional): Columns to standardize. Defaults to the
        continuous and engineered columns of this notebook.

    Returns:
    pandas.DataFrame: Float copy of ``x_train`` with the selected columns scaled.
    When ``return_params`` is True, a tuple ``(scaled, params)`` where ``params``
    is ``{'mean': {col: value}, 'std': {col: value}}``.

    Raises:
    KeyError: If a column to scale is missing from ``x_train``.
    """
    if columns is None:
        columns = ['tenure', 'MonthlyCharges', 'TotalCharges' , 'tenure_month_to_month','charge_per_tenure',  'avg_monthly_spend',
                   'tenure_one_year', 'tenure_two_year', 'recent_vs_lifetime_spend' , 'total_services' , 'loyalty_score']

    # astype returns a new float frame, so the caller's data is left untouched.
    scaled_x_train = x_train.astype('float')
    params = {'mean': {}, 'std': {}}

    # Scale each column (feature) separately
    for col in columns:
        mean = x_train[col].mean()
        std = x_train[col].std()
        params['mean'][col] = mean
        params['std'][col] = std

        if std == 0:
            # A constant column carries no information; 0.0 keeps the column float
            # and avoids a division by zero.
            scaled_x_train[col] = 0.0
        else:
            scaled_x_train[col] = (x_train[col] - mean) / std

    if return_params:
        return scaled_x_train, params
    return scaled_x_train

In [None]:

# Verification of the preprocessing step: list the int64/float64 columns of the
# modelling table and report dtype and null statistics for every column.
numeric_columns = list(final_df.select_dtypes(include=['int64' , 'float64']).columns)
print(numeric_columns)
print("\nData types and missing values:")
final_null_counts = final_df.isnull().sum()
missing_data = pd.DataFrame({
    'Data Type': final_df.dtypes,
    'Missing Values': final_null_counts,
    'Missing Percentage': (final_null_counts / len(final_df)) * 100
})
print(missing_data)



In [None]:
def compute_model_prediction(x, w, b):
    """
    Logistic-regression prediction: sigmoid of the linear score x·w + b.

    The sigmoid is evaluated in a numerically stable form so that large
    negative scores do not overflow ``exp``.

    Args:
        x: input features (m, n) or single example (n,)
        w: weights (n,)
        b: bias (scalar)
    Returns:
        sigmoid(z) where z = x·w + b; an array of m probabilities, or a
        scalar for a single example
    """
    z = np.dot(x, w) + b
    # exp(-|z|) always lies in (0, 1], so it can never overflow. For z >= 0 the
    # sigmoid is 1 / (1 + exp(-z)); for z < 0 the equivalent form
    # exp(z) / (1 + exp(z)) is used.
    decay = np.exp(-np.abs(z))
    probabilities = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # np.where yields a 0-d array for a single example; unwrap it to a scalar.
    return probabilities if probabilities.ndim else probabilities[()]

In [None]:
def compute_cost(x,w,b,y):
    """
    Mean binary cross-entropy (log loss) of the logistic model on (x, y).

    Args:
        x: input features (m, n)
        w: weights (n,)
        b: bias (scalar)
        y: true labels (m,), expected to be 0 or 1
    Returns:
        The average cross-entropy cost over the m examples.
    """
    m = x.shape[0]
    predictions = compute_model_prediction(x, w, b)
    # Keep probabilities strictly inside (0, 1): a saturated prediction of exactly
    # 0 or 1 would otherwise produce log(0) and turn the cost into inf/NaN.
    eps = np.finfo(float).eps
    predictions = np.clip(predictions, eps, 1 - eps)
    cost = -1/m * np.sum(y*np.log(predictions) + (1-y)*np.log(1-predictions))
    return cost

In [None]:
def compute_gradient_descent(x_train , y_train , w , b):
    """
    Gradient of the unregularized logistic cost with respect to w and b.

    Despite its name, this function performs no update step; it only
    evaluates the gradient at the given parameters.

    Args:
        x_train: input features (m, n)
        y_train: true labels (m,)
        w: weights (n,)
        b: bias (scalar)
    Returns:
        (dw, db): gradient with respect to the weights (n,) and to the bias.
    """
    sample_count, _ = x_train.shape

    # Prediction error for every example.
    residual = compute_model_prediction(x_train, w, b) - y_train

    # Average the per-example contributions.
    scale = 1 / sample_count
    grad_w = scale * np.dot(x_train.T, residual)
    grad_b = scale * np.sum(residual)

    return grad_w, grad_b
        

In [None]:
def gradient_descent(x_train , y_train , w , b , alpha_in , iteration):
    """
    Batch gradient descent for the unregularized logistic model.

    Args:
        x_train: input features (m, n)
        y_train: true labels (m,)
        w: initial weights (n,)
        b: initial bias (scalar)
        alpha_in: learning rate
        iteration: number of update steps to run
    Returns:
        (w, b, costs, iterations): the final parameters, the cost recorded after
        every update, and the matching list of step indices 0..iteration-1.
    """
    costs = []
    for _ in range(iteration):
        dw_dj , db_dj = compute_gradient_descent(x_train , y_train , w , b)
        # Rebinding (rather than updating in place) leaves the caller's arrays untouched.
        w = w - alpha_in * dw_dj
        b = b - alpha_in * db_dj
        costs.append(compute_cost(x_train , w ,b , y_train))

    iterations = list(range(iteration))
    return w , b , costs , iterations

In [None]:
def compute_cost_with_regularization(x, w, b, y, lambda_):
    """
    Mean binary cross-entropy plus an L2 penalty on the weights.

    Args:
        x: input features (m, n)
        w: weights (n,)
        b: bias (scalar); not penalized
        y: true labels (m,), expected to be 0 or 1
        lambda_: regularization strength
    Returns:
        cross-entropy cost + (lambda_ / (2m)) * sum(w**2)
    """
    m = x.shape[0]
    predictions = compute_model_prediction(x, w, b)

    # Keep probabilities strictly inside (0, 1): a saturated prediction of exactly
    # 0 or 1 would otherwise produce log(0) and turn the cost into inf/NaN.
    eps = np.finfo(float).eps
    predictions = np.clip(predictions, eps, 1 - eps)

    # Compute regular cost
    regular_cost = -1/m * np.sum(y*np.log(predictions) + (1-y)*np.log(1-predictions))

    # Add regularization term (excluding bias)
    reg_cost = (lambda_/(2*m)) * np.sum(w**2)

    return regular_cost + reg_cost


In [None]:
def compute_gradient_with_regularization(x_train, y_train, w, b, lambda_):
    """
    Gradient of the L2-regularized logistic cost with respect to w and b.

    Args:
        x_train: input features (m, n)
        y_train: true labels (m,)
        w: weights (n,)
        b: bias (scalar)
        lambda_: regularization strength
    Returns:
        (dw, db): the weight gradient including the (lambda_/m) * w penalty term,
        and the bias gradient, which carries no penalty.
    """
    sample_count, _ = x_train.shape

    # Prediction error for every example.
    residual = compute_model_prediction(x_train, w, b) - y_train

    # Data term of the weight gradient plus the L2 penalty term.
    grad_w = (1/sample_count) * np.dot(x_train.T, residual) + (lambda_/sample_count) * w
    # The bias is not regularized.
    grad_b = (1/sample_count) * np.sum(residual)

    return grad_w, grad_b


In [None]:
def gradient_descent_with_regularization(x_train, y_train, w, b, alpha_in, iteration, lambda_):
    """
    Batch gradient descent for the L2-regularized logistic model.

    Args:
        x_train: input features (m, n)
        y_train: true labels (m,)
        w: initial weights (n,)
        b: initial bias (scalar)
        alpha_in: learning rate
        iteration: number of update steps to run
        lambda_: regularization strength
    Returns:
        (w, b, costs, iterations): the final parameters, the regularized cost
        recorded after every update, and the step indices 0..iteration-1.
    """
    costs = []
    for _ in range(iteration):
        dw_j, db_j = compute_gradient_with_regularization(x_train, y_train, w, b, lambda_)
        # Rebinding (rather than updating in place) leaves the caller's arrays untouched.
        w = w - alpha_in * dw_j
        b = b - alpha_in * db_j
        costs.append(compute_cost_with_regularization(x_train, w, b, y_train, lambda_))

    iterations = list(range(iteration))
    return w, b, costs, iterations

In [None]:
def plot_cost_vs_iteration(iterations, costs):
    """
    Draw the training cost as a function of the iteration index and show the figure.

    Parameters:
    iterations (list or numpy array): Iteration indices (x axis)
    costs (list or numpy array): Cost value at each iteration (y axis)
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.plot(iterations, costs, marker='o')
    ax.set_title('Cost vs Iteration', fontsize=16)
    ax.set_xlabel('Iteration', fontsize=12)
    ax.set_ylabel('Cost', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

In [None]:
def evaluate_classification_model(X, y_true, w, b, threshold=0.25):
    """
    Evaluate the binary classification model

    Parameters:
    X: Features
    y_true: True labels (0/1)
    w: Weights
    b: Bias
    threshold: Probability at or above which an example is predicted as class 1.
        Defaults to 0.25, the value this function has always used. Note that other
        cells of this notebook binarize predictions at 0.5, so pass the same
        threshold here when the numbers need to be comparable.

    Returns:
    accuracy, precision, recall, f1_score
    """
    # Get predictions (probabilities)
    y_prob = compute_model_prediction(X, w, b)

    # Convert to binary predictions using the decision threshold
    y_pred = (y_prob >= threshold).astype(int)

    # Calculate metrics
    accuracy = np.mean(y_pred == y_true)

    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    predicted_positives = np.sum(y_pred == 1)
    actual_positives = np.sum(y_true == 1)

    # Each ratio falls back to 0 when its denominator is 0, avoiding a division by zero.
    precision = true_positives / predicted_positives if predicted_positives > 0 else 0
    recall = true_positives / actual_positives if actual_positives > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return accuracy, precision, recall, f1

In [None]:
def plot_predictions_vs_actual(y_actual, y_pred, title):
    """
    Scatter predicted values against actual values, with a dashed diagonal
    marking where a perfect prediction would fall, and show the figure.

    Parameters:
    y_actual: Actual values (x axis)
    y_pred: Predicted values (y axis)
    title: Plot title
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_actual, y_pred, alpha=0.5)

    # The reference diagonal spans the combined range of both inputs.
    lower = min(np.min(y_actual), np.min(y_pred))
    upper = max(np.max(y_actual), np.max(y_pred))
    ax.plot([lower, upper], [lower, upper], 'r--')

    # NOTE(review): the axis labels refer to "Prices"; confirm they are intended
    # for the data this notebook plots.
    ax.set_xlabel('Actual Prices')
    ax.set_ylabel('Predicted Prices')
    ax.set_title(title)
    ax.grid(True, linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

In [None]:
def plot_confusion_matrix(y_true, y_pred, title):
    """
    Plot confusion matrix for binary classification

    The matrix is indexed ``cm[true_label, predicted_label]`` so that it agrees
    with the axis labels: rows (y axis) are the true class and columns (x axis)
    are the predicted class.

    Parameters:
    y_true: True labels (0/1)
    y_pred: Predicted labels (0/1)
    title: Plot title
    """
    # Plain arrays make the element-wise comparisons positional.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # cm[0, 0] = TN, cm[0, 1] = FP, cm[1, 0] = FN, cm[1, 1] = TP
    cm = np.zeros((2, 2))
    for true_label in (0, 1):
        for pred_label in (0, 1):
            cm[true_label, pred_label] = np.sum((y_true == true_label) & (y_pred == pred_label))

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title, fontsize=16)
    plt.colorbar()

    classes = ['Not Churned (0)', 'Churned (1)']
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Add text annotations; imshow draws row i at y = i and column j at x = j.
    # White text is used on cells darker than half of the maximum count.
    thresh = cm.max() / 2.
    for i in range(2):
        for j in range(2):
            plt.text(j, i, f"{int(cm[i, j])}",
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()



In [None]:
def plot_roc_curve(y_true, y_score, title):
    """
    Plot the ROC curve and report its area under the curve (AUC) in the title.

    Parameters:
    y_true: True labels (0/1)
    y_score: Predicted probabilities
    title: Plot title
    """
    # Plain arrays make the element-wise comparisons positional.
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)

    # Candidate thresholds: every distinct score plus 0, 1 and +inf. The +inf
    # threshold flags nothing, which guarantees the (0, 0) end point even when a
    # score is exactly 1.0. Thresholds are visited in DESCENDING order, so the set
    # of flagged examples only grows and both rates are non-decreasing; no
    # re-sorting by FPR (with its arbitrary tie order) is needed.
    thresholds = np.unique(np.concatenate([y_score.ravel(), [0.0, 1.0, np.inf]]))[::-1]

    is_positive = y_true == 1
    is_negative = y_true == 0
    n_positive = np.sum(is_positive)
    n_negative = np.sum(is_negative)

    tpr = np.zeros(len(thresholds))
    fpr = np.zeros(len(thresholds))

    # Calculate TPR and FPR for each threshold
    for i, threshold in enumerate(thresholds):
        flagged = y_score >= threshold
        # A rate is reported as 0 when its class is absent (no division by zero).
        tpr[i] = np.sum(flagged & is_positive) / n_positive if n_positive > 0 else 0
        fpr[i] = np.sum(flagged & is_negative) / n_negative if n_negative > 0 else 0

    # Calculate AUC using the trapezoidal rule. NumPy 2 names the function
    # `trapezoid` and deprecates `trapz`; use whichever the installed version has.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz
    auc = trapezoid(tpr, fpr)

    # Plot ROC curve
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, 'b-', linewidth=2)
    plt.plot([0, 1], [0, 1], 'r--')  # Diagonal reference line
    plt.xlim([0, 1])
    plt.ylim([0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{title} (AUC = {auc:.4f})')
    plt.grid(True)
    plt.show()

In [None]:
def plot_feature_importance(X, weights, n_top=20):
    """
    Print and plot features ranked by the magnitude of their model weight.

    Parameters:
    X: DataFrame whose columns name the features (same order as ``weights``)
    weights: Model weights
    n_top: Currently unused; all features are printed and plotted. The parameter
        is kept so existing calls remain valid.
    """
    # Get feature names
    feature_names = X.columns

    # Create DataFrame with feature names and weights
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Weight': weights,
        'Absolute Weight': np.abs(weights)
    })

    # Sort by absolute weight value (importance), largest first
    importance_df = importance_df.sort_values('Absolute Weight', ascending=False)

    sorted_feature_names = importance_df['Feature'].tolist()

    # Clean print in list format: ['feature1', 'feature2', ..., 'featureN']
    print("\nSorted Feature Names (Python list format):")
    print("[" + ", ".join(f"'{col}'" for col in sorted_feature_names) + "]")

    # Plot: bar length is the magnitude, bar colour encodes the sign of the weight.
    plt.figure(figsize=(12, 8))
    colors = ['blue' if w > 0 else 'red' for w in importance_df['Weight']]
    plt.barh(importance_df['Feature'], importance_df['Absolute Weight'], color=colors)
    plt.xlabel('Absolute Weight Magnitude')
    plt.ylabel('Feature')
    plt.title('Feature Importance (Weight Magnitude)')

    # Add a legend. The model predicts sigmoid(x·w + b) for the churn class, so a
    # positive weight raises the predicted churn probability and a negative weight
    # lowers it.
    import matplotlib.patches as mpatches
    blue_patch = mpatches.Patch(color='blue', label='Positive Weight (Increases Churn)')
    red_patch = mpatches.Patch(color='red', label='Negative Weight (Decreases Churn)')
    plt.legend(handles=[blue_patch, red_patch], loc='lower right')

    plt.tight_layout()
    plt.show()

In [None]:
# Train and evaluate the regularized logistic model on all features.
X = final_df.drop('Churn', axis=1)
y = final_df['Churn']

# Hold out 20% of the rows. The held-out features are named X_test and the held-out
# labels y_val; both refer to the same evaluation split.
X_train, X_test, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaling statistics on the training split only.
X_train_scaled, scaling_params = feature_scaling(X_train, return_params=True)

# Apply the training statistics to the held-out split. Iterating over the returned
# parameters guarantees that exactly the columns standardized in training are
# standardized here too; a hand-written column list can drift out of sync and leave
# some features scaled in training but raw at evaluation time.
X_val_scaled = X_test.copy().astype('float')
for col, col_mean in scaling_params['mean'].items():
    col_std = scaling_params['std'][col]
    # Mirror feature_scaling: a column that was constant in training is zeroed.
    X_val_scaled[col] = 0.0 if col_std == 0 else (X_test[col] - col_mean) / col_std

m, n = X_train_scaled.shape
w = np.zeros(n)
b = 0
print("Initial weights and bias:", w, b)

# Train model (learning rate 0.6, 40000 iterations, lambda 0.02)
w, b,  costs, iterations = gradient_descent_with_regularization(X_train_scaled, y_train, w, b, 0.6, 40000, 0.02)
print("Final weights and bias:", w, b)

# Evaluate on training set
train_accuracy, train_precision, train_recall, train_f1 = evaluate_classification_model(X_train_scaled, y_train, w, b)
print("\nTraining Set Performance:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall: {train_recall:.4f}")
print(f"F1 Score: {train_f1:.4f}")

# Evaluate on test set
test_accuracy, test_precision, test_recall, test_f1 = evaluate_classification_model(X_val_scaled, y_val, w, b)
print("\nTest Set Performance:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")

# Predicted churn probabilities for both splits
y_train_pred_prob = compute_model_prediction(X_train_scaled, w , b)
y_test_pred_prob = compute_model_prediction(X_val_scaled, w , b)

# Convert to binary predictions at 0.5 for the confusion matrices below
y_train_pred = (y_train_pred_prob >= 0.5).astype(int)
y_test_pred = (y_test_pred_prob >= 0.5).astype(int)

# Diagnostics: convergence, confusion matrices, ROC curves and weight ranking
plot_cost_vs_iteration(iterations, costs)

plot_confusion_matrix(y_train, y_train_pred, 'Training Set: Confusion Matrix')
plot_confusion_matrix(y_val, y_test_pred, 'Test Set: Confusion Matrix')

plot_roc_curve(y_train, y_train_pred_prob, 'Training Set: ROC Curve')
plot_roc_curve(y_val, y_test_pred_prob, 'Test Set: ROC Curve')

plot_feature_importance(X_train_scaled, w)

In [None]:
# Compare precision / recall / F1 on both splits across several decision thresholds.
print("\n--- Testing different thresholds ---")

# Test different threshold values
thresholds = [0.25, 0.3, 0.4, 0.5, 0.6]

# (label printed in the report, predicted probabilities, true labels)
evaluation_splits = [
    ("Train", y_train_pred_prob, y_train),
    ("Test", y_test_pred_prob, y_val),
]

for threshold in thresholds:
    print(f"\nThreshold = {threshold}")
    for split_name, split_probs, split_labels in evaluation_splits:
        # Apply threshold to predictions
        split_pred = (split_probs >= threshold).astype(int)

        tp = np.sum((split_pred == 1) & (split_labels == 1))
        fp = np.sum((split_pred == 1) & (split_labels == 0))
        fn = np.sum((split_pred == 0) & (split_labels == 1))

        # Each ratio falls back to 0 when its denominator is 0.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        print(f"{split_name}: Precision = {precision:.4f}, Recall = {recall:.4f}, F1 = {f1:.4f}")

In [None]:
# Train and evaluate the regularized logistic model on a reduced feature set.
X = final_df.drop('Churn', axis=1)
y = final_df['Churn']

# Hold out 20% of the rows. The held-out features are named X_test and the held-out
# labels y_val; both refer to the same evaluation split.
X_train, X_test, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaling statistics on the training split only.
X_train_scaled, scaling_params = feature_scaling(X_train, return_params=True)

# Apply the training statistics to the held-out split. Iterating over the returned
# parameters guarantees that exactly the columns standardized in training are
# standardized here too; a hand-written column list can drift out of sync and leave
# some features scaled in training but raw at evaluation time.
X_val_scaled = X_test.copy().astype('float')
for col, col_mean in scaling_params['mean'].items():
    col_std = scaling_params['std'][col]
    # Mirror feature_scaling: a column that was constant in training is zeroed.
    X_val_scaled[col] = 0.0 if col_std == 0 else (X_test[col] - col_mean) / col_std

# Features excluded from this run.
noisy_cols = ['StreamingTV_No internet service', 'DeviceProtection_No internet service', 
 'TechSupport_No internet service', 'StreamingMovies_No internet service', 'InternetService_No', 'OnlineBackup_No internet service',
 'OnlineSecurity_No internet service', 'Dependents_Yes', 'DeviceProtection_Yes', 'tenure_one_year', 'loyalty_score', 
 'PaymentMethod_Credit card (automatic)', 'Partner_Yes',
 'PaymentMethod_Mailed check', 'avg_monthly_spend', 'recent_vs_lifetime_spend', 'tenure_two_year']

# errors='ignore' skips any listed column that is not present in the frame.
X_train_scaled = X_train_scaled.drop(columns=noisy_cols, errors='ignore')
X_val_scaled = X_val_scaled.drop(columns=noisy_cols, errors='ignore')
m, n = X_train_scaled.shape
w = np.zeros(n)
b = 0
print("Initial weights and bias:", w, b)

# Train model (learning rate 0.6, 30000 iterations, lambda 150)
w, b,  costs, iterations = gradient_descent_with_regularization(X_train_scaled, y_train, w, b, 0.6, 30000, 150)
print("Final weights and bias:", w, b)

# Evaluate on training set
train_accuracy, train_precision, train_recall, train_f1 = evaluate_classification_model(X_train_scaled, y_train, w, b)
print("\nTraining Set Performance:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall: {train_recall:.4f}")
print(f"F1 Score: {train_f1:.4f}")

# Evaluate on test set
test_accuracy, test_precision, test_recall, test_f1 = evaluate_classification_model(X_val_scaled, y_val, w, b)
print("\nTest Set Performance:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")

# Predicted churn probabilities for both splits
y_train_pred_prob = compute_model_prediction(X_train_scaled, w , b)
y_test_pred_prob = compute_model_prediction(X_val_scaled, w , b)

# Convert to binary predictions at 0.5 for the confusion matrices below
y_train_pred = (y_train_pred_prob >= 0.5).astype(int)
y_test_pred = (y_test_pred_prob >= 0.5).astype(int)

# Diagnostics: convergence, confusion matrices, ROC curves and weight ranking
plot_cost_vs_iteration(iterations, costs)

plot_confusion_matrix(y_train, y_train_pred, 'Training Set: Confusion Matrix')
plot_confusion_matrix(y_val, y_test_pred, 'Test Set: Confusion Matrix')

plot_roc_curve(y_train, y_train_pred_prob, 'Training Set: ROC Curve')
plot_roc_curve(y_val, y_test_pred_prob, 'Test Set: ROC Curve')

plot_feature_importance(X_train_scaled, w)

In [None]:
# Compare precision / recall / F1 on both splits across several decision thresholds.
print("\n--- Testing different thresholds ---")

# Test different threshold values
thresholds = [0.25, 0.3, 0.4, 0.5, 0.6]

# (label printed in the report, predicted probabilities, true labels)
evaluation_splits = [
    ("Train", y_train_pred_prob, y_train),
    ("Test", y_test_pred_prob, y_val),
]

for threshold in thresholds:
    print(f"\nThreshold = {threshold}")
    for split_name, split_probs, split_labels in evaluation_splits:
        # Apply threshold to predictions
        split_pred = (split_probs >= threshold).astype(int)

        tp = np.sum((split_pred == 1) & (split_labels == 1))
        fp = np.sum((split_pred == 1) & (split_labels == 0))
        fn = np.sum((split_pred == 0) & (split_labels == 1))

        # Each ratio falls back to 0 when its denominator is 0.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        print(f"{split_name}: Precision = {precision:.4f}, Recall = {recall:.4f}, F1 = {f1:.4f}")

In [None]:
def cross_validate_model(X, y, n_folds=5, lambda_values=None, learning_rate=0.5, iterations=10000,
                         random_state=None):
    """
    Perform k-fold cross-validation to evaluate model performance with different lambda values.

    For every fold the training part is scaled with `feature_scaling`, the same
    scaling parameters are applied to the validation part, a fixed list of
    "noisy" columns is dropped, and one model per lambda is trained with
    `gradient_descent_with_regularization` and scored with
    `evaluate_classification_model`.

    Parameters:
    X: Features DataFrame (rows are selected positionally with .iloc)
    y: Target variable (pandas Series, selected positionally with .iloc)
    n_folds: Number of CV folds (must be >= 2 and <= number of rows)
    lambda_values: List of lambda values to test; duplicates are ignored
    learning_rate: Learning rate for gradient descent
    iterations: Number of iterations for gradient descent
    random_state: Optional seed for the fold shuffle. When None (default) the
        global NumPy random state is used, exactly as before; pass an int to
        get the same folds on every run.

    Returns:
    Dictionary keyed by lambda. Each value is a dict holding, for every metric
    (train/val accuracy, precision, recall, f1), the per-fold list plus
    'avg_<metric>' and 'std_<metric>' aggregates.

    Raises:
    ValueError: if n_folds is smaller than 2 or larger than the number of rows.
    """
    if lambda_values is None:
        lambda_values = [100.0, 130, 140, 145, 150, 155, 160.0, 165, 170, 175, 180, 185, 190, 195, 200]
    # Drop duplicate lambdas (order preserved) so no value is trained twice and
    # double-counted in the per-fold lists.
    lambda_values = list(dict.fromkeys(lambda_values))

    n_samples = X.shape[0]
    if n_folds < 2:
        raise ValueError(f"n_folds must be at least 2, got {n_folds}")
    if n_folds > n_samples:
        raise ValueError(f"n_folds ({n_folds}) cannot exceed the number of samples ({n_samples})")

    # Initialize results dictionary
    metric_names = ['train_accuracy', 'val_accuracy',
                    'train_precision', 'val_precision',
                    'train_recall', 'val_recall',
                    'train_f1', 'val_f1']
    results = {lamb: {name: [] for name in metric_names} for lamb in lambda_values}

    # Create indices for k-fold cross-validation
    indices = np.arange(n_samples)
    if random_state is None:
        # Legacy behaviour: honour whatever global seed the notebook has set.
        np.random.shuffle(indices)
    else:
        # A private generator gives reproducible folds without touching global state.
        np.random.RandomState(random_state).shuffle(indices)

    fold_size = n_samples // n_folds
    folds = [indices[i*fold_size:(i+1)*fold_size] for i in range(n_folds)]

    # For the last fold, include any remaining samples
    if n_samples % n_folds != 0:
        folds[-1] = np.concatenate([folds[-1], indices[n_folds*fold_size:]])

    # Columns that are standardised on the validation set with the training statistics
    scaled_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'tenure_month_to_month',
                   'charge_per_tenure', 'avg_monthly_spend', 'tenure_one_year',
                   'tenure_two_year', 'recent_vs_lifetime_spend', 'total_services', 'loyalty_score']

    # Define noisy columns to remove (loop-invariant, so built once)
    noisy_cols = ['StreamingTV_No internet service', 'DeviceProtection_No internet service',
                  'TechSupport_No internet service', 'StreamingMovies_No internet service',
                  'InternetService_No', 'OnlineBackup_No internet service',
                  'OnlineSecurity_No internet service', 'Dependents_Yes', 'DeviceProtection_Yes',
                  'tenure_one_year', 'loyalty_score',
                  'PaymentMethod_Credit card (automatic)', 'Partner_Yes',
                  'PaymentMethod_Mailed check', 'avg_monthly_spend', 'recent_vs_lifetime_spend',
                  'tenure_two_year']

    # Start cross-validation
    for fold_idx, val_indices in enumerate(folds):
        print(f"\nProcessing fold {fold_idx+1}/{n_folds}")

        # Split data into training and validation
        train_indices = np.concatenate([folds[i] for i in range(n_folds) if i != fold_idx])

        X_train_fold = X.iloc[train_indices]
        y_train_fold = y.iloc[train_indices]
        X_val_fold = X.iloc[val_indices]
        y_val_fold = y.iloc[val_indices]

        # Scale features; statistics come from the training fold only
        X_train_scaled, scaling_params = feature_scaling(X_train_fold, return_params=True)

        # Apply the same scaling to validation set.
        # scaling_params is used as {'mean': ..., 'std': ...}, each indexable by column name.
        X_val_scaled = X_val_fold.copy().astype('float')
        for col in scaled_cols:
            if col in scaling_params['mean'] and col in X_val_scaled.columns:
                X_val_scaled[col] = (X_val_fold[col] - scaling_params['mean'][col]) / scaling_params['std'][col]

        # Remove noisy columns (missing ones are silently ignored)
        X_train_clean = X_train_scaled.drop(columns=noisy_cols, errors='ignore')
        X_val_clean = X_val_scaled.drop(columns=noisy_cols, errors='ignore')

        # Evaluate each lambda value
        for lamb in lambda_values:
            print(f"  Testing lambda={lamb}")

            # Initialize weights and bias
            m, n = X_train_clean.shape
            w = np.zeros(n)
            b = 0

            # Train model
            w, b, costs, iterations_list = gradient_descent_with_regularization(
                X_train_clean, y_train_fold, w, b, learning_rate, iterations, lamb)

            # Evaluate on training set
            train_accuracy, train_precision, train_recall, train_f1 = evaluate_classification_model(
                X_train_clean, y_train_fold, w, b)

            # Evaluate on validation set
            val_accuracy, val_precision, val_recall, val_f1 = evaluate_classification_model(
                X_val_clean, y_val_fold, w, b)

            # Store results
            fold_scores = {
                'train_accuracy': train_accuracy, 'val_accuracy': val_accuracy,
                'train_precision': train_precision, 'val_precision': val_precision,
                'train_recall': train_recall, 'val_recall': val_recall,
                'train_f1': train_f1, 'val_f1': val_f1,
            }
            for name, score in fold_scores.items():
                results[lamb][name].append(score)

    # Calculate average metrics across folds
    for lamb in lambda_values:
        # Iterate over the fixed metric list: the dict grows while we add avg_/std_ keys
        for metric in metric_names:
            results[lamb][f'avg_{metric}'] = np.mean(results[lamb][metric])
            results[lamb][f'std_{metric}'] = np.std(results[lamb][metric])
    return results

def visualize_cv_results(cv_results):
    """
    Visualize cross-validation results

    Draws a 2x2 grid (F1, accuracy, precision, recall) showing the mean
    training score and the mean validation score with +/- one std error bars
    against lambda on a log-scaled x axis, then prints the lambda with the
    highest mean validation F1 and a summary table.

    Parameters:
    cv_results: Dictionary with cross-validation results, keyed by lambda.
        Each value must contain 'avg_train_<m>', 'avg_val_<m>' and
        'std_val_<m>' for m in f1, accuracy, precision, recall.

    Raises:
    ValueError: if cv_results is empty.
    """
    if not cv_results:
        raise ValueError("cv_results is empty; nothing to visualize")

    lambda_values = list(cv_results.keys())

    # (metric key, axis/title name, legend name) in subplot order
    panels = [
        ('f1', 'F1 Score', 'F1'),
        ('accuracy', 'Accuracy', 'Accuracy'),
        ('precision', 'Precision', 'Precision'),
        ('recall', 'Recall', 'Recall'),
    ]

    # Collect the per-lambda series once; reused by the plots and the summary table
    stats = {}
    for metric, _, _ in panels:
        stats[metric] = {
            'train_mean': [cv_results[lamb][f'avg_train_{metric}'] for lamb in lambda_values],
            'val_mean': [cv_results[lamb][f'avg_val_{metric}'] for lamb in lambda_values],
            'val_std': [cv_results[lamb][f'std_val_{metric}'] for lamb in lambda_values],
        }

    plt.figure(figsize=(12, 8))
    for position, (metric, axis_name, legend_name) in enumerate(panels, start=1):
        series = stats[metric]
        plt.subplot(2, 2, position)
        plt.errorbar(lambda_values, series['val_mean'], yerr=series['val_std'],
                     fmt='o-', label=f'Validation {legend_name}')
        plt.plot(lambda_values, series['train_mean'], 'x-', label=f'Training {legend_name}')
        plt.xscale('log')
        plt.xlabel('Lambda (Regularization Parameter)')
        plt.ylabel(axis_name)
        plt.title(f'{axis_name} vs Lambda')
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

    # Print best lambda based on validation F1 score
    best_lambda = lambda_values[np.argmax(stats['f1']['val_mean'])]
    print(f"\nBest lambda based on validation F1 score: {best_lambda}")

    # Print summary table of results.
    # Recall/precision columns are labelled as validation metrics (they were
    # previously mislabelled as "... Accuracy").
    print("\nSummary of Cross-Validation Results:")
    summary_df = pd.DataFrame({
        'Lambda': lambda_values,
        'Training F1': stats['f1']['train_mean'],
        'Validation F1': stats['f1']['val_mean'],
        'Validation F1 Std': stats['f1']['val_std'],
        'Training Accuracy': stats['accuracy']['train_mean'],
        'Validation Accuracy': stats['accuracy']['val_mean'],
        'Validation Accuracy Std': stats['accuracy']['val_std'],
        'Validation Recall': stats['recall']['val_mean'],
        'Validation Recall Std': stats['recall']['val_std'],
        'Validation Precision': stats['precision']['val_mean'],
        'Validation Precision Std': stats['precision']['val_std'],
    })
    print(summary_df.round(4))

# Example usage: sweep a grid of regularization strengths with 5-fold CV
lambda_values = [
    60, 70, 80, 90, 100.0, 130, 150, 160.0,
    180, 190, 200, 220, 240, 280, 300, 340,
]

# Separate the predictors from the 'Churn' target column
X = final_df.drop(columns='Churn')
y = final_df['Churn']
print(X.columns)

# Run cross-validation with multiple lambda values
cv_results = cross_validate_model(
    X,
    y,
    n_folds=5,
    lambda_values=lambda_values,
    learning_rate=0.5,
    iterations=10000,
)

# Visualize results
visualize_cv_results(cv_results)