In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
dataset = pd.read_csv('Churn_Prediction_Preprocessed.csv')
dataset = pd.get_dummies(dataset, drop_first=True)

X = dataset.drop('churn_flag', axis=1)
y = dataset['churn_flag']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [5]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [8]:
def evaluate_logistic_regression_pca(n_components):
    # Apply PCA
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Train logistic regression
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train_pca, y_train)

    # Predictions
    y_pred = clf.predict(X_test_pca)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)

    # Output
    print(f"\n=== Logistic Regression with PCA (n_components={n_components}) ===")
    print(f"Accuracy: {acc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("Classification Report:")
    print(cr)

    return acc

In [9]:
# Example usage
accuracy_logistic_pca_2 = evaluate_logistic_regression_pca(2)


=== Logistic Regression with PCA (n_components=2) ===
Accuracy: 0.9240
Confusion Matrix:
[[924   0]
 [ 76   0]]
Classification Report:
              precision    recall  f1-score   support

       False       0.92      1.00      0.96       924
        True       0.00      0.00      0.00        76

    accuracy                           0.92      1000
   macro avg       0.46      0.50      0.48      1000
weighted avg       0.85      0.92      0.89      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
accuracy_logistic_pca_1 = evaluate_logistic_regression_pca(1)


=== Logistic Regression with PCA (n_components=1) ===
Accuracy: 0.9240
Confusion Matrix:
[[924   0]
 [ 76   0]]
Classification Report:
              precision    recall  f1-score   support

       False       0.92      1.00      0.96       924
        True       0.00      0.00      0.00        76

    accuracy                           0.92      1000
   macro avg       0.46      0.50      0.48      1000
weighted avg       0.85      0.92      0.89      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.decomposition import PCA

def evaluate_logistic_regression_with_smote(n_components):
    # Apply PCA
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Apply SMOTE to training data
    smote = SMOTE(random_state=0)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_pca, y_train)

    print(f"\nOriginal training set class distribution:\n{y_train.value_counts()}")
    print(f"Resampled training set class distribution:\n{pd.Series(y_train_resampled).value_counts()}")

    # Train Logistic Regression
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train_resampled, y_train_resampled)

    # Predict on test set
    y_pred = clf.predict(X_test_pca)

    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)

    # Print results
    print(f"\n=== Logistic Regression with SMOTE + PCA (n_components={n_components}) ===")
    print(f"Accuracy: {acc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("Classification Report:")
    print(cr)

    return acc


In [14]:
accuracy_smote = evaluate_logistic_regression_with_smote(n_components=2)



Original training set class distribution:
churn_flag
False    3590
True      410
Name: count, dtype: int64
Resampled training set class distribution:
churn_flag
False    3590
True     3590
Name: count, dtype: int64

=== Logistic Regression with SMOTE + PCA (n_components=2) ===
Accuracy: 0.4110
Confusion Matrix:
[[358 566]
 [ 23  53]]
Classification Report:
              precision    recall  f1-score   support

       False       0.94      0.39      0.55       924
        True       0.09      0.70      0.15        76

    accuracy                           0.41      1000
   macro avg       0.51      0.54      0.35      1000
weighted avg       0.87      0.41      0.52      1000



In [15]:
accuracy_smote = evaluate_logistic_regression_with_smote(n_components=3)



Original training set class distribution:
churn_flag
False    3590
True      410
Name: count, dtype: int64
Resampled training set class distribution:
churn_flag
False    3590
True     3590
Name: count, dtype: int64

=== Logistic Regression with SMOTE + PCA (n_components=3) ===
Accuracy: 0.4430
Confusion Matrix:
[[397 527]
 [ 30  46]]
Classification Report:
              precision    recall  f1-score   support

       False       0.93      0.43      0.59       924
        True       0.08      0.61      0.14        76

    accuracy                           0.44      1000
   macro avg       0.51      0.52      0.36      1000
weighted avg       0.87      0.44      0.55      1000



In [16]:
accuracy_smote = evaluate_logistic_regression_with_smote(n_components=4)



Original training set class distribution:
churn_flag
False    3590
True      410
Name: count, dtype: int64
Resampled training set class distribution:
churn_flag
False    3590
True     3590
Name: count, dtype: int64

=== Logistic Regression with SMOTE + PCA (n_components=4) ===
Accuracy: 0.4680
Confusion Matrix:
[[424 500]
 [ 32  44]]
Classification Report:
              precision    recall  f1-score   support

       False       0.93      0.46      0.61       924
        True       0.08      0.58      0.14        76

    accuracy                           0.47      1000
   macro avg       0.51      0.52      0.38      1000
weighted avg       0.87      0.47      0.58      1000



In [17]:
accuracy_smote = evaluate_logistic_regression_with_smote(n_components=5)



Original training set class distribution:
churn_flag
False    3590
True      410
Name: count, dtype: int64
Resampled training set class distribution:
churn_flag
False    3590
True     3590
Name: count, dtype: int64

=== Logistic Regression with SMOTE + PCA (n_components=5) ===
Accuracy: 0.5080
Confusion Matrix:
[[475 449]
 [ 43  33]]
Classification Report:
              precision    recall  f1-score   support

       False       0.92      0.51      0.66       924
        True       0.07      0.43      0.12        76

    accuracy                           0.51      1000
   macro avg       0.49      0.47      0.39      1000
weighted avg       0.85      0.51      0.62      1000



In [20]:
accuracy_smote = evaluate_logistic_regression_with_smote(n_components=6)


Original training set class distribution:
churn_flag
False    3590
True      410
Name: count, dtype: int64
Resampled training set class distribution:
churn_flag
False    3590
True     3590
Name: count, dtype: int64

=== Logistic Regression with SMOTE + PCA (n_components=6) ===
Accuracy: 0.5120
Confusion Matrix:
[[477 447]
 [ 41  35]]
Classification Report:
              precision    recall  f1-score   support

       False       0.92      0.52      0.66       924
        True       0.07      0.46      0.13        76

    accuracy                           0.51      1000
   macro avg       0.50      0.49      0.39      1000
weighted avg       0.86      0.51      0.62      1000

