In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

In [2]:
# 1. Load dataset and encode categorical variables
dataset = pd.read_csv('Churn_Prediction_Preprocessed.csv')
dataset = pd.get_dummies(dataset, drop_first=True)

# 2. Define features and target
X = dataset.drop('churn_flag', axis=1)
y = dataset['churn_flag']

# 3. Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [3]:
# 4. Apply SMOTE only to training data
smote = SMOTE(random_state=0)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"\nOriginal training set class distribution:\n{y_train.value_counts()}")
print(f"Resampled training set class distribution:\n{pd.Series(y_train_resampled).value_counts()}")


Original training set class distribution:
churn_flag
False    3590
True      410
Name: count, dtype: int64
Resampled training set class distribution:
churn_flag
False    3590
True     3590
Name: count, dtype: int64


In [4]:
#def evaluate_logistic_regression_pca(n_components):
# 5. Scale features (fit scaler on resampled training data)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [29]:
# 6. Apply PCA
pca = PCA(n_components=4)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
    
# 7. Train logistic regression
clf = RandomForestClassifier(random_state=0, n_estimators=100)
clf.fit(X_train_pca, y_train_resampled)
    
# 8. Predict on test set
y_pred = clf.predict(X_test_pca)
    
# 9. Evaluate performance
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

In [12]:
# 10. Print results
print(f"\n=== Random Forest with PCA (n_components=2) ===")
print(f"Accuracy: {acc:.4f}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(cr)


=== Random Forest with PCA (n_components=2) ===
Accuracy: 0.8270
Confusion Matrix:
[[819 105]
 [ 68   8]]
Classification Report:
              precision    recall  f1-score   support

       False       0.92      0.89      0.90       924
        True       0.07      0.11      0.08        76

    accuracy                           0.83      1000
   macro avg       0.50      0.50      0.49      1000
weighted avg       0.86      0.83      0.84      1000



In [28]:
# 10. Print results
print(f"\n=== Random Forest with PCA (n_components=3) ===")
print(f"Accuracy: {acc:.4f}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(cr)


=== Random Forest with PCA (n_components=3) ===
Accuracy: 0.8600
Confusion Matrix:
[[849  75]
 [ 65  11]]
Classification Report:
              precision    recall  f1-score   support

       False       0.93      0.92      0.92       924
        True       0.13      0.14      0.14        76

    accuracy                           0.86      1000
   macro avg       0.53      0.53      0.53      1000
weighted avg       0.87      0.86      0.86      1000



In [31]:
# 10. Print results
print(f"\n=== Random Forest with PCA (n_components=4) ===")
print(f"Accuracy: {acc:.4f}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(cr)


=== Random Forest with PCA (n_components=4) ===
Accuracy: 0.8730
Confusion Matrix:
[[866  58]
 [ 69   7]]
Classification Report:
              precision    recall  f1-score   support

       False       0.93      0.94      0.93       924
        True       0.11      0.09      0.10        76

    accuracy                           0.87      1000
   macro avg       0.52      0.51      0.52      1000
weighted avg       0.86      0.87      0.87      1000



In [30]:
# 10. Print results
print(f"\n=== Random Forest with PCA (n_components=5) ===")
print(f"Accuracy: {acc:.4f}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(cr)


=== Random Forest with PCA (n_components=5) ===
Accuracy: 0.8730
Confusion Matrix:
[[866  58]
 [ 69   7]]
Classification Report:
              precision    recall  f1-score   support

       False       0.93      0.94      0.93       924
        True       0.11      0.09      0.10        76

    accuracy                           0.87      1000
   macro avg       0.52      0.51      0.52      1000
weighted avg       0.86      0.87      0.87      1000



In [24]:
# 10. Print results
print(f"\n=== Random Forest with PCA (n_components=5) ===")
print(f"Accuracy: {acc:.4f}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(cr)


=== Random Forest with PCA (n_components=5) ===
Accuracy: 0.8800
Confusion Matrix:
[[874  50]
 [ 70   6]]
Classification Report:
              precision    recall  f1-score   support

       False       0.93      0.95      0.94       924
        True       0.11      0.08      0.09        76

    accuracy                           0.88      1000
   macro avg       0.52      0.51      0.51      1000
weighted avg       0.86      0.88      0.87      1000



In [26]:
# 10. Print results
print(f"\n=== Random Forest with PCA (n_components=6) ===")
print(f"Accuracy: {acc:.4f}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(cr)


=== Random Forest with PCA (n_components=6) ===
Accuracy: 0.8920
Confusion Matrix:
[[890  34]
 [ 74   2]]
Classification Report:
              precision    recall  f1-score   support

       False       0.92      0.96      0.94       924
        True       0.06      0.03      0.04        76

    accuracy                           0.89      1000
   macro avg       0.49      0.49      0.49      1000
weighted avg       0.86      0.89      0.87      1000

