In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

In [2]:
# 1. Load dataset and encode categorical variables
dataset = pd.read_csv('Churn_Prediction_Preprocessed.csv')
dataset = pd.get_dummies(dataset, drop_first=True)

# 2. Define features and target
X = dataset.drop('churn_flag', axis=1)
y = dataset['churn_flag']

# 3. Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [3]:
# 4. Apply SMOTE only to training data
smote = SMOTE(random_state=0)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"\nOriginal training set class distribution:\n{y_train.value_counts()}")
print(f"Resampled training set class distribution:\n{pd.Series(y_train_resampled).value_counts()}")


Original training set class distribution:
churn_flag
False    3590
True      410
Name: count, dtype: int64
Resampled training set class distribution:
churn_flag
False    3590
True     3590
Name: count, dtype: int64


In [9]:
#def evaluate_logistic_regression_pca(n_components):
# 5. Scale features (fit scaler on resampled training data)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [10]:
# 6. Apply PCA
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
    
# 7. Train logistic regression
clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X_train_pca, y_train_resampled)
    
# 8. Predict on test set
y_pred = clf.predict(X_test_pca)
    
# 9. Evaluate performance
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

In [13]:
# 10. Print results
print(f"\n=== Logistic Regression with PCA (n_components=2) ===")
print(f"Accuracy: {acc:.4f}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(cr)


=== Logistic Regression with PCA (n_components=2) ===
Accuracy: 0.4810
Confusion Matrix:
[[446 478]
 [ 41  35]]
Classification Report:
              precision    recall  f1-score   support

       False       0.92      0.48      0.63       924
        True       0.07      0.46      0.12        76

    accuracy                           0.48      1000
   macro avg       0.49      0.47      0.38      1000
weighted avg       0.85      0.48      0.59      1000

