In [44]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [63]:
# Function to clean data
filePath = r'C:\Users\DS\Downloads\CustomerChurn.csv'
df = pd.read_csv(filePath)
def cleaningData(df, drop_columns=None):
    # Check duplicates records
    duplicate = df.duplicated().sum()
    print(f"Number of duplicate rows: {duplicate}")

    if duplicate > 0:
        df = df.drop_duplicates()
        print("Duplicate rows have been dropped.")
    
    if drop_columns:
        df = df.drop(columns=[col for col in drop_columns if col in df.columns], axis=1)
        print(f"Dropped columns: {drop_columns}")

    if 'Total Charges' in df.columns:
        df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')

    return df
#check missing values 
print("Missing values before cleaning:")
print(df.isnull().sum())

# pass non-informative columns to cleanning method for drop
df = cleaningData(df, drop_columns=['LoyaltyID', 'Customer ID'])

# Drop rows with missing values (if any)
df.dropna(inplace=True)
print("Missing values after cleaning:")
print(df.isnull().sum())

# Encode categorical variables
categorical_features = ['Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines', 
                        'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 
                        'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 
                        'Paperless Billing', 'Payment Method', ' Churn']

label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# seperate target feature from whole data 
X = df.drop(columns=' Churn')
y = df[' Churn']

# Split the data into training and testing sets(80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn.fit(X_train_scaled, y_train)

# Make predictions
y_pred = knn.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Missing values before cleaning:
LoyaltyID            0
Customer ID          0
Senior Citizen       0
Partner              0
Dependents           0
Tenure               0
Phone Service        0
Multiple Lines       0
Internet Service     0
Online Security      0
Online Backup        0
Device Protection    0
Tech Support         0
Streaming TV         0
Streaming Movies     0
Contract             0
Paperless Billing    0
Payment Method       0
Monthly Charges      0
Total Charges        1
 Churn               0
dtype: int64
Number of duplicate rows: 0
Dropped columns: ['LoyaltyID', 'Customer ID']
Missing values after cleaning:
Senior Citizen       0
Partner              0
Dependents           0
Tenure               0
Phone Service        0
Multiple Lines       0
Internet Service     0
Online Security      0
Online Backup        0
Device Protection    0
Tech Support         0
Streaming TV         0
Streaming Movies     0
Contract             0
Paperless Billing    0
Payment Method       0

In [64]:

# Function to evaluate model
def evaluate_model(X_train, X_test, y_train, y_test):
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    return accuracy, conf_matrix, class_report


In [65]:
# Apply filter method: Chi-Square on categorical features
select_k_best = SelectKBest(chi2, k=10)
X_train_chi = select_k_best.fit_transform(X_train, y_train)
X_test_chi = select_k_best.transform(X_test)

# Standardize the features after feature selection
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_chi)
X_test_scaled = scaler.transform(X_test_chi)
features_no = select_k_best.get_support()

# original feature names 
original_feature_names = X_train.columns
# names of the selected features
selected_feature_names = original_feature_names[features_no]
print("Number of original features:", X_train.shape[1])
print("Number of reduced features:", X_test_chi.shape[1])
print("Names of selected  features:")
for feature in selected_feature_names:
    print(feature)
# Evaluate model with Chi-Square selected features
print("\nModel Performance with Chi-Square Selected Features:")
chi_accuracy, chi_conf_matrix, chi_class_report = evaluate_model(X_train_chi, X_test_chi, y_train, y_test)
print(f"Accuracy: {chi_accuracy}")
print("Confusion Matrix:")
print(chi_conf_matrix)
print("Classification Report:")
print(chi_class_report)



Number of original features: 18
Number of reduced features: 10
Names of selected  features:
Senior Citizen
Dependents
Tenure
Online Security
Online Backup
Device Protection
Tech Support
Contract
Monthly Charges
Total Charges

Model Performance with Chi-Square Selected Features:
Accuracy: 0.7746979388770433
Confusion Matrix:
[[922 111]
 [206 168]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.85      1033
           1       0.60      0.45      0.51       374

    accuracy                           0.77      1407
   macro avg       0.71      0.67      0.68      1407
weighted avg       0.76      0.77      0.76      1407



In [66]:
# Apply wrapper method: RFE with KNN
knn = KNeighborsClassifier(n_neighbors=5)
rfe = RFE(estimator=knn, n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train_scaled, y_train)
X_test_rfe = rfe.transform(X_test_scaled)
# Evaluate model with RFE selected features
print("\nModel Performance with RFE Selected Features:")
rfe_accuracy, rfe_conf_matrix, rfe_class_report = evaluate_model(X_train_rfe, X_test_rfe, y_train, y_test)
print(f"Accuracy: {rfe_accuracy}")
print("Confusion Matrix:")
print(rfe_conf_matrix)
print("Classification Report:")
print(rfe_class_report)


Model Performance with RFE Selected Features:
Accuracy: 0.7697228144989339
Confusion Matrix:
[[900 133]
 [191 183]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.87      0.85      1033
           1       0.58      0.49      0.53       374

    accuracy                           0.77      1407
   macro avg       0.70      0.68      0.69      1407
weighted avg       0.76      0.77      0.76      1407



In [67]:
# Apply embedded method: Feature importance from RandomForest
rf = RandomForestClassifier()
rf.fit(X_train_scaled, y_train)
importances = rf.feature_importances_
indices = importances.argsort()[-10:]  # Select top 10 features

X_train_embedded = X_train_scaled[:, indices]
X_test_embedded = X_test_scaled[:, indices]

# Evaluate model with embedded method selected features
print("\nModel Performance with Embedded Method Selected Features:")
embed_accuracy, embed_conf_matrix, embed_class_report = evaluate_model(X_train_embedded, X_test_embedded, y_train, y_test)
print(f"Accuracy: {embed_accuracy}")
print("Confusion Matrix:")
print(embed_conf_matrix)
print("Classification Report:")
print(embed_class_report)



Model Performance with Embedded Method Selected Features:
Accuracy: 0.7697228144989339
Confusion Matrix:
[[900 133]
 [191 183]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.87      0.85      1033
           1       0.58      0.49      0.53       374

    accuracy                           0.77      1407
   macro avg       0.70      0.68      0.69      1407
weighted avg       0.76      0.77      0.76      1407



In [70]:
# Compare the results
print("\nComparison of Model Performance:")
print(f"Original Model Accuracy: {accuracy}")
print(f"Chi-Square Selected Features Model Accuracy: {chi_accuracy}")
print(f"Wrapper Selected Features Model Accuracy: {rfe_accuracy}")
print(f"Embedded Method Selected Features Model Accuracy: {embed_accuracy}")


Comparison of Model Performance:
Original Model Accuracy: 0.7412935323383084
Chi-Square Selected Features Model Accuracy: 0.7746979388770433
Wrapper Selected Features Model Accuracy: 0.7697228144989339
Embedded Method Selected Features Model Accuracy: 0.7697228144989339
