In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve
import xgboost as xgb
from imblearn.combine import SMOTETomek
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

# --- Data loading and preparation -------------------------------------------
# Read the Telco churn CSV. 'TotalCharges' is coerced to numeric so that
# unparseable entries become NaN; rows with a NaN 'TotalCharges' are dropped.
data = pd.read_csv(r"E:\Machine Learning\archive\WA_Fn-UseC_-Telco-Customer-Churn.csv")
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data = data.dropna(subset=['TotalCharges'])

# Continuous columns that get standardised (after the train/test split below).
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Integer codes for every categorical value in `columns_to_convert`.
# The codes are arbitrary identifiers rather than a meaningful ordering
# (e.g. 'Two year' -> 2 but 'One year' -> 3); they are kept unchanged so that
# anything else encoded with this table stays consistent.
labels = {
    "Male": 1,
    "Female": 0,
    "Yes": 1,
    "No": 0,
    "No phone service": 0,
    "Fiber optic": 1,
    "DSL": 2,
    "No internet service": 0,
    "Month-to-month": 1,
    "Two year": 2,
    "One year": 3,
    "Electronic check": 1,
    "Mailed check": 2,
    "Bank transfer (automatic)": 3,
    "Credit card (automatic)": 4
}

columns_to_convert = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod', 'Churn'
]

for column in columns_to_convert:
    encoded = data[column].map(labels)
    # `Series.map` turns any value missing from `labels` into NaN without
    # complaint; surface that as an error instead of training on silent NaNs.
    unmapped = data.loc[encoded.isna() & data[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {column!r} contains values with no entry in `labels`: "
            f"{sorted(map(str, unmapped))}"
        )
    data[column] = encoded

# Feature matrix: everything except the target. The string identifier column
# is removed as well because it is not a numeric feature and cannot be fed to
# XGBoost or PCA. NOTE(review): 'customerID' is assumed from the standard
# Telco churn CSV layout; `errors='ignore'` makes this a no-op if it is absent.
X = data.drop(columns='Churn').drop(columns=['customerID'], errors='ignore')
y = data['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3, stratify=y
)

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no information from the held-out rows leaks into training.
# As a consequence `data` itself keeps the raw (unscaled) values.
scaler = StandardScaler()
X_train = X_train.copy()  # explicit copies: avoid writing into views of `X`
X_test = X_test.copy()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# --- Model training and scoring ---------------------------------------------
# Hyper-parameters for the DART-boosted XGBoost classifier, gathered in one
# place so they can be read (and tweaked) independently of the constructor.
xgb_params = {
    'booster': 'dart',
    'n_estimators': 350,
    'max_depth': 10,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'min_child_weight': 2,
    'reg_alpha': 0.3,
    'reg_lambda': 0.8,
    # DART-specific dropout settings
    'sample_type': 'uniform',
    'normalize_type': 'tree',
    'rate_drop': 0.1,
    'skip_drop': 0.1,
}
estimator = xgb.XGBClassifier(**xgb_params)
estimator.fit(X_train, y_train)

# 5-fold cross-validated accuracy on the training split. `cross_val_score`
# works on clones, so the fitted `estimator` above is left untouched.
cv_scores = cross_val_score(
    estimator, X_train, y_train, scoring='accuracy', cv=5
)
print("Cross-validation accuracy scores: ", cv_scores)
print("Mean cross-validation accuracy: ", cv_scores.mean())

# Hard class predictions and positive-class probabilities on the test split.
y_pred = estimator.predict(X_test)
y_pred_proba = estimator.predict_proba(X_test)[:, 1]

# Accuracy of the predictions above (what `estimator.score` computes).
test_accuracy = accuracy_score(y_test, y_pred)
print("Test accuracy: ", test_accuracy)

# --- Confusion matrix and per-class report ----------------------------------
class_names = ['No Churn', 'Churn']
conf_matrix = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap and label it through the Axes
# object that seaborn returns.
cm_ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()

# Precision / recall / F1 per class, as text.
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

# --- ROC curve ---------------------------------------------------------------
fpr, tpr, roc_thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

_, roc_ax = plt.subplots()
roc_ax.plot(
    fpr, tpr,
    color='darkorange', lw=2,
    label='ROC curve (area = %0.2f)' % roc_auc,
)
# Diagonal reference line (TPR == FPR).
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.legend(loc="lower right")
plt.show()

# --- Precision-recall curve --------------------------------------------------
precision, recall, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)

_, pr_ax = plt.subplots()
pr_ax.plot(recall, precision, color='blue', lw=2)
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
plt.show()

# --- Feature importances -----------------------------------------------------
# Pair each importance reported by the fitted model with its column name and
# order them from most to least important.
feature_importances = (
    pd.Series(estimator.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

plt.figure(figsize=(10, 8))
sns.barplot(x=feature_importances, y=feature_importances.index)
plt.title('Feature Importances')
plt.show()

# --- 2-D PCA projection of the training data --------------------------------
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)

# Scatter the first two principal components, coloured by the churn label.
first_component = X_pca[:, 0]
second_component = X_pca[:, 1]

plt.figure(figsize=(10, 8))
sns.scatterplot(
    x=first_component,
    y=second_component,
    hue=y_train,
    palette='viridis',
)
plt.title('PCA of Training Data')
plt.show()

# --- Pairwise relationships and correlations --------------------------------
# Pair plot of the frame's variables, coloured by churn, with KDEs on the
# diagonal.
sns.pairplot(data, hue='Churn', diag_kind='kde')
plt.show()

# Correlation matrix over the numeric columns only. Calling `data.corr()` on a
# frame that still holds a non-numeric column raises on pandas >= 2.0, so the
# numeric subset is selected explicitly first.
corr_matrix = data.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# --- Missing values per column ----------------------------------------------
missing_values = data.isna().sum()
plt.figure(figsize=(12, 6))
sns.barplot(x=missing_values.index, y=missing_values.values)
plt.xticks(rotation=90)
plt.title('Missing Values')
plt.show()

# --- Charge distributions split by churn ------------------------------------
# One filled KDE per charge column; each churn class is normalised on its own
# (common_norm=False) so the two shapes are directly comparable.
kde_panels = (
    ('MonthlyCharges', 'Distribution of Monthly Charges by Churn'),
    ('TotalCharges', 'Distribution of Total Charges by Churn'),
)
for charge_column, panel_title in kde_panels:
    sns.kdeplot(
        data=data,
        x=charge_column,
        hue='Churn',
        fill=True,
        common_norm=False,
        palette='crest',
        alpha=0.5,
        linewidth=0,
    )
    plt.title(panel_title)
    plt.show()

# --- Tenure by churn class ---------------------------------------------------
sns.boxplot(data=data, x='Churn', y='tenure')
plt.title('Tenure vs Churn')
plt.show()


In [63]:
from joblib import Parallel, delayed 
import joblib 

# Serialise the model object to 'Churn.pkl' in the current working directory.
# NOTE(review): `best_model` is not assigned in any cell visible in this
# export (the classifier trained above is bound to `estimator`). Presumably it
# was defined in a cell that is not included here; confirm before re-running,
# otherwise this line raises NameError.
joblib.dump(best_model, 'Churn.pkl') 


['Churn.pkl']