In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

In [35]:
df = pd.read_csv('Churn_Data_Cleaned.csv')

In [36]:
# Drop unnecessary columns
df_prepared = df.drop(columns=[
    'surname', 'surname_tfidf_0', 'surname_tfidf_1', 'surname_tfidf_2',
    'surname_tfidf_3', 'surname_tfidf_4'
], errors='ignore')

In [37]:
if 'geography' not in df.columns:
    df['geography'] = df[['france', 'germany', 'spain']].idxmax(axis=1)
if 'gender' not in df.columns:
    df['gender'] = df[['female', 'male']].idxmax(axis=1)

In [39]:
X = df.drop('exited', axis=1)
y = df['exited']

In [40]:
# Identify categorical and numerical features for the preprocessor
categorical_features = ['geography', 'gender']
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns


In [41]:
# Create the preprocessing pipelines for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # Keep other columns (if any)
)

In [42]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [43]:
# Define the models to be tested
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
}

results = {}
best_model_name = ''
best_f1_score = 0.0
best_model_pipeline = None

for name, model in models.items():
    # Create the full pipeline with preprocessor and model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', model)])

In [44]:
 # Train the model
print(f"--- Training {name} ---")
pipeline.fit(X_train, y_train)

--- Training XGBoost ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [45]:
 # Make predictions on the test set
y_pred = pipeline.predict(X_test)

In [46]:
 # Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)
f1 = report['weighted avg']['f1-score']

results[name] = {
    'accuracy': accuracy,
    'f1_score': f1,
    'report': classification_report(y_test, y_pred)
}

print(f"Results for {name}:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  F1-Score: {f1:.4f}")
print(results[name]['report'])

Results for XGBoost:
  Accuracy: 0.8656
  F1-Score: 0.8583
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     27615
           1       0.74      0.56      0.64      7391

    accuracy                           0.87     35006
   macro avg       0.81      0.75      0.78     35006
weighted avg       0.86      0.87      0.86     35006



In [47]:
# Save the best model based on F1-score
if f1 > best_f1_score:
        best_f1_score = f1
        best_model_name = name
        best_model_pipeline = pipeline

In [48]:
# Generate and save confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Exited', 'Exited'], yticklabels=['Not Exited', 'Exited'])
plt.title(f'Confusion Matrix - {name}')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.savefig(f'confusion_matrix_{name.replace(" ", "_")}.png')
plt.close()

In [49]:
# --- 3. Model Selection ---
print(f"\n--- Model Selection Summary ---")
print(f"The best performing model is: **{best_model_name}** with a weighted F1-score of {best_f1_score:.4f}")



--- Model Selection Summary ---
The best performing model is: **XGBoost** with a weighted F1-score of 0.8583


In [50]:
# --- 4. Save the Best Model ---
joblib.dump(best_model_pipeline, 'best_churn_model.pkl')
print("\nThe best model has been saved as 'best_churn_model.pkl'")


The best model has been saved as 'best_churn_model.pkl'
