In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, plot_confusion_matrix, plot_roc_curve
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('/content/Churn_Modelling.csv')

# Encode categorical variables
le = LabelEncoder()
df['Geography'] = le.fit_transform(df['Geography'])
df['Gender'] = le.fit_transform(df['Gender'])

# Define features and target variable
X = df.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Exited'])
y = df['Exited']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize/standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize models
lr = LogisticRegression(random_state=42)
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)

# Train Logistic Regression
lr.fit(X_train, y_train)

# Train Random Forest
rf.fit(X_train, y_train)

# Train Gradient Boosting
gb.fit(X_train, y_train)

# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    return accuracy, precision, recall, f1, roc_auc

# Evaluate Logistic Regression
lr_metrics = evaluate_model(lr, X_test, y_test)
print("Logistic Regression: ", lr_metrics)

# Evaluate Random Forest
rf_metrics = evaluate_model(rf, X_test, y_test)
print("Random Forest: ", rf_metrics)

# Evaluate Gradient Boosting
gb_metrics = evaluate_model(gb, X_test, y_test)
print("Gradient Boosting: ", gb_metrics)

# Choose the best model (for example, Gradient Boosting) for further analysis
best_model = gb

# Plot confusion matrix
plot_confusion_matrix(best_model, X_test, y_test)
plt.show()

# Plot ROC curve
plot_roc_curve(best_model, X_test, y_test)
plt.show()


ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (/usr/local/lib/python3.10/dist-packages/sklearn/metrics/__init__.py)

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Load customer data from.csv file
customer_data = pd.read_csv('/content/Churn_Modelling.csv')

# Convert categorical variables to numerical variables using one-hot encoding
customer_data = pd.get_dummies(customer_data, columns=['Geography', 'Gender'])

# Drop non-numerical columns that are not relevant for prediction
customer_data = customer_data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1) # Drop irrelevant columns

# Scale numerical variables using StandardScaler
scaler = StandardScaler()
numerical_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
customer_data[numerical_cols] = scaler.fit_transform(customer_data[numerical_cols])

# Split the data into training and testing sets
X = customer_data.drop(['Exited'], axis=1)
y = customer_data['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Logistic Regression classifier object
log_reg = LogisticRegression(max_iter=10000)

# Train the classifier on the training data
log_reg.fit(X_train, y_train)

# Predict churn for the test data
y_pred_log_reg = log_reg.predict(X_test)

# Evaluate the model using accuracy score, classification report, and confusion matrix
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print(f'Logistic Regression Accuracy: {accuracy_log_reg:.3f}')
print('Logistic Regression Classification Report:')
print(classification_report(y_test, y_pred_log_reg))
print('Logistic Regression Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_log_reg))

# ... (rest of the code remains the same)

# Create a Random Forest classifier object
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
random_forest.fit(X_train, y_train)

# Predict churn for the test data
y_pred_random_forest = random_forest.predict(X_test)

# Evaluate the model using accuracy score, classification report, and confusion matrix
accuracy_random_forest = accuracy_score(y_test, y_pred_random_forest)
print(f'Random Forest Accuracy: {accuracy_random_forest:.3f}')
print('Random Forest Classification Report:')
print(classification_report(y_test, y_pred_random_forest))
print('Random Forest Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_random_forest))

# Create a Gradient Boosting classifier object
from sklearn.ensemble import GradientBoostingClassifier
gradient_boosting = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
gradient_boosting.fit(X_train, y_train)

# Predict churn for the test data
y_pred_gradient_boosting = gradient_boosting.predict(X_test)

# Evaluate the model using accuracy score, classification report, and confusion matrix
accuracy_gradient_boosting = accuracy_score(y_test, y_pred_gradient_boosting)
print(f'Gradient Boosting Accuracy: {accuracy_gradient_boosting:.3f}')
print('Gradient Boosting Classification Report:')
print(classification_report(y_test, y_pred_gradient_boosting))
print('Gradient Boosting Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_gradient_boosting))

# Compare the accuracy of the models
print('Model Comparison:')
print(f'Logistic Regression Accuracy: {accuracy_log_reg:.3f}')
print(f'Random Forest Accuracy: {accuracy_random_forest:.3f}')
print(f'Gradient Boosting Accuracy: {accuracy_gradient_boosting:.3f}')

Logistic Regression Accuracy: 0.811
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1607
           1       0.55      0.20      0.29       393

    accuracy                           0.81      2000
   macro avg       0.69      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000

Logistic Regression Confusion Matrix:
[[1543   64]
 [ 314   79]]
Random Forest Accuracy: 0.864
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.47      0.57       393

    accuracy                           0.86      2000
   macro avg       0.81      0.71      0.75      2000
weighted avg       0.85      0.86      0.85      2000

Random Forest Confusion Matrix:
[[1545   62]
 [ 210  183]]
Gradient Boosting Accuracy: 0.864
Gradient Boosting Classification Report:
 