In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [31]:
# Location of the churn CSV. The path looks like a Colab Google Drive mount
# (assumption -- confirm); change this single constant to run elsewhere.
DATA_PATH = "/content/drive/MyDrive/DATASETS/Customer churn/Churn_Modelling.csv"

# Load the raw customer table into a DataFrame.
data = pd.read_csv(DATA_PATH)

In [35]:
# Preview the first five rows as a quick sanity check of the loaded table
data.head(n=5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [33]:
# Drop irrelevant columns.
# errors='ignore' makes the cell safe to re-run: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [34]:
# Preview the first five rows of the table after the column drop
data.head(n=5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Encode categorical variables.
# One encoder per column, so each keeps its own fitted `classes_` and the
# label -> integer mapping can be inspected or inverted later. Refitting a
# single encoder on the second column would discard the Geography mapping.
geography_encoder = LabelEncoder()
gender_encoder = LabelEncoder()

data['Geography'] = geography_encoder.fit_transform(data['Geography'])
data['Gender'] = gender_encoder.fit_transform(data['Gender'])

# Backward-compatible alias: the previous single `label_encoder` ended up
# fitted on the Gender column, so keep that name bound to the same state.
label_encoder = gender_encoder

In [7]:
# Separate the target column from the features, then hold out 20% of the
# rows for testing (fixed seed so the split is reproducible)
y = data['Exited']
X = data.drop(columns='Exited')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Standardize the features: learn the scaling statistics from the training
# split only, then apply the same transform to both splits.
# Note: X_train / X_test are rebound to the scaled arrays, so re-running this
# cell alone would refit the scaler on already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [9]:
# Show the shape of each split, in the order X_train, X_test, y_train, y_test
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((8000, 10), (2000, 10), (8000,), (2000,))

## Training and evaluation

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score


In [11]:
# Create the three candidate classifiers, all with the same fixed seed
log_reg, rand_forest, grad_boost = (
    LogisticRegression(random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
)

In [12]:
# Logistic Regression: fit on the training split, predict the test split,
# then compute accuracy and the per-class text report
y_pred_log_reg = log_reg.fit(X_train, y_train).predict(X_test)
log_reg_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_log_reg)
log_reg_report = classification_report(y_true=y_test, y_pred=y_pred_log_reg)

In [13]:
# Random Forest: fit on the training split, predict the test split,
# then compute accuracy and the per-class text report
y_pred_rand_forest = rand_forest.fit(X_train, y_train).predict(X_test)
rand_forest_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rand_forest)
rand_forest_report = classification_report(y_true=y_test, y_pred=y_pred_rand_forest)

In [14]:
# Gradient Boosting: fit on the training split, predict the test split,
# then compute accuracy and the per-class text report
y_pred_grad_boost = grad_boost.fit(X_train, y_train).predict(X_test)
grad_boost_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_grad_boost)
grad_boost_report = classification_report(y_true=y_test, y_pred=y_pred_grad_boost)

In [15]:
# Collect the test-set accuracy of each model (insertion order is preserved)
# and print the summary dictionary
results = {}
results['Logistic Regression Accuracy'] = log_reg_accuracy
results['Random Forest Accuracy'] = rand_forest_accuracy
results['Gradient Boosting Accuracy'] = grad_boost_accuracy

print(results)

{'Logistic Regression Accuracy': 0.8155, 'Random Forest Accuracy': 0.8645, 'Gradient Boosting Accuracy': 0.866}


## Conclusion

- The Gradient Boosting model achieved the highest accuracy, slightly outperforming the Random Forest model.
- Logistic Regression performed the worst, indicating that a linear model may not be ideal for this problem.
- Further model tuning, feature engineering, or more advanced ensemble techniques (Random Forest and Gradient Boosting are already ensembles; stacking these models is one further option) could improve results.


In [52]:
import numpy as np

def manual_test(model, scaler, user_input):
    """Predict churn for a single hand-entered customer and print the verdict.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict``.
    scaler
        Fitted transformer exposing ``transform``; pass a falsy value
        (e.g. ``None``) to skip scaling.
    user_input
        Sequence of ten values in the order CreditScore, Geography, Gender,
        Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember,
        EstimatedSalary. Geography (index 1) and Gender (index 2) may be
        given as their string labels or as already-encoded integers.

    Returns
    -------
    The array returned by ``model.predict`` for the single input row.

    Raises
    ------
    ValueError
        If Geography or Gender is a string that is not a known label.
    """
    # LabelEncoder assigns integer codes in sorted label order, so the
    # training data is encoded as France=0, Germany=1, Spain=2 and
    # Female=0, Male=1. These tables must mirror that encoding exactly.
    geography_codes = {'France': 0, 'Germany': 1, 'Spain': 2}
    gender_codes = {'Female': 0, 'Male': 1}

    # Work on a copy so the caller's list is not modified.
    features = list(user_input)

    for position, field, codes in (
        (1, 'Geography', geography_codes),
        (2, 'Gender', gender_codes),
    ):
        value = features[position]
        if isinstance(value, str):
            if value not in codes:
                raise ValueError(
                    f"Unknown {field} label {value!r}; expected one of {sorted(codes)}"
                )
            features[position] = codes[value]

    # Shape the single sample as one row with one column per feature.
    input_array = np.array(features, dtype=float).reshape(1, -1)

    if scaler:
        input_array = scaler.transform(input_array)

    prediction = model.predict(input_array)

    if prediction[0] == 1:
        print("The customer is likely to churn.")
    else:
        print("The customer is likely to stay.")

    return prediction

In [57]:
# Draw one random row to use as a template for a manual test input
data.sample(n=1)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
3643,679,France,Male,45,3,146758.24,1,1,0,48466.89,0


- The input list must contain, in this order: CreditScore, Geography, Gender, Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, and EstimatedSalary.
- In the 'HasCrCard', 'IsActiveMember', and 'Exited' columns, 1 means "yes" and 0 means "no".

In [53]:
# Try the gradient boosting model on a hand-written customer record
manual_test(
    model=grad_boost,
    scaler=scaler,
    user_input=[700, 'Germany', 'Male', 38, 5, 70000.50, 2, 1, 1, 90000.00],
)

The customer is likely to stay.




array([0])

In [56]:
# Try the gradient boosting model on a second hand-written customer record
manual_test(
    model=grad_boost,
    scaler=scaler,
    user_input=[500, 'Spain', 'Female', 25, 6, 90000.50, 3, 1, 1, 120000.00],
)

The customer is likely to churn.




array([1])