# CUSTOMER CHURN PREDICTION
## Develop a model to predict customer churn for a subscription- based service or business. Use historical customer data, including features like usage behavior and customer demographics, and try algorithms like Logistic Regression, Random Forests, or Gradient Boosting to predict churn.

In [1]:
# importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

In [2]:
# Assuming 'Exited' is the target variable
target_variable = 'Exited'

In [16]:
# Loading the dataset
data = pd.read_csv("D:\\Codesoft\\churn\\Churn_Modelling.csv")

In [4]:
# Exploring the data
print(data.head())

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [5]:
# Data Preprocessing
data = data.drop(['RowNumber', 'CustomerId'], axis=1)  # Drop irrelevant columns

In [6]:
# Converting categorical variables to numerical using one-hot encoding
data = pd.get_dummies(data, columns=['Geography', 'Gender'], drop_first=True)

In [7]:
# Droping 'Surname' column for now 
data = data.drop(['Surname'], axis=1)

In [8]:
# Spliting the data into features (X) and target variable (y)
X = data.drop(target_variable, axis=1)
y = data[target_variable]

In [9]:
# Handling imbalanced data using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [10]:
# Spliting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [11]:
# Standardizing features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Model Training and Evaluation

# Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=42)
gb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}
gb_grid = GridSearchCV(estimator=gb_model, param_grid=gb_params, scoring='accuracy', cv=5)
gb_grid.fit(X_train_scaled, y_train)
gb_best_model = gb_grid.best_estimator_
gb_predictions = gb_best_model.predict(X_test_scaled)
print("Gradient Boosting:")
print("Best Parameters:", gb_grid.best_params_)
print("Accuracy:", accuracy_score(y_test, gb_predictions))
print(confusion_matrix(y_test, gb_predictions))
print(classification_report(y_test, gb_predictions))

Gradient Boosting:
Best Parameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200}
Accuracy: 0.8772755806654112
[[1412  221]
 [ 170 1383]]
              precision    recall  f1-score   support

           0       0.89      0.86      0.88      1633
           1       0.86      0.89      0.88      1553

    accuracy                           0.88      3186
   macro avg       0.88      0.88      0.88      3186
weighted avg       0.88      0.88      0.88      3186



In [15]:
# Accepting the user input for features
user_input = {
    'CreditScore': float(input("Enter Credit Score: ")),
    'Age': float(input("Enter Age: ")),
    'Tenure': float(input("Enter Tenure: ")),
    'Balance': float(input("Enter Balance: ")),
    'NumOfProducts': float(input("Enter Number of Products: ")),
    'HasCrCard': float(input("Enter 1 if Has Credit Card, 0 otherwise: ")),
    'IsActiveMember': float(input("Enter 1 if Active Member, 0 otherwise: ")),
    'EstimatedSalary': float(input("Enter Estimated Salary: ")),
    'Geography_Germany': float(input("Enter 1 if in Germany, 0 otherwise: ")),
    'Geography_Spain': float(input("Enter 1 if in Spain, 0 otherwise: ")),
    'Gender_Male': float(input("Enter 1 if Male, 0 otherwise: ")),
}

# Converting user input to DataFrame
user_input_df = pd.DataFrame([user_input])

# Standardizing user input
user_input_scaled = scaler.transform(user_input_df)

# Making predictions on the user input
user_predictions = gb_best_model.predict(user_input_scaled)

print('\nPredicted Churn Status for User Input:')
print("Churn" if user_predictions[0] == 1 else "No Churn")


Enter Credit Score: 619
Enter Age: 35
Enter Tenure: 7
Enter Balance: 25000
Enter Number of Products: 4
Enter 1 if Has Credit Card, 0 otherwise: 1
Enter 1 if Active Member, 0 otherwise: 1
Enter Estimated Salary: 100250
Enter 1 if in Germany, 0 otherwise: 1
Enter 1 if in Spain, 0 otherwise: 0
Enter 1 if Male, 0 otherwise: 1

Predicted Churn Status for User Input:
No Churn
