## Step 1. Importing Libraries

In [1]:
import pandas as pd
import numpy as np


In [2]:
df=pd.read_csv('data/Churn_Modelling.csv')

In [3]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


#### now that we have scuessfully imported and loaded our dataset, let's do the basics first 

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [5]:
# well the data has no complains in data types, and data seems pretty structured and clean alreadr

In [6]:
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [7]:
df.duplicated().sum()

np.int64(0)

In [8]:
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [9]:
# no outliers or anything , so far so good

#### now after the basics let's start removing the unwanted and useless columns 

In [10]:
df.drop(columns='RowNumber', inplace=True)

In [41]:
df.drop(columns='CustomerId', inplace=True)

In [42]:
# rest all the columns are required for our analysis , though customer id and surname are not pretty useful 
# we will see if evrythhing goes well then we will keep them othrwise we will drop them later

#### let's check for the categories of categorical columns for our deep understanding 

In [43]:
df['Gender'].value_counts() 

Gender
Male      5457
Female    4543
Name: count, dtype: int64

In [44]:
df['Geography'].value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

In [45]:
# nothing wrong with the data , we can proceed with splitiing input and output variables
X = df.drop(columns='Exited')
y = df['Exited']

In [46]:
X.head()

Unnamed: 0,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88
1,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63
4,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


#### encoding the categorical fields

In [47]:
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

In [48]:
le = LabelEncoder()
X['Gender'] = le.fit_transform(X['Gender'])  # M=1, F=0


In [49]:
y.value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

#### Train test splitting 

In [50]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 , stratify=y)

In [51]:
encoder = ce.LeaveOneOutEncoder(return_df=True)
X_train_loo = encoder.fit_transform(X_train, y_train)

In [52]:
X_train_loo.head()

Unnamed: 0,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
2151,0.333333,753,0.158818,1,57,7,0.0,1,1,0,159475.08
8392,0.20375,739,0.330339,1,32,3,102128.27,1,1,0,63981.37
5006,0.0,755,0.330838,0,37,0,113865.23,2,1,1,117396.25
4117,0.222222,561,0.159068,1,37,5,0.0,2,1,0,83093.25
7182,0.0,692,0.330838,1,49,6,110540.43,2,0,1,107472.99


In [54]:
X_test_loo = encoder.transform(X_test)


#### Now we will do Modeling , starting with the most basic to the most advanced 

In [55]:
from sklearn.linear_model import LogisticRegression

In [56]:
lr= LogisticRegression(max_iter=1000)

In [58]:
lr.fit(X_train_loo, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [59]:
y_pred = lr.predict(X_test_loo)

In [60]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [61]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.811


## 🚀 Accuracy - 81 %

#### well not too bad for the most basic one , let's test it for other models as well

In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [63]:
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Naive Bayes": GaussianNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [64]:
print("Model Accuracies:\n" + "-"*50)

for name, model in models.items():
    model.fit(X_train_loo, y_train)
    y_pred = model.predict(X_test_loo)

    acc = accuracy_score(y_test, y_pred)
    print(f"{name:<25}: {acc:.4f}")

Model Accuracies:
--------------------------------------------------
Decision Tree            : 0.7965
Random Forest            : 0.8160
Gradient Boosting        : 0.7965
Naive Bayes              : 0.7845
K-Nearest Neighbors      : 0.7640
Support Vector Machine   : 0.7965
XGBoost                  : 0.2035


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


### okay so the best of all currently is Random forest model
### Accuracy - 81.60 %

### now we will do fine tuning and optimizations to achieve and Squeeze out more accuracy

In [78]:
from sklearn.model_selection import GridSearchCV

In [79]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}


In [81]:
rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy'
)

grid_search.fit(X_train_loo, y_train)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits


540 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\welcome\OneDrive\Desktop\CODSOFT\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\welcome\OneDrive\Desktop\CODSOFT\venv\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\welcome\OneDrive\Desktop\CODSOFT\venv\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
    

In [82]:
best_model = grid_search.best_estimator_

In [83]:
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}


In [84]:
y_prednew = best_model.predict(X_test_loo)

In [85]:
print("Accuracy:", accuracy_score(y_test, y_prednew))

Accuracy: 0.8185


## Final Accuracy - 81.85 %

In [95]:
import joblib

joblib.dump(best_model, 'fine_best_model.pkl')


['fine_best_model.pkl']