In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,confusion_matrix

In [23]:
df = pd.read_csv('notebook/data/Customer-Churn-Records.csv')
df.drop(columns=['RowNumber','CustomerId','Surname'],inplace=True)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


### creating the preprocessor object and data transformer

In [5]:
def get_preprocessor_object():
    num_pipeline_scaling = Pipeline(steps=[
        ('imputer',SimpleImputer(strategy='mean')),
        ('scaler',StandardScaler())
    ])
    
    cat_pipeline = Pipeline(steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('one_hot_encoding',OneHotEncoder(sparse=False)),
        ('scaler',StandardScaler())
    ])
    
    num_pipeline_missing = Pipeline(steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('scaler',StandardScaler())
    ])
    
    
    num_columns_scaling = ['CreditScore','Age','Balance','EstimatedSalary']
    num_columns_missing = ['Tenure','NumOfProducts','HasCrCard','IsActiveMember','Complain','EstimatedSalary','Satisfaction Score']
    cat_columns = ['Geography','Gender','Card Type']
    
    
    preprocessor = ColumnTransformer(
        [
            ('num_columns_missing',num_pipeline_missing, num_columns_missing),
            ('num_columns_scaling',num_pipeline_scaling,num_columns_scaling),
            ('cat_columns',cat_pipeline, cat_columns)
        ]
    )
    
    return preprocessor

In [6]:
def get_transformed_data(train_df, test_df):
    train_input_features = train_df.drop(columns=['Exited'])
    test_input_features = test_df.drop(columns=['Exited'])
    
    
    preprocessor = get_preprocessor_object()
    
    train_input_features_arr = preprocessor.fit_transform(train_input_features)
    test_input_features_arr = preprocessor.transform(test_input_features)
    
    target_column = 'Exited'
    train_target = train_df[target_column]
    test_target = test_df[target_column]
    
    train_features_arr = np.c_[
        train_input_features_arr,np.array(train_target)
    ]
    
    test_features_arr = np.c_[
        test_input_features_arr, np.array(test_target)
    ]
    
    
    return train_features_arr, test_features_arr

In [7]:
train_df , test_df = train_test_split(df, test_size=0.2 , random_state = 42)

In [8]:
print(train_df['Exited'].value_counts())
print(test_df['Exited'].value_counts())

Exited
0    6355
1    1645
Name: count, dtype: int64
Exited
0    1607
1     393
Name: count, dtype: int64


In [9]:
train_arr , test_arr = get_transformed_data(train_df, test_df)



In [10]:
print(train_arr.shape)
print(test_arr.shape)

(8000, 21)
(2000, 21)


### Model Building

In [11]:
X_train, X_test, y_train, y_test = (
    train_arr[:,:-1],
    test_arr[:,:-1],
    
    train_arr[:,-1],
    test_arr[:,-1]
)

In [12]:
models_dict ={
    'logistic_regression':LogisticRegression(),
    'decision_tree':DecisionTreeClassifier(),
    'random_forest':RandomForestClassifier(),
    'svm':SVC(),
    'KNN':KNeighborsClassifier(),
    'gradient_boosting':GradientBoostingClassifier(),
    'xgboost':XGBClassifier()
}

In [13]:
def evaluate_models(X_train, X_test, y_train, y_test, models):
    report = {}
    
    for i in range (len(models)):
        model = list(models.values())[i]
        
        model.fit(X_train, y_train)
        
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        
        training_accuracy = accuracy_score(y_train,y_pred_train)
        test_accuracy = accuracy_score(y_test, y_pred_test)
        precision = precision_score(y_test, y_pred_test)
        recall = recall_score(y_test, y_pred_test)
        f1score = f1_score(y_test, y_pred_test)
        
        report[list(models.keys())[i]] = [training_accuracy,test_accuracy,precision,recall,f1score]
    
    report_df = pd.DataFrame.from_dict(data=report,orient='index').reset_index()
    report_df.rename(columns={
        'index':'models',
        0:'training_accuracy',
        1:'test_accuracy',
        2:'f1_score',
        3:'precision',
        4:'recall'
    },inplace=True)
        
    return report_df


In [14]:
report = evaluate_models(X_train,X_test, y_train, y_test , models_dict)
report

Unnamed: 0,models,training_accuracy,test_accuracy,f1_score,precision,recall
0,logistic_regression,0.9985,0.999,0.997455,0.997455,0.997455
1,decision_tree,1.0,0.997,0.994885,0.989822,0.992347
2,random_forest,1.0,0.999,0.997455,0.997455,0.997455
3,svm,0.9985,0.999,0.997455,0.997455,0.997455
4,KNN,0.993875,0.9865,1.0,0.931298,0.964427
5,gradient_boosting,0.9995,0.9985,0.994924,0.997455,0.996188
6,xgboost,0.999875,0.999,0.997455,0.997455,0.997455


**Note**
- In the data Complain feature is very highly correlated with Exited so it contributes about more than 15% of the model accuracy
- when standard scaler is applied in all the columns , non tree based algorithms are also performing well
- without standard scaler the Logistic regression, KNN and SVM were not performing well

### logistic regression , KNN and SVM are not performing well  (without standard scaler)

In [15]:
pd.read_csv('notebook/data/without_scaling.csv').drop(columns=['Unnamed: 0'])


Unnamed: 0,models,training_accuracy,test_accuracy,f1_score,precision,recall
0,logistic_regression,0.794375,0.8035,0.0,0.0,0.0
1,decision_tree,1.0,0.997,0.994885,0.989822,0.992347
2,random_forest,1.0,0.999,0.997455,0.997455,0.997455
3,svm,0.794375,0.8035,0.0,0.0,0.0
4,KNN,0.811375,0.76,0.2,0.073791,0.107807
5,gradient_boosting,0.9995,0.9985,0.994924,0.997455,0.996188
6,xgboost,0.999875,0.999,0.997455,0.997455,0.997455


**Observations**
- logistic regression precision and recall are zero because the true positive is zero
- It is not being able to classify True positive correctly because data might mostly contain the non linear relationship and  logistic regression is limited to capturing linear relationships between features and the target variable
- similarly there is also problem of difference in scale in the data i.e why these algorithms are not performing well compared to that of tree based algorithm

### Models doesnot perform well without Complain feature

In [16]:
pd.read_csv('notebook/data/without_Complain.csv').drop(columns=['Unnamed: 0'])

Unnamed: 0,models,training_accuracy,test_accuracy,f1_score,precision,recall
0,logistic_regression,0.811375,0.813,0.565517,0.208651,0.304833
1,decision_tree,1.0,0.7885,0.465116,0.508906,0.486027
2,random_forest,1.0,0.864,0.761905,0.447837,0.564103
3,svm,0.8665,0.8545,0.768421,0.371501,0.500858
4,KNN,0.85825,0.8205,0.591398,0.279898,0.379965
5,gradient_boosting,0.873625,0.864,0.741036,0.473282,0.57764
6,xgboost,0.968375,0.8565,0.673203,0.524173,0.589413


In [17]:
model = models_dict['gradient_boosting']

In [18]:
X_test

array([[-0.69539349,  0.80843615, -1.54035103, ...,  1.73262835,
        -0.58023704, -0.57388614],
       [-1.38944225,  0.80843615,  0.64920267, ..., -0.57715782,
        -0.58023704,  1.74250594],
       [-0.3483691 ,  0.80843615,  0.64920267, ...,  1.73262835,
        -0.58023704, -0.57388614],
       ...,
       [ 0.69270405, -0.91668767,  0.64920267, ..., -0.57715782,
         1.72343359, -0.57388614],
       [-0.3483691 , -0.91668767,  0.64920267, ..., -0.57715782,
         1.72343359, -0.57388614],
       [-1.38944225, -0.91668767,  0.64920267, ..., -0.57715782,
        -0.58023704,  1.74250594]])

In [19]:
X_test[2].reshape(1,-1)

array([[-0.3483691 ,  0.80843615,  0.64920267, -1.02583358, -0.50994211,
        -0.72797953, -1.43218616, -0.52560743,  0.48508334, -1.21847056,
        -0.72797953, -0.99850112, -0.57946723,  1.73494238,  1.09499335,
        -1.09499335, -0.57812007,  1.73262835, -0.58023704, -0.57388614]])

In [20]:
int(model.predict(X_test[100].reshape(1,-1))[0])

1

In [21]:
model.get_params(deep=True)

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}