In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV

from collections import Counter

In [2]:
df = pd.read_csv("kk.csv")
df.shape

(100, 5)

In [3]:
df['churn'].value_counts()

1    79
0    21
Name: churn, dtype: int64

## Data Preprocessing

In [4]:
df.isnull().sum()

effort    0
vi        0
time      0
churn     0
id        0
dtype: int64

In [5]:
df.head()

Unnamed: 0,effort,vi,time,churn,id
0,222,3.0,74,1,1
1,128,4.0,32,1,2
2,114,5.0,23,1,3
3,88,3.0,29,1,4
4,335,5.0,67,1,5


# Train and Test Split

In [6]:
x = df.drop(['id'], axis=1)
y = df['churn']
X_train,X_test,y_train,y_test=train_test_split(x, y, train_size=0.8, stratify = y, random_state=100)

In [7]:
y_train.shape, y_test.shape

((80,), (20,))

In [8]:
y_train.value_counts()/len(y_train)

1    0.7875
0    0.2125
Name: churn, dtype: float64

In [9]:
y_test.value_counts()/len(y_test)

1    0.8
0    0.2
Name: churn, dtype: float64

In [10]:
from sklearn.preprocessing import StandardScaler
Scaler_X = StandardScaler()
X_train = Scaler_X.fit_transform(X_train)
X_test = Scaler_X.transform(X_test)

SMOTE Technique

In [11]:
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.datasets import make_classification
from matplotlib import pyplot
from numpy import where
counter = Counter(y_train)
print('Before',counter)
smt = SMOTE()

X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

counter = Counter(y_train_sm)
print('After',counter)




Before Counter({1: 63, 0: 17})
After Counter({0: 63, 1: 63})




# Model Building - Imbalanced data

In [12]:
model = list()
resample = list()
precision = list()
recall = list()
F1score = list()
AUCROC = list()

In [13]:
def test_eval(clf_model, X_test, y_test, algo=None, sampling=None):
    
    y_prob=clf_model.predict_proba(X_test)
    y_pred=clf_model.predict(X_test)

    print('Confusion Matrix')
    print('='*60)
    print(confusion_matrix(y_test,y_pred),"\n")
    print('Classification Report')
    print('='*60)
    print(classification_report(y_test,y_pred),"\n")
    print('AUC-ROC')
    print('='*60)
    print(roc_auc_score(y_test, y_prob[:,1]))
          
    model.append(algo)
    precision.append(precision_score(y_test,y_pred))
    recall.append(recall_score(y_test,y_pred))
    F1score.append(f1_score(y_test,y_pred))
    AUCROC.append(roc_auc_score(y_test, y_prob[:,1]))
    resample.append(sampling)


## Model-1: Logistic Regression

### 1. `Original Unsampled Data`

In [14]:
log_model=LogisticRegression()

params={'C':np.logspace(-10, 1, 15),'class_weight':[None,'balanced'],'penalty':['l1','l2']}

cv = StratifiedKFold(n_splits=5, random_state=100, shuffle=True)

clf_LR = GridSearchCV(log_model, params, cv=cv, scoring='roc_auc', n_jobs=-1)
clf_LR.fit(X_train, y_train)
clf_LR.best_estimator_

LogisticRegression(C=1e-10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression', 'actual')

Confusion Matrix
[[ 0  4]
 [ 0 16]] 

Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.80      1.00      0.89        16

    accuracy                           0.80        20
   macro avg       0.40      0.50      0.44        20
weighted avg       0.64      0.80      0.71        20
 

AUC-ROC
1.0


  _warn_prf(average, modifier, msg_start, len(result))


### `2.SMOTE Resampling`

In [16]:
clf_LR.fit(X_train_sm, y_train_sm)
clf_LR.best_estimator_

LogisticRegression(C=1e-10, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression', 'smote')

Confusion Matrix
[[ 4  0]
 [ 0 16]] 

Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00        16

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20
 

AUC-ROC
1.0
