## I. IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('new_churn_data.csv')

## II. DATA PREPARATION

In [3]:
## Data Preparation
user_identifier = dataset['user']
dataset = dataset.drop(columns=['user'])

In [4]:
## One-Hot Encoding
dataset.housing.value_counts()
dataset = pd.get_dummies(dataset)
dataset.columns

Index(['churn', 'age', 'deposits', 'withdrawal', 'purchases_partners',
       'purchases', 'cc_taken', 'cc_recommended', 'cc_disliked', 'cc_liked',
       'cc_application_begin', 'app_downloaded', 'web_user', 'ios_user',
       'android_user', 'registered_phones', 'waiting_4_loan', 'cancelled_loan',
       'received_loan', 'rejected_loan', 'left_for_two_month_plus',
       'left_for_one_month', 'reward_rate', 'is_referred', 'housing_O',
       'housing_R', 'housing_na', 'payment_type_Bi-Weekly',
       'payment_type_Monthly', 'payment_type_Semi-Monthly',
       'payment_type_Weekly', 'payment_type_na', 'zodiac_sign_Aquarius',
       'zodiac_sign_Aries', 'zodiac_sign_Cancer', 'zodiac_sign_Capricorn',
       'zodiac_sign_Gemini', 'zodiac_sign_Leo', 'zodiac_sign_Libra',
       'zodiac_sign_Pisces', 'zodiac_sign_Sagittarius', 'zodiac_sign_Scorpio',
       'zodiac_sign_Taurus', 'zodiac_sign_Virgo', 'zodiac_sign_na'],
      dtype='object')

In [7]:
dataset = dataset.drop(columns=['housing_na', 'zodiac_sign_na', 'payment_type_na'])

In [6]:
dataset

Unnamed: 0,churn,age,deposits,withdrawal,purchases_partners,purchases,cc_taken,cc_recommended,cc_disliked,cc_liked,...,zodiac_sign_Capricorn,zodiac_sign_Gemini,zodiac_sign_Leo,zodiac_sign_Libra,zodiac_sign_Pisces,zodiac_sign_Sagittarius,zodiac_sign_Scorpio,zodiac_sign_Taurus,zodiac_sign_Virgo,zodiac_sign_na
0,0,37.0,0,0,0,0,0,0,0,0,...,False,False,True,False,False,False,False,False,False,False
1,0,28.0,0,0,1,0,0,96,0,0,...,False,False,True,False,False,False,False,False,False,False
2,0,35.0,47,2,86,47,0,285,0,0,...,True,False,False,False,False,False,False,False,False,False
3,0,26.0,26,3,38,25,0,74,0,0,...,True,False,False,False,False,False,False,False,False,False
4,1,27.0,0,0,2,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26991,1,24.0,0,0,0,0,0,81,0,0,...,False,False,True,False,False,False,False,False,False,False
26992,1,26.0,0,0,2,0,0,1,0,0,...,False,False,False,False,False,False,False,False,False,False
26993,0,22.0,0,0,37,0,0,98,0,0,...,False,False,False,False,False,False,False,True,False,False
26994,1,46.0,2,0,16,2,0,58,0,0,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns=['churn']), dataset['churn'],
                                                    test_size = 0.2,
                                                    random_state = 0)

In [9]:
# Balancing the Training set
y_train.value_counts()

Unnamed: 0_level_0,count
churn,Unnamed: 1_level_1
0,12656
1,8940


In [10]:
pos_index = y_train[y_train.values == 1].index
neg_index = y_train[y_train.values == 0].index

In [11]:
if len(pos_index) > len(neg_index):
  higher = pos_index
  lower = neg_index
else:
  higher = neg_index
  lower = pos_index

In [12]:
random.seed(0)
higher = np.random.choice(higher, size=len(lower))
lower = np.asarray(lower)

In [13]:
new_indexes = np.concatenate((lower, higher))

In [14]:
X_train = X_train.loc[new_indexes, ]
y_train = y_train[new_indexes]

In [15]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))
X_test2 = pd.DataFrame(sc_X.transform(X_test))
X_train2.columns = X_train.columns.values
X_test2.columns = X_test.columns.values
X_train2.index = X_train.index.values
X_test2.index = X_test.index.values

## III. MODEL BUILDING

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train2, y_train)

In [18]:
# Predicting Test Set
y_pred = classifier.predict(X_test2)

In [20]:
# Evaluating Results
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1672 1494]
 [ 578 1656]]


In [21]:
accuracy_score(y_test, y_pred)

0.6162962962962963

In [22]:
f1_score(y_test, y_pred)

0.6151560178306092

In [23]:
precision_score(y_test, y_pred)

0.5257142857142857

In [24]:
recall_score(y_test, y_pred)

0.7412712623097583

In [25]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train2, y=y_train, cv=10)

In [26]:
accuracies.mean()

np.float64(0.64082774049217)

In [27]:
# Analyzing Coefficients
pd.concat([pd.DataFrame(X_train2.columns, columns=['features']),
           pd.DataFrame(np.transpose(classifier.coef_), columns=["coef"])],
          axis=1)

Unnamed: 0,features,coef
0,age,-0.189932
1,deposits,0.479216
2,withdrawal,0.053189
3,purchases_partners,-0.744847
4,purchases,-0.627448
5,cc_taken,0.054178
6,cc_recommended,0.120321
7,cc_disliked,-0.016553
8,cc_liked,0.000359
9,cc_application_begin,0.044014


## IV. FEATURE SELECTION

In [28]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [30]:
# Model to Test
classifier = LogisticRegression()
rfe = RFE(estimator=classifier, n_features_to_select=20)
rfe = rfe.fit(X_train2, y_train)

In [31]:
# summarize the selection of the attributes
rfe.ranking_

array([ 1,  1,  1,  1,  1,  1,  1, 12, 22,  1,  3,  1,  1,  2,  1,  1,  1,
        1,  1,  1,  1,  1,  4,  7, 10,  1,  9,  1, 13,  8, 17, 11,  6, 19,
       14, 21,  5, 16, 20, 18, 15])

In [32]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train2[X_train2.columns[rfe.support_]], y_train)

In [33]:
# Predicting test set
y_pred = classifier.predict(X_test2[X_test2.columns[rfe.support_]])

In [34]:
# Evaluating results
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1637 1529]
 [ 587 1647]]


In [35]:
accuracy_score(y_test, y_pred)

0.6081481481481481

In [36]:
f1_score(y_test, y_pred)

0.6088724584103512

In [37]:
recall_score(y_test, y_pred)

0.7372426141450313

In [39]:
precision_score(y_test, y_pred)

0.5185768261964736

In [40]:
# Analyzing Coefficients
pd.concat([pd.DataFrame(X_train2.columns[rfe.support_], columns=['features']),
           pd.DataFrame(np.transpose(classifier.coef_), columns=["coef"])],
          axis=1)

Unnamed: 0,features,coef
0,age,-0.19892
1,deposits,0.470418
2,withdrawal,0.056777
3,purchases_partners,-0.731655
4,purchases,-0.615442
5,cc_taken,0.049983
6,cc_recommended,0.12714
7,cc_application_begin,0.04982
8,web_user,0.120987
9,ios_user,0.062607


In [42]:
# Formatting Final Results
final_results = pd.concat([y_test, user_identifier], axis=1).dropna()
final_results['predicted_churn'] = y_pred
final_results = final_results[['user', 'churn', 'predicted_churn']].reset_index(drop=True)

In [43]:
final_results

Unnamed: 0,user,churn,predicted_churn
0,53016,1.0,1
1,54963,0.0,0
2,10511,0.0,0
3,64269,0.0,1
4,30978,0.0,0
...,...,...,...
5395,50513,1.0,0
5396,42775,0.0,1
5397,15284,1.0,1
5398,10432,1.0,0
