In [1]:
import pandas as pd
# Load the transaction-level data set from the Excel workbook in the working directory.
merchant_level = pd.read_excel('Data Set.xlsx')
# Preview the first rows (columns: merchant, time, amount_usd_in_cents).
merchant_level.head()

Unnamed: 0,merchant,time,amount_usd_in_cents
0,faa029c6b0,2034-06-17 23:34:14,6349
1,ed7a7d91aa,2034-12-27 00:40:38,3854
2,5608f200cf,2034-04-30 01:29:42,789
3,15b1a0d61e,2034-09-16 01:06:23,4452
4,4770051790,2034-07-22 16:21:42,20203


In [2]:
# Count missing values per column; every column reports 0, so there are no nulls to handle.
merchant_level.isna().sum()

merchant               0
time                   0
amount_usd_in_cents    0
dtype: int64

# Computing IQR-based outlier limits (note: no rows are actually removed below)

In [3]:
# First and third quartiles of the transaction amount (in cents), computed in one call.
Q1, Q3 = merchant_level['amount_usd_in_cents'].quantile([0.25, 0.75])
Q1, Q3

(3220.0, 12467.25)

In [4]:
# Interquartile range: the spread of the middle 50% of transaction amounts.
IQR = Q3 - Q1
IQR

9247.25

In [5]:
# Standard 1.5 * IQR fences around the quartiles.
# NOTE(review): these limits are only displayed; no later cell in this notebook
# filters merchant_level with them, so outliers remain in the data.
whisker = 1.5*IQR
lower_limit = Q1 - whisker
upper_limit = Q3 + whisker
lower_limit, upper_limit

(-10650.875, 26338.125)

In [6]:
# Derive the calendar quarter (1-4) of each transaction timestamp.
# NOTE(review): `.dt.quarter` drops the year; the displayed rows contain both 2033
# and 2034 timestamps, so the same quarter number mixes different years.
merchant_level['quarter'] = merchant_level['time'].dt.quarter

In [7]:
# Display the frame to confirm the new `quarter` column.
merchant_level

Unnamed: 0,merchant,time,amount_usd_in_cents,quarter
0,faa029c6b0,2034-06-17 23:34:14,6349,2
1,ed7a7d91aa,2034-12-27 00:40:38,3854,4
2,5608f200cf,2034-04-30 01:29:42,789,2
3,15b1a0d61e,2034-09-16 01:06:23,4452,3
4,4770051790,2034-07-22 16:21:42,20203,3
...,...,...,...,...
100407,4f65280858,2033-11-17 19:45:58,7490,4
100408,d701c0b3db,2034-01-20 15:52:07,93743,1
100409,4b683b2bc5,2034-08-21 23:08:25,10472,3
100410,fc7f400429,2033-03-04 02:01:49,3932,1


In [8]:
# Distinct quarter values present in the data.
merchant_level['quarter'].unique()

array([2, 4, 3, 1], dtype=int64)

In [9]:
# Build the binary `churn` label from the payment quarter:
# quarters 1-2 -> 0 ("No Churn"), quarters 3-4 -> 1 ("Churn").
# The stray `%timeit` line magic was removed: with no statement after it, it timed nothing.
# A vectorised comparison replaces the row-by-row `apply(lambda ...)`; negating `<= 2`
# reproduces the lambda's result exactly for every value.
# NOTE(review): the label is a pure function of `quarter`, so any model that receives
# `quarter` as a feature can reproduce it perfectly.
is_first_half = merchant_level['quarter'] <= 2
merchant_level['churn'] = (~is_first_half).astype('int64')

In [10]:
# Display the frame to confirm the new `churn` column.
merchant_level

Unnamed: 0,merchant,time,amount_usd_in_cents,quarter,churn
0,faa029c6b0,2034-06-17 23:34:14,6349,2,0
1,ed7a7d91aa,2034-12-27 00:40:38,3854,4,1
2,5608f200cf,2034-04-30 01:29:42,789,2,0
3,15b1a0d61e,2034-09-16 01:06:23,4452,3,1
4,4770051790,2034-07-22 16:21:42,20203,3,1
...,...,...,...,...,...
100407,4f65280858,2033-11-17 19:45:58,7490,4,1
100408,d701c0b3db,2034-01-20 15:52:07,93743,1,0
100409,4b683b2bc5,2034-08-21 23:08:25,10472,3,1
100410,fc7f400429,2033-03-04 02:01:49,3932,1,0


In [12]:
# Tally the rows carrying each churn label (0 = not churn, 1 = churn) and print both totals.
churn_labels = merchant_level['churn']
count1 = churn_labels.eq(0).sum()
count2 = churn_labels.eq(1).sum()
print('Not Churn : ',count1,'\nChurn : ',count2)

Not Churn :  37807 
Churn :  62605


In [16]:
# Independent variables: everything except the merchant id, the raw timestamp and the
# target, which leaves `amount_usd_in_cents` and `quarter`.
# NOTE(review): `churn` is computed directly from `quarter`, so keeping `quarter` here
# leaks the target into the features.
X = merchant_level.drop(columns=['merchant', 'time', 'churn'])

In [17]:
# Dependent variable: the binary churn label.
Y = merchant_level.loc[:, 'churn']

In [18]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [19]:
# Importing all the required models
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Logistic Regression
- let's train a logistic regression model on the data and see what we get

In [20]:
# Fit an L2-regularised logistic regression (lbfgs solver) and report its accuracy
# on the test split.
# NOTE(review): the features are not scaled (amount is in cents, quarter is 1-4);
# this may be hurting the optimiser -- TODO confirm.
logistic = LogisticRegression(penalty='l2', solver='lbfgs').fit(X_train, y_train)
logistic.score(X_test, y_test)

0.6218629664055239

In [23]:
# Predicted labels from the logistic model for the test split (used by the classification report below).
predictions = logistic.predict(X_test)

In [24]:
# Spot-check one hypothetical payment: amount = 300000 cents, quarter = 1.
# Passing a DataFrame that reuses the training column names makes the feature order
# explicit and avoids scikit-learn's "X does not have valid feature names" warning,
# which is raised when a model fitted on a DataFrame is given a bare list.
logistic.predict(pd.DataFrame([[300000, 1]], columns=X_train.columns))



array([0], dtype=int64)

In [25]:
# Per-class precision / recall / F1 for the logistic regression predictions
from sklearn.metrics import classification_report
logistic_report = classification_report(y_test, predictions)
print(logistic_report)

              precision    recall  f1-score   support

           0       0.65      0.01      0.02     11457
           1       0.62      1.00      0.77     18667

    accuracy                           0.62     30124
   macro avg       0.64      0.50      0.40     30124
weighted avg       0.63      0.62      0.48     30124



# Random Forest
- Now we are going to train the random forest model

In [26]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 120 trees. A fixed `random_state` makes the bootstrap samples and
# feature sub-sampling repeatable, so a Restart & Run All reproduces the same model.
# The variable keeps the name `random` because later cells reference it.
random = RandomForestClassifier(n_estimators=120, random_state=0)
random.fit(X_train, y_train)

RandomForestClassifier(n_estimators=120)

In [27]:
# Accuracy of the random forest on the held-out test split.
# NOTE(review): a perfect score is expected here because `quarter` is a feature and
# `churn` is computed from `quarter`; it does not measure real predictive power.
random.score(X_test, y_test)

1.0

In [29]:
# Random-forest predictions for the test split (used by the classification report below).
rand = random.predict(X_test)
rand

array([0, 0, 1, ..., 1, 0, 1], dtype=int64)

In [30]:
# Per-class precision / recall / F1 for the random forest predictions
from sklearn.metrics import classification_report
forest_report = classification_report(y_test, rand)
print(forest_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11457
           1       1.00      1.00      1.00     18667

    accuracy                           1.00     30124
   macro avg       1.00      1.00      1.00     30124
weighted avg       1.00      1.00      1.00     30124



# Decision Tree


In [31]:
from sklearn.tree import DecisionTreeClassifier
# Gini-impurity decision tree; a node is only split when it holds at least 20 samples,
# and the seed is fixed so the fit is repeatable.
decision = DecisionTreeClassifier(
    random_state=42,
    min_samples_split=20,
    criterion='gini',
)
decision.fit(X_train,y_train)

DecisionTreeClassifier(min_samples_split=20, random_state=42)

In [33]:
# Accuracy of the decision tree on the held-out test split.
# NOTE(review): a perfect score is expected here because `quarter` is a feature and
# `churn` is computed from `quarter`; it does not measure real predictive power.
decision.score(X_test,y_test)


1.0

In [34]:
# Decision-tree predictions for the test split (used by the classification report below).
# NOTE(review): this rebinds `tree`, which shadows the `from sklearn import tree`
# module imported earlier in the notebook; the module is no longer reachable by that name.
tree = decision.predict(X_test)
tree

array([0, 0, 1, ..., 1, 0, 1], dtype=int64)

In [48]:
# Export the test features together with the decision-tree predictions.
# The 'Predicted churn' column is attached here explicitly (without mutating X_test),
# so the exported file contains the predictions even on a fresh Restart & Run All.
X_test.assign(**{'Predicted churn': tree}).to_csv('Prediction of churn.csv')

In [49]:
# These are the predictions the decision tree made, shown next to the test features.
# The 'Predicted churn' column is built here from `tree` rather than relying on a
# column added to X_test by a cell that is not part of the notebook.
X_test.assign(**{'Predicted churn': tree}).head()

Unnamed: 0,amount_usd_in_cents,quarter,Predicted churn
29729,334,1,0
98016,13798,2,0
25840,24154,3,1
81472,743,3,1
53379,18954,3,1


In [35]:
# Per-class precision / recall / F1 for the decision tree predictions
from sklearn.metrics import classification_report
tree_report = classification_report(y_test, tree)
print(tree_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11457
           1       1.00      1.00      1.00     18667

    accuracy                           1.00     30124
   macro avg       1.00      1.00      1.00     30124
weighted avg       1.00      1.00      1.00     30124



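# Single-sample predictions
- we got 100% accuracy with random forest and decision tree; note that `churn` is computed directly from `quarter`, and `quarter` is one of the input features, so the trees only have to learn the rule "quarter > 2"
- now let's do separate predictions
- we will tell the model the amount a merchant paid and the quarter in which it was paid
- the model will predict whether they will churn or not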

In [37]:
# The quarter is 2, so the model is expected to predict no churn (0).
# A DataFrame with the training column names keeps the (amount, quarter) order explicit
# and avoids scikit-learn's feature-name warning for bare lists.
decision.predict(pd.DataFrame([[15000, 2]], columns=X_train.columns))



array([0], dtype=int64)

In [39]:
# The quarter is 3, so the model is expected to predict churn (1).
# A DataFrame with the training column names keeps the (amount, quarter) order explicit
# and avoids scikit-learn's feature-name warning for bare lists.
decision.predict(pd.DataFrame([[10000, 3]], columns=X_train.columns))



array([1], dtype=int64)