In [35]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler # Normalization - Standardization
from sklearn import preprocessing
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline

In [36]:
# Define the path of the data set
df = pd.read_csv('../processed_data/training_data.csv')
test1 = pd.read_csv('../processed_data/testing_data.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15544 entries, 0 to 15543
Data columns (total 57 columns):
Unnamed: 0                                  15544 non-null int64
Australia                                   15544 non-null int64
Central_Rich_Europe                         15544 non-null int64
East_Europe_Balkans                         15544 non-null int64
Mediteranian_Europe                         15544 non-null int64
North_Europe_Scand                          15544 non-null int64
user_id                                     15544 non-null object
birth_year                                  15544 non-null int64
country                                     15544 non-null object
city                                        15544 non-null object
created_date                                15544 non-null object
user_settings_crypto_unlocked               15544 non-null int64
plan                                        15544 non-null int64
attributes_notifications_marketing_push   

In [37]:
features = ['user_settings_crypto_unlocked', 'num_contacts', 'brand', 'age', 'total_amount', 'number_transactions','avg_amount',
            'transaction_period', 'user_active_time', 'user_trans_time', 'user_trans_periodicity', 'recency',
            'ATM', 'CARD_PAYMENT', 'CARD_REFUND', 'EXCHANGE', 'REFUND', 'TAX', 'TOPUP', 'TRANSFER',
           'BLACK_FRIDAY', 'BLUE_TUESDAY', 'ENGAGEMENT_SPLIT_BILL_RESTAURANT', 'INVEST_IN_GOLD', 'JOINING_ANNIVERSARY',
          'LOST_CARD_ORDER', 'MADE_MONEY_REQUEST_NOT_SPLIT_BILL', 'METAL_RESERVE_PLAN', 'NO_INITIAL_CARD_ORDER',
          'NO_INITIAL_CARD_USE', 'ONBOARDING_TIPS_ACTIVATED_USERS', 'PROMO', 'PROMO_CARD_ORDER', 'REENGAGEMENT_ACTIVE_FUNDS',
          'WELCOME_BACK',
           'Australia', 'Central_Rich_Europe', 'East_Europe_Balkans', 'Mediteranian_Europe', 'North_Europe_Scand',
           'CHF', 'EUR', 'GBP', 'OTHER']
            
types = ['ATM', 'CARD_PAYMENT', 'CARD_REFUND', 'EXCHANGE', 'REFUND', 'TAX', 'TOPUP', 'TRANSFER']
status = ['BLACK_FRIDAY', 'BLUE_TUESDAY', 'ENGAGEMENT_SPLIT_BILL_RESTAURANT', 'INVEST_IN_GOLD', 'JOINING_ANNIVERSARY',
          'LOST_CARD_ORDER', 'MADE_MONEY_REQUEST_NOT_SPLIT_BILL', 'METAL_RESERVE_PLAN', 'NO_INITIAL_CARD_ORDER',
          'NO_INITIAL_CARD_USE', 'ONBOARDING_TIPS_ACTIVATED_USERS', 'PROMO', 'PROMO_CARD_ORDER', 'REENGAGEMENT_ACTIVE_FUNDS',
          'WELCOME_BACK'] 

countries = ['Australia', 'Central_Rich_Europe', 'East_Europe_Balkans', 'Mediteranian_Europe', 'North_Europe_Scand']

currency = ['CHF', 'EUR', 'GBP', 'OTHER']
            
class_y = ['plan']

In [38]:
test = test1[features].copy()
data = df[features].copy() # features
y = df[class_y] # labels

# Data splitting & Upsampling

In [39]:
# Split the data
x = data[features].copy() # features
y = df[class_y].copy() # labels

# x_train1 = x.copy()
# y_train1 = y.copy()
# x_test = test.copy()

x_train1, x_test, y_train1, y_test = train_test_split(x, y, test_size=0.2)
# print(x_test, y_test)


In [40]:
# upsampling
df1 = x_train1.copy()
df1['plan'] = y_train1.copy()
df1_minor = df1[df1.plan==1]
df1_major = df1[df1.plan==0]

smpl = len(df1_major) - len(df1_minor)
print(len(df1_major))
print(len(df1_minor))
print(smpl)

# X = data.drop('y', axis=1).values
# y = data['y'].values

# bool val to change between separated and non-separated upsampling
case = True

if case:
    smote = SMOTE()
    x_train, y_train = smote.fit_resample(x_train1, y_train1)

#     x_train = pd.concat( [x_train, y_train], axis=1)
#     x_train1 = pd.concat( [x_train1, y_train1], axis=1)
else:
    smoteenn = SMOTEENN()
    # X_us_nn, y_us_nn = enn.fit_resample(x_train1, y_train1)
    x_train, y_train = smoteenn.fit_resample(x_train1, y_train1)

#     x_train = pd.concat( [x_train, y_train], axis=1)
#     x_train1 = pd.concat( [x_train1, y_train1], axis=1)
    
# print('Dataset size before oversampling:', len(x_train1[x_train1.plan==1]), len(x_train1[x_train1.plan==0]))
# print('Dataset size after oversampling: ', len(x_train[x_train.plan==1]), len(x_train[x_train.plan==0]))
# x_train.head(5)

11518
917
10601


# Normalization - Standardization

In [41]:
# standardizing-normalizing the training-data in order to fit the models
# thats the reason we use x_train
S_scaler = StandardScaler() # MinMaxScaler()
N_scaler = MinMaxScaler()
N_features = features
# S_scaler.fit(data[['num_contacts']])
# data[['num_contacts']] = S_scaler.transform(data[['num_contacts']])

S_scaler.fit_transform(x_train)
S_scaler.transform(x_test)
# data[N_features] = N_scaler.transform(data[N_features])
# n_values = preprocessing.normalize(df[N_features])
# data[N_features] = n_values
# print(n_values)
# data.head(5)


array([[-0.54108906, -0.39938813,  1.16422473, ..., -0.45556553,
        -0.36796872, -0.34504793],
       [-0.54108906, -0.39938813,  1.16422473, ..., -0.50574063,
        -0.36796872, -0.26902615],
       [-0.54108906, -0.37613425,  1.16422473, ..., -0.50574063,
        -0.36796872, -0.351959  ],
       ...,
       [ 1.84812459, -0.14359544, -0.8589407 , ..., -0.1433649 ,
        -0.33782943, -0.31049258],
       [-0.54108906,  0.2982283 ,  1.16422473, ..., -0.32734027,
        -0.36796872, -0.33122579],
       [-0.54108906, -0.39938813, -0.8589407 , ..., -0.50574063,
        -0.36796872, -0.17918222]])

# Logistic Regression

In [42]:
# Logistic Regression
log_reg = LogisticRegression(penalty = 'none', C=0.001, solver = 'newton-cg') 

# cross val
scores = (cross_val_score(log_reg, x_train, y_train, cv=5))
print(scores)

# 2.
log_reg.fit(x_train, y_train)

# 3.
lr_pred = log_reg.predict(x_test)
print(accuracy_score(y_test, lr_pred))
print(classification_report(y_test, lr_pred))

  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)


[0.921875   0.96375081 0.96635555 0.96353375 0.96071196]
0.9636539080090061
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2876
           1       0.77      0.74      0.75       233

    accuracy                           0.96      3109
   macro avg       0.87      0.86      0.87      3109
weighted avg       0.96      0.96      0.96      3109





# Decision Tree

In [43]:
# Grid search for Decision Trees

grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 100, 50, 20, 10, 5]
}

dt = DecisionTreeClassifier()

gs = GridSearchCV(dt, grid, cv=2, scoring='accuracy', verbose=1) # scoring accuracy

gs.fit(x_train, y_train)

print(gs.best_params_) # these are the best parameters for my trainning set

dt_best = gs.best_estimator_ # getting the best estimator

dt_best_preds = dt_best.predict(x_test) # prediction from the best estimator

print(accuracy_score(y_test, dt_best_preds)) # calculating accuracy score
print(classification_report(y_test, dt_best_preds))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    5.0s finished


{'splitter': 'best', 'criterion': 'entropy', 'max_depth': 20}
0.9022193631392731
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      2876
           1       0.37      0.45      0.41       233

    accuracy                           0.90      3109
   macro avg       0.66      0.69      0.68      3109
weighted avg       0.91      0.90      0.91      3109



# K-NN

In [44]:
grid = {
    'weights': ['uniform', 'distance'],
    'n_neighbors': [3, 5],
    'algorithm': ['auto', 'ball_tree', 'kd_tree']
}

knn = KNeighborsClassifier(weights= 'distance', algorithm= 'auto', n_neighbors= 3)

# gs = GridSearchCV(knn, grid, cv=5, scoring='f1', verbose=1) # scoring accuracy

knn.fit(x_train, y_train)

# print(gs.best_params_) # these are the best parameters for my trainning set

#dt_best = gs.best_estimator_ # getting the best estimator

dt_best_preds = knn.predict(x_test) # dt_best.predict(x_test) # prediction from the best estimator

print(accuracy_score(y_test, dt_best_preds)) # calculating accuracy score
print(classification_report(y_test, dt_best_preds))

  # This is added back by InteractiveShellApp.init_path()


0.7992923769700868
              precision    recall  f1-score   support

           0       0.95      0.83      0.88      2876
           1       0.18      0.45      0.25       233

    accuracy                           0.80      3109
   macro avg       0.56      0.64      0.57      3109
weighted avg       0.89      0.80      0.84      3109



# Support Vector Machine

In [45]:
# Grid search for SVM

grid = {
    'kernel': ['linear', 'rbf', 'sigmoid'], # 
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None]
}


svm = SVC(gamma= 'scale', kernel= 'rbf', class_weight= None) # 

gs = GridSearchCV(svm, grid, cv=5, scoring='f1', verbose=1) # scoring accuracy

svm.fit(x_train, y_train)
#print(svm.coef_)
#print(features)

# print(gs.best_params_) # these are the best parameters for my trainning set

# dt_best = gs.best_estimator_ # getting the best estimator

dt_best_preds = svm.predict(x_test) # prediction from the best estimator

print(accuracy_score(y_test, dt_best_preds)) # calculating accuracy score
print(classification_report(y_test, dt_best_preds))

  y = column_or_1d(y, warn=True)


0.8092634287552267
              precision    recall  f1-score   support

           0       0.97      0.82      0.89      2876
           1       0.23      0.65      0.34       233

    accuracy                           0.81      3109
   macro avg       0.60      0.74      0.61      3109
weighted avg       0.91      0.81      0.85      3109



# Random Forests

In [46]:
def rforest():

    var = VarianceThreshold()

    rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    rf.fit(x_train, y_train)
    rf.feature_importances_

    for c, imp in sorted(zip(features, rf.feature_importances_), key=lambda pair: pair[1], reverse=True):
      print('{:20}: {}'.format(c, imp))

    rf = RandomForestClassifier(n_estimators = 100, criterion = 'entropy')

#     pipe1 = Pipeline([('scaler', S_scaler),
#                       ('selector', var),
#                       ('model', rf)])

#     grid1 = {'selector__threshold': [0, 0.2, 0.4],
#              'model__n_estimators': [20, 50, 100]}
    
    
    # threshhold = 0.2, estimator = 100
    # clf = GridSearchCV(pipe1, grid1)
    rf.fit(x_train, y_train)

    # print(rf.best_params_, '\n')
    
    print(classification_report(y_test, rf.predict(x_test)))

In [47]:
# calling random forest
rforest()

  


avg_amount          : 0.07796607145189703
NO_INITIAL_CARD_USE : 0.06567497767983208
EXCHANGE            : 0.056873934392978914
total_amount        : 0.05604934522765119
number_transactions : 0.05602192562955115
INVEST_IN_GOLD      : 0.05153267218180202
recency             : 0.04805941275404258
ATM                 : 0.046334933594220966
REFUND              : 0.03999498864993613
ONBOARDING_TIPS_ACTIVATED_USERS: 0.0387371747175758
BLACK_FRIDAY        : 0.03328763709727322
NO_INITIAL_CARD_ORDER: 0.032854878223601634
CARD_REFUND         : 0.032591255493380177
CARD_PAYMENT        : 0.02831606344163533
transaction_period  : 0.024681380655984953
TRANSFER            : 0.02245807425020543
REENGAGEMENT_ACTIVE_FUNDS: 0.022365291271397792
GBP                 : 0.020868822246297516
OTHER               : 0.02068017132798836
PROMO               : 0.019687574841133086
user_trans_time     : 0.018243481740660573
East_Europe_Balkans : 0.016107339288859095
user_active_time    : 0.015979099065683493
TOPUP  



              precision    recall  f1-score   support

           0       0.95      0.99      0.97      2876
           1       0.68      0.33      0.44       233

    accuracy                           0.94      3109
   macro avg       0.81      0.66      0.71      3109
weighted avg       0.93      0.94      0.93      3109



# MLP - Neural Network

In [54]:
def NN():
    grid={'activation':['logistic'], # ,'tanh', 'relu'],
          'learning_rate_init':[0.0001], # 0.001, 0.01, 0.1],
          'momentum':[0.5], # ,0.2,0.3,0.7,0.9],
          'hidden_layer_sizes': (300,)
         }

    mlp=MLPClassifier() # activation = 'logistic', momentum = 0.5, learning_rate_init = 0.0001
    gs=GridSearchCV(mlp, grid, cv=2, scoring = 'accuracy')

    gs_result = gs.fit(x_train,y_train)
    print("Best: %f using %s" % (gs_result.best_score_, gs_result.best_params_))
    
    # activation = relu, learning rate = 0.01, momentum = 0.9
    # Best: 0.937278 using {'learning_rate_init': 0.0001, 'momentum': 0.5, 'hidden_layer_sizes': 300, 'activation': 'logistic'}
    mlp_pred = gs.predict(x_test)
    
    print(accuracy_score(y_test,mlp_pred))
    print(classification_report(y_test,mlp_pred))
    return mlp_pred

In [55]:
# calling neural networks
mlp_pred = NN()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best: 0.949036 using {'learning_rate_init': 0.0001, 'momentum': 0.5, 'hidden_layer_sizes': 300, 'activation': 'logistic'}
0.9575426182052107
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2876
           1       0.73      0.68      0.71       233

    accuracy                           0.96      3109
   macro avg       0.85      0.83      0.84      3109
weighted avg       0.96      0.96      0.96      3109



              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2876
           1       0.77      0.74      0.75       233

    accuracy                           0.96      3109
   macro avg       0.87      0.86      0.87      3109
weighted avg       0.96      0.96      0.96      3109
