In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler # Normalization - Standardization
from sklearn import preprocessing
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline

In [2]:
# Define the path of the data set
df = pd.read_csv('../processed_data/training_data.csv')
test1 = pd.read_csv('../processed_data/testing_data.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15544 entries, 0 to 15543
Data columns (total 56 columns):
Unnamed: 0                                  15544 non-null int64
Australia                                   15544 non-null int64
Central_Rich_Europe                         15544 non-null int64
East_Europe_Balkans                         15544 non-null int64
Mediteranian_Europe                         15544 non-null int64
North_Europe_Scand                          15544 non-null int64
user_id                                     15544 non-null object
birth_year                                  15544 non-null int64
country                                     15544 non-null object
city                                        15544 non-null object
created_date                                15544 non-null object
user_settings_crypto_unlocked               15544 non-null int64
plan                                        15544 non-null int64
attributes_notifications_marketing_push   

In [3]:
features = ['user_settings_crypto_unlocked', 'num_contacts', 'brand', 'age', 'total_amount', 'number_transactions','avg_amount',
            'transaction_period', 'user_active_time', 'user_trans_time', 'user_trans_periodicity','ATM', 'CARD_PAYMENT', 'CARD_REFUND', 'EXCHANGE', 'REFUND', 'TAX', 'TOPUP', 
            'TRANSFER',
           'BLACK_FRIDAY', 'BLUE_TUESDAY', 'ENGAGEMENT_SPLIT_BILL_RESTAURANT', 'INVEST_IN_GOLD', 'JOINING_ANNIVERSARY',
          'LOST_CARD_ORDER', 'MADE_MONEY_REQUEST_NOT_SPLIT_BILL', 'METAL_RESERVE_PLAN', 'NO_INITIAL_CARD_ORDER',
          'NO_INITIAL_CARD_USE', 'ONBOARDING_TIPS_ACTIVATED_USERS', 'PROMO', 'PROMO_CARD_ORDER', 'REENGAGEMENT_ACTIVE_FUNDS',
          'WELCOME_BACK',
           'Australia', 'Central_Rich_Europe', 'East_Europe_Balkans', 'Mediteranian_Europe', 'North_Europe_Scand',
           'CHF', 'EUR', 'GBP', 'OTHER']
            
types = ['ATM', 'CARD_PAYMENT', 'CARD_REFUND', 'EXCHANGE', 'REFUND', 'TAX', 'TOPUP', 'TRANSFER']
status = ['BLACK_FRIDAY', 'BLUE_TUESDAY', 'ENGAGEMENT_SPLIT_BILL_RESTAURANT', 'INVEST_IN_GOLD', 'JOINING_ANNIVERSARY',
          'LOST_CARD_ORDER', 'MADE_MONEY_REQUEST_NOT_SPLIT_BILL', 'METAL_RESERVE_PLAN', 'NO_INITIAL_CARD_ORDER',
          'NO_INITIAL_CARD_USE', 'ONBOARDING_TIPS_ACTIVATED_USERS', 'PROMO', 'PROMO_CARD_ORDER', 'REENGAGEMENT_ACTIVE_FUNDS',
          'WELCOME_BACK'] 

countries = ['Australia', 'Central_Rich_Europe', 'East_Europe_Balkans', 'Mediteranian_Europe', 'North_Europe_Scand']

currency = ['CHF', 'EUR', 'GBP', 'OTHER']
            
class_y = ['plan']

In [4]:
test = test1[features].copy()
data = df[features].copy() # features
y = df[class_y] # labels

# Data splitting & Upsampling

In [5]:
# Split the data
x = data[features].copy() # features
y = df[class_y].copy() # labels

# x_train1 = x.copy()
# y_train1 = y.copy()
# x_test = test.copy()

x_train1, x_test, y_train1, y_test = train_test_split(x, y, test_size=0.2)
# print(x_test, y_test)


In [6]:
# upsampling
df1 = x_train1.copy()
df1['plan'] = y_train1.copy()
df1_minor = df1[df1.plan==1]
df1_major = df1[df1.plan==0]

smpl = len(df1_major) - len(df1_minor)
print(len(df1_major))
print(len(df1_minor))
print(smpl)

# X = data.drop('y', axis=1).values
# y = data['y'].values

# bool val to change between separated and non-separated upsampling
case = True

if case:
    smote = SMOTE()
    x_train, y_train = smote.fit_resample(x_train1, y_train1)

#     x_train = pd.concat( [x_train, y_train], axis=1)
#     x_train1 = pd.concat( [x_train1, y_train1], axis=1)
else:
    smoteenn = SMOTEENN()
    # X_us_nn, y_us_nn = enn.fit_resample(x_train1, y_train1)
    x_train, y_train = smoteenn.fit_resample(x_train1, y_train1)

#     x_train = pd.concat( [x_train, y_train], axis=1)
#     x_train1 = pd.concat( [x_train1, y_train1], axis=1)
    
# print('Dataset size before oversampling:', len(x_train1[x_train1.plan==1]), len(x_train1[x_train1.plan==0]))
# print('Dataset size after oversampling: ', len(x_train[x_train.plan==1]), len(x_train[x_train.plan==0]))
# x_train.head(5)

11503
932
10571


# Normalization - Standardization

In [7]:
# standardizing-normalizing the training-data in order to fit the models
# thats the reason we use x_train
S_scaler = StandardScaler() # MinMaxScaler()
N_scaler = MinMaxScaler()
N_features = features
# S_scaler.fit(data[['num_contacts']])
# data[['num_contacts']] = S_scaler.transform(data[['num_contacts']])

S_scaler.fit_transform(x_train)
S_scaler.transform(x_test)
# data[N_features] = N_scaler.transform(data[N_features])
# n_values = preprocessing.normalize(df[N_features])
# data[N_features] = n_values
# print(n_values)
# data.head(5)


array([[-0.52790095, -0.49051709,  1.11605699, ..., -0.4903752 ,
        -0.37522331, -0.35361323],
       [-0.52790095, -0.49051709,  1.11605699, ..., -0.47960344,
        -0.33580603, -0.35361323],
       [-0.52790095, -0.49051709, -0.89601159, ..., -0.47960344,
        -0.37522331, -0.35361323],
       ...,
       [-0.52790095, -0.09939668, -0.89601159, ..., -0.50114696,
        -0.22740852, -0.35361323],
       [-0.52790095, -0.05338251,  1.11605699, ..., -0.50114696,
         0.76787771,  1.3778752 ],
       [-0.52790095, -0.42149584, -0.89601159, ..., -0.06489057,
        -0.18306409, -0.35361323]])

# Logistic Regression

In [8]:
# Logistic Regression
log_reg = LogisticRegression(penalty = 'none', C=0.001, solver = 'newton-cg')

# cross val
scores = (cross_val_score(log_reg, x_train, y_train, cv=5))
print(scores)

# 2.
log_reg.fit(x_train, y_train)

# 3.
lr_pred = log_reg.predict(x_test)
print(accuracy_score(y_test, lr_pred))
print(classification_report(y_test, lr_pred))

  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)
  "Setting penalty='none' will ignore the C and l1_ratio "
  y = column_or_1d(y, warn=True)


[0.92698827 0.96587698 0.96326885 0.96196479 0.9654423 ]
0.9585075587005468
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      2891
           1       0.69      0.74      0.72       218

    accuracy                           0.96      3109
   macro avg       0.83      0.86      0.85      3109
weighted avg       0.96      0.96      0.96      3109





# Decision Tree

In [None]:
# Grid search for Decision Trees

grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 100, 50, 20, 10, 5]
}

dt = DecisionTreeClassifier()

gs = GridSearchCV(dt, grid, cv=2, scoring='accuracy', verbose=1) # scoring accuracy

gs.fit(x_train, y_train)

print(gs.best_params_) # these are the best parameters for my trainning set

dt_best = gs.best_estimator_ # getting the best estimator

dt_best_preds = dt_best.predict(x_test) # prediction from the best estimator

print(accuracy_score(y_test, dt_best_preds)) # calculating accuracy score
print(classification_report(y_test, dt_best_preds))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 2 folds for each of 24 candidates, totalling 48 fits


# K-NN

In [None]:
grid = {
    'weights': ['uniform', 'distance'],
    'n_neighbors': [3, 5],
    'algorithm': ['auto', 'ball_tree', 'kd_tree']
}

knn = KNeighborsClassifier(weights= 'distance', algorithm= 'auto', n_neighbors= 3)

# gs = GridSearchCV(knn, grid, cv=5, scoring='f1', verbose=1) # scoring accuracy

knn.fit(x_train, y_train)

# print(gs.best_params_) # these are the best parameters for my trainning set

#dt_best = gs.best_estimator_ # getting the best estimator

dt_best_preds = knn.predict(x_test) # dt_best.predict(x_test) # prediction from the best estimator

print(accuracy_score(y_test, dt_best_preds)) # calculating accuracy score
print(classification_report(y_test, dt_best_preds))

# Support Vector Machine

In [None]:
# Grid search for SVM

grid = {
    'kernel': ['linear', 'rbf', 'sigmoid'], # 
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None]
}


svm = SVC(gamma= 'scale', kernel= 'rbf', class_weight= None) # 

gs = GridSearchCV(svm, grid, cv=5, scoring='f1', verbose=1) # scoring accuracy

svm.fit(x_train, y_train)
#print(svm.coef_)
#print(features)

# print(gs.best_params_) # these are the best parameters for my trainning set

# dt_best = gs.best_estimator_ # getting the best estimator

dt_best_preds = svm.predict(x_test) # prediction from the best estimator

print(accuracy_score(y_test, dt_best_preds)) # calculating accuracy score
print(classification_report(y_test, dt_best_preds))

# Random Forests

In [None]:
def rforest():

    var = VarianceThreshold()

    rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    rf.fit(x_train, y_train)
    rf.feature_importances_

    for c, imp in sorted(zip(features, rf.feature_importances_), key=lambda pair: pair[1], reverse=True):
      print('{:20}: {}'.format(c, imp))

    rf = RandomForestClassifier(n_estimators = 100, criterion = 'entropy')

#     pipe1 = Pipeline([('scaler', S_scaler),
#                       ('selector', var),
#                       ('model', rf)])

#     grid1 = {'selector__threshold': [0, 0.2, 0.4],
#              'model__n_estimators': [20, 50, 100]}
    
    
    # threshhold = 0.2, estimator = 100
    # clf = GridSearchCV(pipe1, grid1)
    rf.fit(x_train, y_train)

    # print(rf.best_params_, '\n')
    
    print(classification_report(y_test, rf.predict(x_test)))

In [None]:
# calling random forest
rforest()

# MLP - Neural Network

In [None]:
def NN():
    grid={'activation':['logistic'], # ,'tanh', 'relu'],
          'learning_rate_init':[0.0001], # 0.001, 0.01, 0.1],
          'momentum':[0.5] # ,0.2,0.3,0.7,0.9]
         }

    mlp=MLPClassifier() # activation = 'logistic', momentum = 0.5, learning_rate_init = 0.0001
    gs=GridSearchCV(mlp,grid,cv=3,scoring='accuracy')

    gs_result = gs.fit(x_train,y_train)
    print("Best: %f using %s" % (gs_result.best_score_, gs_result.best_params_))
    
    # activation = relu, learning rate = 0.01, momentum = 0.9
    # Best: 0.937278 using {'activation': 'logistic', 'momentum': 0.5, 'learning_rate_init': 0.0001}
    mlp_pred = gs.predict(x_test)
    
    print(accuracy_score(y_test,mlp_pred))
    print(classification_report(y_test,mlp_pred))
    return mlp_pred

In [None]:
# calling neural networks
mlp_pred = NN()

In [None]:
results = pd.DataFrame({'user_id': test1['user_id'], 'plan': mlp_pred})
results.plan.value_counts()
