In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import math

## Preprocessing function

In [21]:
def preprocess(row, min_max_scaler=None, dict_encoder=None):
    """Convert one raw record into the numeric feature vector fed to the model.

    The record goes through these steps:

    1. The unused columns (V11, V14, V20, V5, V17, V18, V15, CLASS) are dropped.
    2. V9, V10 and V24 are converted to ``float``.
    3. The date columns V6, V7, V23 (``%d/%m/%Y``) and V8
       (``%d/%m/%Y %H:%M:%S``) are parsed and replaced by derived features:
       ``COS(<col>_month|day)`` and ``SIN(<col>_month|day)`` computed on the
       scaler output, plus a raw ``<col>_year``.
    4. Every column listed in the encoder dictionary is mapped to its code;
       values missing from the mapping become ``-1``.

    Parameters
    ----------
    row : pandas.Series
        One record indexed by column name. It is NOT modified; the function
        works on a copy, so the same record can be preprocessed repeatedly.
    min_max_scaler : object, optional
        Anything exposing ``transform(array of shape (1, 8))``. The eight
        columns are, in order: V6_month, V6_day, V7_month, V7_day, V23_month,
        V23_day, V8_month, V8_day. When omitted, the scaler is unpickled from
        ``scaler.sav`` (presumably a fitted sklearn ``MinMaxScaler`` -- confirm
        against the training notebook).
    dict_encoder : dict, optional
        ``{column: {raw_value: code}}``. When omitted, it is unpickled from
        ``dict_all.obj``.

    Returns
    -------
    pandas.Series
        Remaining original columns, then the COS features, the SIN features
        and the year features. The dtype is numeric when every entry ended up
        numeric.

    Raises
    ------
    KeyError
        If a dropped, converted, date or encoder column is absent from ``row``.
    ValueError
        If a numeric or date field cannot be parsed.
    """
    root = "/home/sihartist/Desktop/"

    encoder_path = "fraud-detection/preprocessing/dict_all.obj"
    scalerfile = 'fraud-detection/preprocessing/scaler.sav'

    # SECURITY: pickle.load can execute arbitrary code. Only load artifacts
    # produced by this project. Passing the objects in as arguments also
    # avoids re-reading both files for every record.
    if min_max_scaler is None:
        with open(root + scalerfile, 'rb') as scaler_file:
            min_max_scaler = pickle.load(scaler_file)

    if dict_encoder is None:
        with open(root + encoder_path, 'rb') as encoder_file:
            dict_encoder = pickle.load(encoder_file)

    # Non-inplace drop returns a new Series, so the caller's record survives.
    # Casting to object lets the same Series hold floats and timestamps-free
    # mixed values regardless of the incoming (string) dtype.
    row = row.drop(['V11','V14','V20','V5','V17','V18','V15','CLASS']).astype(object)

    # updating types
    for numeric_col in ('V10', 'V9', 'V24'):
        row[numeric_col] = float(row[numeric_col])

    # Date columns in the order the scaler expects, with their text formats.
    date_formats = {
        'V6': "%d/%m/%Y",
        'V7': "%d/%m/%Y",
        'V23': "%d/%m/%Y",
        'V8': "%d/%m/%Y %H:%M:%S",
    }

    years = {}
    date_rows = {}
    for date_col, date_format in date_formats.items():
        stamp = pd.to_datetime(row[date_col], format=date_format)
        date_rows[date_col + '_month'] = stamp.month
        date_rows[date_col + '_day'] = stamp.day
        years[date_col + '_year'] = stamp.year

    # The raw date strings are replaced by the derived features below.
    row = row.drop(list(date_formats))

    # Normalize date cols: a single sample with one column per date part.
    date_matrix = np.array(list(date_rows.values())).reshape(1, -1)
    date_row_scaled = np.asarray(min_max_scaler.transform(date_matrix)).reshape(-1)

    # The trig functions are applied to the scaler output as-is; whatever
    # range the scaler was fitted to is what defines the angle.
    date_row_cos = pd.Series(
        {'COS(' + name + ')': math.cos(value)
         for name, value in zip(date_rows, date_row_scaled)}
    )
    date_row_sin = pd.Series(
        {'SIN(' + name + ')': math.sin(value)
         for name, value in zip(date_rows, date_row_scaled)}
    )
    year_row = pd.Series(years)

    new_row = pd.concat([row, date_row_cos, date_row_sin, year_row], axis=0)

    # Encode only the entry for `col`. A Series-wide replace() would also
    # rewrite values of OTHER columns that happen to equal one of this
    # mapping's keys (or an already-encoded code), corrupting them.
    for col, mapping in dict_encoder.items():
        raw_value = new_row[col]
        new_row[col] = mapping[raw_value] if raw_value in mapping else -1

    # Give a stable numeric dtype once every entry is a number, instead of a
    # dtype that depends on which branch the last encoder column took.
    return new_row.infer_objects()

In [3]:
# Base directory of the project checkout and the location of the dataset workbook.
# NOTE(review): absolute, machine-specific path -- the notebook only runs on a
# machine with this layout; consider a configurable relative data directory.
root = '/home/sihartist/Desktop/'
path = root + 'fraud-detection/dataset/final.xlsx'

In [17]:
# Read the workbook with every column as text (dtype=str), so values such as
# '010112' keep their leading zeros; preprocess() converts the fields it needs.
df = pd.read_excel(path, dtype = str)

In [58]:
# Take the second record as an independent copy: assigning into the object
# returned by df.iloc[1] directly is a chained assignment (it may warn, and
# whether df itself changes is not guaranteed).
row = df.iloc[1].copy()

# Overwrite the three numeric fields with a fixed value for this sample.
row['V10'] = 100.0
row['V9'] = 100.0
row['V24'] = 100.0

# Persist the sample record; the context manager closes the file even if
# pickling fails.
with open( root + "fraud-detection/dataset/row.obj","wb") as file:
    pickle.dump(row, file)

In [54]:
# Reload the sample record saved earlier. The file is opened in a context
# manager so the handle is closed instead of being left to the garbage collector.
# SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
with open(root + "fraud-detection/dataset/row.obj", "rb") as row_file:
    row = pickle.load(row_file)

In [55]:
# Check what kind of object was unpickled (preprocess() indexes it by column label).
type(row)

pandas.core.series.Series

In [56]:
# Display the raw record before preprocessing.
row

V1                                           31
V2                                          281
V3                                           22
V4                                       010112
V5                                          500
V6                                   01/06/2009
V7                                   01/06/2011
V8                          26/06/2009 11:16:47
V9                                          NaN
V10                                         NaN
V11                                         NaN
V12                                      010112
V13                                        6011
V14                                         NaN
V15                                         NaN
V16                                21140121124C
V17                                         NaN
V18                                           6
V19                                      004737
V20                                           0
V21                                    0

In [82]:
# Run the preprocessing pipeline on the sample record and display the
# resulting feature vector.
new_row = preprocess(row)
new_row

V1                4.000000e+00
V2                2.000000e+00
V3                2.300000e+01
V4                1.000000e+00
V9                1.000000e+02
V10               1.000000e+02
V12              -1.000000e+00
V13               2.976000e+03
V16               4.400000e+01
V19               1.351000e+03
V21               2.000000e+00
V22              -1.000000e+00
V24               1.000000e+02
V25               1.000000e+00
V26               3.840000e+02
V27               6.000000e+00
COS(V6_month)     9.295049e-01
COS(V6_day)      -1.000000e+00
COS(V7_month)     9.295049e-01
COS(V7_day)      -1.000000e+00
COS(V23_month)   -9.795299e-01
COS(V23_day)     -3.473053e-01
COS(V8_month)     9.295049e-01
COS(V8_day)      -3.473053e-01
SIN(V6_month)    -3.688097e-01
SIN(V6_day)       7.657137e-16
SIN(V7_month)    -3.688097e-01
SIN(V7_day)       7.657137e-16
SIN(V23_month)    2.012985e-01
SIN(V23_day)      9.377521e-01
SIN(V8_month)    -3.688097e-01
SIN(V8_day)       9.377521e-01
V6_year 

In [83]:
# Persist the preprocessed record; the context manager guarantees the file is
# flushed and closed even if pickling raises.
with open( root + "fraud-detection/dataset/new_row.obj","wb") as new_file:
    pickle.dump(new_row, new_file)

In [70]:
# NOTE(review): this import sits mid-notebook; moving it to the first import
# cell would make the notebook's dependencies visible up front.
from pypmml import Model

# Load the exported PMML model from the project directory.
filename = "fraud-detection/models/svm.pmml"
SVM = Model.load(root + filename)

In [72]:
# Score the preprocessed record with the loaded PMML model.
res = SVM.predict(new_row)

In [77]:
# Display the prediction returned by the model for the sample record.
res

predicted_isFraud    1.0
probability          1.0
probability_0        0.0
probability_1        1.0
Name: 0, dtype: float64