In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)

In [2]:
import os, sys, inspect  # `inspect` is no longer needed by this cell; import kept for any other user

# Put the parent of the notebook's working directory on the import path
# (presumably where the project modules imported below live -- confirm).
#
# The working directory is used instead of
# inspect.getfile(inspect.currentframe()): inside a notebook cell that call
# returns the kernel's synthetic per-cell filename rather than the notebook
# path, so the directory derived from it is not reliably the notebook folder.
currentdir = os.path.abspath(os.getcwd())
parentdir = os.path.dirname(currentdir)
# Guard against piling up duplicate entries when the cell is re-run.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

In [3]:
from utils import UtilsKy
from analyzer import AnalyzerPrediction

In [4]:
# for autoreload modules
%load_ext autoreload
%autoreload 2

In [5]:
# pytorch mlp for binary classification
from numpy import vstack
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset 
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch import Tensor
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn import Module
from torch.optim import SGD
from torch.nn import BCELoss
from torch.nn.init import kaiming_uniform_
from torch.nn.init import xavier_uniform_
import joblib

In [6]:
def minMaxScaler_own(teach, test):
    """Min-max scale two frames using statistics taken from `teach` only.

    For every column of `teach`, the column minimum and maximum are computed
    on `teach` and applied to the same column of both frames:
    ``(value - min) / (max - min)``. Scaled `test` values are then clamped to
    the [0, 1] interval, so values outside the range seen in `teach` map to
    0 or 1. NaN values stay NaN.

    A column that is constant in `teach` (``max == min``) would otherwise be
    divided by zero; it is scaled to 0.0 in `teach`, and in `test` values
    above the constant become 1 while all other non-NaN values become 0.

    Parameters
    ----------
    teach : pandas.DataFrame
        Frame the scaling parameters are fitted on.
    test : pandas.DataFrame
        Frame scaled with the parameters of `teach`; must contain every
        column of `teach`.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Scaled copies of `teach` and `test`; the inputs are not modified.
    """
    teach = teach.copy()
    test = test.copy()
    for col in list(teach):
        x_min = teach[col].min(axis=0)
        x_max = teach[col].max(axis=0)
        x_range = x_max - x_min
        if x_range == 0:
            # Zero range: dividing would give NaN (0/0) or +/-inf. Use the
            # limit of the clamped result instead of propagating NaN.
            teach[col] = 0.0
            test[col] = np.sign(test[col] - x_min).clip(lower=0)
            continue
        teach[col] = (teach[col] - x_min) / x_range
        # Clamp so unseen extremes in `test` cannot leave the [0, 1] range.
        test[col] = ((test[col] - x_min) / x_range).clip(lower=0, upper=1)
    return teach, test

def get_scaler_params(teach):
    """Collect the per-column minimum and maximum of `teach`.

    Returns a dict with two entries per column, keyed ``"<column>_min"`` and
    ``"<column>_max"``, inserted in column order.
    """
    params = {}
    for name in teach:
        column = teach[name]
        params.update({
            name + "_min": column.min(axis=0),
            name + "_max": column.max(axis=0),
        })
    return params

In [7]:
db_teach = pd.read_csv( UtilsKy.DB_TEACH_KYW3, dtype=str, encoding='cp1251')

In [8]:
db_test = pd.read_csv(UtilsKy.DB_TEST_KYW3, dtype=str, encoding='cp1251')

In [9]:
white = pd.read_csv(UtilsKy.WHITE_KYW3 , dtype=str)

In [10]:
# Feature columns fed to the model, stored in alphabetical order so the
# column order does not depend on how the list happens to be typed.
COL_FACTORS = sorted([
    'amount',
    'bank_currency',
    'bin',
    'count_months_to_end_card',
    'day_of_week',
    'is_city_resolved',
    'hour',
    'is_gender_undefined',
    'latitude',
    'longitude',
    'phone_2_norm',
])

In [11]:
# Parse the selected feature columns as numbers for both data sets;
# values that cannot be parsed become NaN (errors="coerce").
train, test = (
    frame[COL_FACTORS].apply(pd.to_numeric, errors="coerce").copy()
    for frame in (db_teach, db_test)
)

In [12]:
test.head(30)

Unnamed: 0,amount,bank_currency,bin,count_months_to_end_card,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm
0,158.85,840,510932,19,2,0,1,1,44.1843,-88.5305,20
1,26.48,840,510929,47,2,0,1,0,40.3814,-82.4993,23
2,21.18,840,542418,0,2,0,1,1,42.9960,-85.6314,16
3,55.76,840,542432,35,2,0,1,0,42.0152,-87.9901,47
4,53.16,840,517805,46,2,0,0,1,38.0000,-97.0000,10
...,...,...,...,...,...,...,...,...,...,...,...
25,53.16,840,531260,47,2,0,1,0,26.2968,-81.7896,39
26,106.58,840,551215,32,2,0,1,0,42.4215,-82.8987,48
27,100.54,840,517805,20,2,0,1,1,33.6238,-112.0040,48
28,20.12,840,517805,29,2,0,0,1,38.0000,-97.0000,3


In [13]:
mask = train.isnull().any(axis=1)
train[mask].head()

Unnamed: 0,amount,bank_currency,bin,count_months_to_end_card,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm
218,3.8,76,535016,57,3,20,0,1,,,19
368,7.13,76,535016,57,3,22,0,1,,,19
393,11.88,76,535016,57,3,22,0,1,,,19
398,71.26,76,544731,43,3,22,0,1,,,19
407,3.56,76,535016,57,3,23,0,1,,,19


In [14]:
mask = test.isnull().any(axis=1)
test[mask].head()

Unnamed: 0,amount,bank_currency,bin,count_months_to_end_card,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm
247,76.92,124,543440,4,2,2,0,0,,,13
703,3.77,76,534543,93,2,6,0,1,,,19
1050,3.77,76,534543,93,2,12,0,1,,,19
1088,3.77,76,534543,93,2,13,0,1,,,19
1390,9.42,76,515590,48,2,17,0,1,,,19


In [15]:
test.columns[test.isnull().any(axis=0)]

Index(['latitude', 'longitude'], dtype='object')

In [16]:
# Fill values for missing data: the training-set mean for each coordinate,
# and -999 (under the 'default' key) for every other column.
lat_replace, long_replace = train.latitude.mean(), train.longitude.mean()

replaced_values = dict(latitude=lat_replace, longitude=long_replace, default=-999)

In [17]:
# Impute missing values column by column, using the column-specific fill
# value when one is configured and the 'default' entry otherwise.
default_fill = replaced_values.get('default')
for col in COL_FACTORS:
    # Use dict.get with a fallback rather than `get(col) or default`: the
    # `or` form treats a legitimate fill value of 0 / 0.0 as "missing" and
    # silently replaces it with the default.
    replaced_val = replaced_values.get(col, default_fill)
    print(replaced_val)
    train[col] = train[col].fillna(replaced_val)
    test[col] = test[col].fillna(replaced_val)

-999
-999
-999
-999
-999
-999
-999
-999
36.90237577890762
-92.53325861542274
-999


In [18]:
mask = train.isnull().any(axis=1)
train[mask].head()

Unnamed: 0,amount,bank_currency,bin,count_months_to_end_card,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm


In [19]:
mask = test.isnull().any(axis=1)
test[mask].head()

Unnamed: 0,amount,bank_currency,bin,count_months_to_end_card,day_of_week,hour,is_city_resolved,is_gender_undefined,latitude,longitude,phone_2_norm


In [20]:
# Choose the scaling strategy. With 'own', the min/max parameters are read
# from the unscaled training features first, then both frames are scaled
# with them. Any other value leaves the data as is and the params empty.
scaler = 'own'
scaler_params = get_scaler_params(train[COL_FACTORS]) if scaler == 'own' else {}
if scaler == 'own':
    train, test = minMaxScaler_own(train[COL_FACTORS], test[COL_FACTORS])

In [21]:
scaler_params

{'amount_min': 1.0,
 'amount_max': 1500.0,
 'bank_currency_min': 4,
 'bank_currency_max': 862,
 'bin_min': 510004,
 'bin_max': 559998,
 'count_months_to_end_card_min': 0,
 'count_months_to_end_card_max': 880,
 'day_of_week_min': 1,
 'day_of_week_max': 7,
 'hour_min': 0,
 'hour_max': 23,
 'is_city_resolved_min': 0,
 'is_city_resolved_max': 1,
 'is_gender_undefined_min': 0,
 'is_gender_undefined_max': 1,
 'latitude_min': -43.5333,
 'latitude_max': 71.285,
 'longitude_min': -170.7221,
 'longitude_max': 172.6333,
 'phone_2_norm_min': 0,
 'phone_2_norm_max': 99}

In [22]:
str(list(scaler_params))

"['amount_min', 'amount_max', 'bank_currency_min', 'bank_currency_max', 'bin_min', 'bin_max', 'count_months_to_end_card_min', 'count_months_to_end_card_max', 'day_of_week_min', 'day_of_week_max', 'hour_min', 'hour_max', 'is_city_resolved_min', 'is_city_resolved_max', 'is_gender_undefined_min', 'is_gender_undefined_max', 'latitude_min', 'latitude_max', 'longitude_min', 'longitude_max', 'phone_2_norm_min', 'phone_2_norm_max']"

In [23]:
# Append the target as the last column of `train`.
# The builtin `int` replaces `np.int`: that alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
train['status'] = db_teach.status.astype(int).copy()

In [24]:
class CSVDataset(Dataset):
    """Dataset built from a DataFrame whose last column is the target.

    Attributes:
        X: float32 array holding every column except the last.
        y: float32 array of shape (n_rows, 1) with the label-encoded target.
    """

    def __init__(self, df): # path
        matrix = df.values
        # all columns but the last are features
        self.X = matrix[:, :-1].astype('float32')
        # the last column is the target: label-encode it, then keep it as a
        # float32 column vector
        encoded = LabelEncoder().fit_transform(matrix[:, -1])
        self.y = encoded.astype('float32').reshape((len(encoded), 1))

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``[features, target]`` for the row(s) selected by `idx`."""
        features, target = self.X[idx], self.y[idx]
        return [features, target]

    def get_splits(self, n_test=0):
        """Randomly split into (train, test) subsets.

        `n_test` is the fraction of rows assigned to the test subset; with
        the default of 0 the test subset is empty.
        """
        total = len(self)
        test_size = round(n_test * total)
        return random_split(self, [total - test_size, test_size])

In [25]:
# model definition
class MLP(Module):
    """Three-layer perceptron: n_inputs -> 10 -> 6 -> 1 with a sigmoid output."""

    def __init__(self, n_inputs):
        super().__init__()
        # layer 1: n_inputs -> 10, Kaiming-uniform weights, ReLU activation
        self.hidden1 = Linear(n_inputs, 10)
        kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
        self.act1 = ReLU()
        # layer 2: 10 -> 6, Kaiming-uniform weights, ReLU activation
        self.hidden2 = Linear(10, 6)
        kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
        self.act2 = ReLU()
        # output layer: 6 -> 1, Xavier-uniform weights, sigmoid activation
        self.hidden3 = Linear(6, 1)
        xavier_uniform_(self.hidden3.weight)
        self.act3 = Sigmoid()

    def forward(self, X):
        """Run `X` through the three layers and return the sigmoid output."""
        first = self.act1(self.hidden1(X))
        second = self.act2(self.hidden2(first))
        return self.act3(self.hidden3(second))

In [26]:
# prepare the dataset
def prepare_data(df):
    """Wrap `df` in a CSVDataset and return ``(train_dl, test_dl)`` loaders.

    The split uses ``get_splits()`` with its default arguments. The training
    loader shuffles and uses batches of 32; the test loader keeps order and
    uses batches of 1024.
    """
    train_part, test_part = CSVDataset(df).get_splits()
    loaders = (
        DataLoader(train_part, batch_size=32, shuffle=True),
        DataLoader(test_part, batch_size=1024, shuffle=False),
    )
    return loaders

In [27]:
def train_model(train_dl, model, optimizer=None, n_epoch=100, lr=0.01, momentum=0.9):
    """Train `model` in place with binary cross-entropy loss.

    Parameters
    ----------
    train_dl : iterable
        Yields ``(inputs, targets)`` batches; it is iterated once per epoch.
    model : callable
        Maps a batch of inputs to predictions accepted by ``BCELoss``.
    optimizer : optional
        Optimizer to use. When None, an SGD optimizer over
        ``model.parameters()`` is created from `lr` and `momentum`.
    n_epoch : int
        Number of passes over `train_dl`.
    lr, momentum : float
        Only used when `optimizer` is None.

    Returns
    -------
    None
        The model's parameters are updated in place.
    """
    criterion = BCELoss()
    # Identity check: `== None` would go through the object's __eq__.
    if optimizer is None:
        optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    for _ in range(n_epoch):
        for inputs, targets in train_dl:
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()

In [28]:
def predict(row, model):
    """Run `model` on a single row and return its output as a numpy array.

    The row is wrapped in a one-element batch before being passed to the
    model, and the result is detached from the autograd graph.
    """
    batch = Tensor([row])
    return model(batch).detach().numpy()

In [29]:
def get_db_test_prediction(db_test, model, numeric_cols):
    """Return the model's first output value for every row of `db_test`.

    Parameters
    ----------
    db_test : pandas.DataFrame
        Frame holding at least the columns listed in `numeric_cols`.
    model : callable
        Maps a float tensor of shape (n_rows, n_cols) to a 2-D output; only
        the first output column is kept.
    numeric_cols : list
        Columns, in order, passed to the model as features.

    Returns
    -------
    list
        One prediction per row, in row order; empty for an empty frame.
    """
    features = np.asarray(db_test[numeric_cols].values, dtype='float32')
    if len(features) == 0:
        return []
    # One batched forward pass instead of one model call (and one tensor
    # construction) per row: same predictions up to float rounding, without
    # the per-row Python overhead.
    yhat = model(Tensor(features)).detach().numpy()
    return list(yhat[:, 0])

In [30]:
train_dl, _ = prepare_data(train)

In [31]:
n_features = len(COL_FACTORS)
model = MLP(n_features)

In [32]:
model.__class__

__main__.MLP

In [33]:
analyzer_prediction =  AnalyzerPrediction(db_teach, db_test, white)

In [34]:
result_df_amount = None

In [35]:
lr = 0.01
momentum = 0.9

# Sweep over epoch counts; each configuration adds its rows to result_df_amount.
for n_epoch in [30]:
    # Start every configuration from freshly initialised weights. Reusing one
    # model would make each run continue from the previous one (and every
    # re-run of this cell would keep training it), so the epoch count written
    # into `description` would understate the training actually done.
    model = MLP(n_features)
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    train_model(train_dl, model, optimizer, n_epoch=n_epoch, lr=lr, momentum=momentum)

    test_probability = get_db_test_prediction(test, model, COL_FACTORS)
    # NOTE(review): analyzer_prediction was constructed with this same db_test
    # object and presumably reads the "probability" column from it -- confirm.
    db_test["probability"] = test_probability

    description = '-'.join(str(elem) for elem in (n_epoch, momentum, lr))
    result_df_amount = analyzer_prediction.get_table_prediction(description=description, result_df=result_df_amount, metric="amount")
    print('n_epoch={}'.format(n_epoch))


n_epoch=30


In [36]:
# Keep every second row of the results table (rows 0, 2, 4, ...).
# NOTE(review): presumably get_table_prediction emits two rows per
# description and only the first is wanted here -- confirm.
n = result_df_amount.shape[0]
sub_rows = list(range(0, n, 2))
stat_best = result_df_amount.iloc[sub_rows, :].copy()

col_names = [col for col in stat_best.columns if col.startswith('p_')]
if col_names:
    # Assign the columns directly instead of through `.loc[:, col_names] = ...`:
    # the `.loc` form writes into the existing columns in place, which on
    # pandas >= 2.0 can leave their original (e.g. object) dtype unchanged, so
    # the float conversion would silently not take effect.
    stat_best[col_names] = stat_best[col_names].astype(float)
stat_best = stat_best.sort_values(by="rating", ascending=False)

In [37]:
stat_best.iloc[:,:15]

Unnamed: 0,description,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_10,p_20,rating,n_white_list,n_test_in_wl,n_test_bad_in_wl,amount_test_in_wl
0,30-0.9-0.01,5.13,11.15,16.13,18.97,20.18,22.07,23.61,28.78,51.73,152.07,1019125,22992,34,1640236.53
