In [1]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.preprocessing import Normalizer
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

import sys
sys.path.append('..')
from utils.ScoreFunction import score_function, other_score_function
from utils.CustomClassifiers import CustomModelWithThreshold
# Single seed reused by NumPy and by every estimator / CV splitter below.
random_seed = 42
np.random.seed(random_seed)

# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.set_option("mode.chained_assignment", None)

# Polynomial feature columns that are appended to the base frame before modelling.
# The names follow the PolynomialFeatures naming scheme used further down
# (factors separated by a space, powers written as "^k") and are used as column
# labels when indexing X_poly / X_test_poly.
# NOTE(review): "trustLevel" is also a raw input column, so appending it produces
# a duplicated "trustLevel" column in X_tmp - confirm this is intended.
selected_features = [
    "trustLevel",
    "trustLevel quantityModifications",
    "trustLevel withoutRegisPerPosition",
    "totalScanTimeInSeconds quantityModifications",
    "lineItemVoids^2",
    "lineItemVoids valuePerSecond",
    "lineItemVoids totalScanned",
    "scansWithoutRegistration avgValuePerScan",
    "valuePerSecond avgValuePerScan withoutRegisPerPosition",
    "lineItemVoidsPerPosition totalScanTimeInSecondsStdNorm^2",
    "lineItemVoidsPerPosition totalScanTimeInSecondsStdNorm totalScanned",
    "lineItemVoidsPerPosition totalScanTimeInSecondsStdNorm quantityModsPerPosition"
]

def dmc_profit(y_true, y_pred):
    """Return the total profit of binary predictions under the cost matrix.

    Every true positive earns 5, every false positive costs 25 and every
    false negative costs 5; true negatives contribute nothing.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, where 1 marks the positive class.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.

    Returns
    -------
    int
        ``5 * TP - 25 * FP - 5 * FN``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Count the masks in NumPy; the built-in sum() would iterate over the
    # arrays one element at a time in Python.
    true_pos = np.count_nonzero((y_true == 1) & (y_pred == 1))
    false_pos = np.count_nonzero((y_true == 0) & (y_pred == 1))
    false_neg = np.count_nonzero((y_true == 1) & (y_pred == 0))

    return 5 * true_pos - 25 * false_pos - 5 * false_neg

## Импорт данных

In [2]:
# Read the pipe-separated training and test tables.
data = pd.read_csv("../data/train.csv", sep='|')
X_test = pd.read_csv("../data/test.csv", sep='|')

# Separate the "fraud" target from the predictor columns of the training table.
y = data["fraud"]
X = data.drop(columns=["fraud"])

# Columns whose extreme values are clipped; only the keys are used below.
THRESHOLDS = {'scannedLineItemsPerSecond':1, 'valuePerSecond':1, 'lineItemVoidsPerPosition':1}

# Clip every listed column to its own 1st-99th percentile range.
# NOTE(review): the test frame is clipped with quantiles computed on the test
# frame itself, not with the training quantiles - confirm this is intended.
for column in THRESHOLDS:
    for frame in (X, X_test):
        lower_bound = frame[column].quantile(0.01)
        upper_bound = frame[column].quantile(0.99)
        frame[column] = frame[column].clip(lower=lower_bound, upper=upper_bound)
    
    
def _add_engineered_features(frame):
    """Add the derived columns to ``frame`` in place.

    The scan time is standardised with the mean and standard deviation of
    ``frame`` itself, so the training and test frames are each scaled with
    their own statistics.
    """
    scan_time = frame['totalScanTimeInSeconds']
    frame['totalScanTimeInSecondsStdNorm'] = (scan_time - scan_time.mean())/scan_time.std()

    frame['totalScanned'] = frame['scannedLineItemsPerSecond']*frame['totalScanTimeInSeconds']
    frame['avgTimePerScan'] = 1/frame['scannedLineItemsPerSecond']
    frame['avgValuePerScan'] = frame['avgTimePerScan']*frame['valuePerSecond']
    # NOTE(review): named "...PerPosition" but computed as a product, while
    # quantityModsPerPosition below is a ratio - confirm which one is intended.
    frame['withoutRegisPerPosition'] = frame['scansWithoutRegistration']*frame['totalScanned']
    frame['quantityModsPerPosition'] = frame['quantityModifications']/frame['totalScanned']


# Apply the same derivations to the training frame and then to the test frame.
_add_engineered_features(X)
_add_engineered_features(X_test)

cols = list(X.columns)+[1,2]

### Polynomial features

In [3]:
# Expand the input columns into every polynomial term up to degree 3
# (no rescaling happens in this cell).
polyFeatures = PolynomialFeatures(3, interaction_only=False)
# fit() only records the input columns, not their values, so fitting on the
# training frame is sufficient. This also avoids DataFrame.append (removed in
# pandas 2.0) and an extra concatenated copy of train + test.
polyFeatures.fit(X)

X_poly = polyFeatures.transform(X)
X_test_poly = polyFeatures.transform(X_test)

# Drop the first output column: it is the constant (bias) term.
X_poly = X_poly[:, 1:]
X_test_poly = X_test_poly[:, 1:]

# get_feature_names() was removed in scikit-learn 1.2; use the replacement when
# it exists and fall back to the old method on older releases.
if hasattr(polyFeatures, "get_feature_names_out"):
    features = list(polyFeatures.get_feature_names_out(input_features=X.columns))[1:]
else:
    features = polyFeatures.get_feature_names(input_features=X.columns)[1:]

In [4]:
# Label the polynomial matrices so that individual terms can be picked by name.
X_poly = pd.DataFrame(X_poly, columns=features)
X_test_poly = pd.DataFrame(X_test_poly, columns=features)

# Append all selected polynomial columns in one concat. Concatenating inside a
# loop recopies the whole frame once per feature; the resulting columns and
# their order are the same as with the one-by-one version.
X_tmp = pd.concat([X, X_poly[selected_features]], axis=1)
X_test_tmp = pd.concat([X_test, X_test_poly[selected_features]], axis=1)

### Процесс отбора полиномиальных фичей

In [5]:
def fast_evaluate(X, y, threshold=0.7, n_splits=10):
    """Cross-validate a thresholded logistic regression and return its total score.

    Parameters
    ----------
    X : array-like
        Feature matrix; converted to a NumPy array.
    y : array-like
        Binary target aligned with the rows of ``X``.
    threshold : float, default 0.7
        Second argument passed to ``CustomModelWithThreshold``
        (presumably the decision cut-off - confirm in utils.CustomClassifiers).
    n_splits : int, default 10
        Number of stratified, shuffled folds.

    Returns
    -------
    Sum of ``score_function(tp, fp, fn, tn)`` over all folds.
    """
    X = np.array(X)
    y = np.array(y)
    score_results = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    for train_index, test_index in skf.split(X, y):
        clf = LogisticRegression(random_state=random_seed)
        clf_t = CustomModelWithThreshold(clf, threshold)
        clf_t.fit(X[train_index], y[train_index])
        y_pred = clf_t.predict(X[test_index])
        # labels=[0, 1] keeps the matrix 2x2 even when a fold contains (and the
        # model predicts) a single class; otherwise the 4-way unpack would fail.
        tn, fp, fn, tp = confusion_matrix(y[test_index], y_pred, labels=[0, 1]).ravel()
        score_results.append(score_function(tp, fp, fn, tn))
    return sum(score_results)

In [6]:
# Disabled cell: the selection procedure is kept as a string literal, so running
# this cell executes none of it and only displays the text. The text walks over
# every name in `features`, scores X_tmp plus that one column with
# fast_evaluate, and keeps the column whenever the score beats the best so far.
"""score = -10000
last_score = -10000
selected_features = []

for f in features:
    X_check = pd.concat([X_tmp,pd.Series(X_poly[f])], axis=1)
    score = fast_evaluate(X_check, y)
   
    if score > last_score:
        X_tmp = pd.concat([X_tmp,pd.Series(X_poly[f])], axis=1)
        selected_features.append(f)
        last_score = score    
        print(last_score,f)"""

'score = -10000\nlast_score = -10000\nselected_features = []\n\nfor f in features:\n    X_check = pd.concat([X_tmp,pd.Series(X_poly[f])], axis=1)\n    score = fast_evaluate(X_check, y)\n   \n    if score > last_score:\n        X_tmp = pd.concat([X_tmp,pd.Series(X_poly[f])], axis=1)\n        selected_features.append(f)\n        last_score = score    \n        print(last_score,f)'

In [7]:
# Preview the first rows of the training frame with the selected polynomial columns appended.
X_tmp.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalScanTimeInSecondsStdNorm,...,trustLevel withoutRegisPerPosition,totalScanTimeInSeconds quantityModifications,lineItemVoids^2,lineItemVoids valuePerSecond,lineItemVoids totalScanned,scansWithoutRegistration avgValuePerScan,valuePerSecond avgValuePerScan withoutRegisPerPosition,lineItemVoidsPerPosition totalScanTimeInSecondsStdNorm^2,lineItemVoidsPerPosition totalScanTimeInSecondsStdNorm totalScanned,lineItemVoidsPerPosition totalScanTimeInSecondsStdNorm quantityModsPerPosition
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0.229837,...,0.0,3162.0,49.0,0.363283,203.0,0.0,0.0,0.012751,1.608857,0.005739
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,-1.554582,...,84.0,432.0,25.0,1.266667,70.0,3.908571,13.8624,0.863116,-7.77291,-0.158631
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,1.101297,...,390.0,7580.0,9.0,0.123008,39.0,47.815385,25.48724,0.27989,3.303891,0.097748
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,1.620023,...,696.0,7164.0,64.0,0.412328,232.0,12.732414,19.031013,0.723993,12.960187,0.061642
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,-0.947201,...,945.0,860.0,9.0,0.568814,81.0,21.137407,108.20927,0.099688,-2.841602,-0.007796


In [8]:
# From here on X and y are plain NumPy arrays, so they can be indexed with the
# positional fold indices produced by the CV splitter.
# Note: this rebinds X (previously the DataFrame without polynomial columns).
y = np.array(y)
X = np.array(X_tmp)

In [9]:
def profit_scorer(y, y_pred):
    """Sum the per-sample payoff of predictions ``y_pred`` against labels ``y``.

    The payoff table is keyed by ``(predicted, actual)``: a correct positive
    earns 5, a positive prediction on a negative sample costs 25, a missed
    positive costs 5 and a correct negative is worth 0.
    """
    payoff = {(0,0): 0, (0,1): -5, (1,0): -25, (1,1): 5}
    total = 0
    for actual, pred in zip(y, y_pred):
        total += payoff[(pred, actual)]
    return total

## Создание и обучение лог регрессии, валидация

In [10]:
# Fold counts for which the stratified cross-validation below is repeated.
nfolds = [3,5,7,10]

In [40]:
# Per-fold metric accumulators filled by the cross-validation loop:
# score_function, dmc_profit, profit_scorer, F1 and recall respectively.
score_results, score2_results,score3_results, f1_results, recall_results = [], [], [], [], []

In [41]:
# Re-create the accumulators in this cell so that re-running it on its own does
# not append a second copy of every fold's metrics to lists left in the kernel.
score_results, score2_results, score3_results, f1_results, recall_results = [], [], [], [], []

# Repeat stratified, shuffled CV for every fold count in nfolds and record the
# metrics of each individual fold.
for n_folds in nfolds:
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    for train_index, test_index in skf.split(X, y):
        clf = LogisticRegression(random_state=random_seed)
        clf_t = CustomModelWithThreshold(clf, 0.68)
        clf_t.fit(X[train_index], y[train_index])
        y_fold = y[test_index]
        y_pred = clf_t.predict(X[test_index]).astype(int)
        # labels=[0, 1] keeps the matrix 2x2 even when a fold contains (and the
        # model predicts) a single class; otherwise the 4-way unpack would fail.
        tn, fp, fn, tp = confusion_matrix(y_fold, y_pred, labels=[0, 1]).ravel()
        score_results.append(score_function(tp, fp, fn, tn))
        score2_results.append(dmc_profit(y_fold, y_pred))
        score3_results.append(profit_scorer(y_fold, y_pred))
        f1_results.append(f1_score(y_fold, y_pred))
        recall_results.append(recall_score(y_fold, y_pred))

In [42]:
# Average of each metric over all recorded folds, in the order:
# score_function, dmc_profit, profit_scorer, F1, recall.
tuple(
    np.mean(fold_values)
    for fold_values in (score_results, score2_results, score3_results, f1_results, recall_results)
)

(50.8, 50.8, 50.8, 0.9417598028887869, 0.931230455818691)

### Результаты бейзлайна следующие (кросс-валидация на 3,5,7,10 фолдах):
### - Наш скор: 41 евро
### - Не наш скор: 29 евро
### - F1: 0.919
### - recall: 0.902

### Чтобы модель прошла проверку, необходимо побить ВСЕ эти параметры

## Тестовый предикт

In [24]:
# NumPy copy of the test frame (base columns plus the selected polynomial columns).
X_test_ = np.array(X_test_tmp)

In [25]:
# Refit on the full training data with the same 0.68 value used in the
# cross-validation cell, then predict integer labels for the test set.
clf = LogisticRegression(random_state=random_seed)
clf_t = CustomModelWithThreshold(clf,0.68)
clf_t.fit(X, y)
y_pred = clf_t.predict(X_test_).astype(int)

In [26]:
# Load the reference predictions to compare against; the file has no header row.
y_pred_kernel = pd.read_csv("../data/kernel-prediction.csv",header=None)

In [27]:
# Keep only the first column as a 1-D NumPy array.
# Note: this rebinds y_pred_kernel from a DataFrame to an array, so the cell is
# not safe to run twice without re-running the load above.
y_pred_kernel = np.array(y_pred_kernel[0])

In [28]:
# Sanity check: number of reference predictions.
y_pred_kernel.shape

(498121,)

In [29]:
# Sanity check: number of our predictions (should match the reference above).
y_pred.shape

(498121,)

In [30]:
# Collect every test row where our prediction differs from the reference one.
# Each entry is (row index, our prediction, reference prediction).
if y_pred.shape != y_pred_kernel.shape:
    raise ValueError(
        "prediction arrays differ in shape: %s vs %s" % (y_pred.shape, y_pred_kernel.shape)
    )

# Vectorised comparison instead of a Python loop over every test row.
mismatch_idx = np.flatnonzero(y_pred != y_pred_kernel)
not_equal_preds = list(
    zip(
        mismatch_idx.tolist(),
        y_pred[mismatch_idx].tolist(),
        y_pred_kernel[mismatch_idx].tolist(),
    )
)

for i, ours, reference in not_equal_preds:
    print(i, ours, reference)

269 1 0
289 0 1
502 0 1
517 1 0
525 1 0
623 0 1
960 0 1
997 0 1
1018 0 1
1082 0 1
1107 1 0
1108 1 0
1187 0 1
1390 0 1
1498 1 0
1608 0 1
1609 1 0
1719 0 1
1729 1 0
1810 0 1
2098 0 1
2180 0 1
2284 0 1
2296 1 0
2302 0 1
2318 0 1
2421 0 1
2663 0 1
2680 0 1
2690 0 1
2704 0 1
2714 0 1
2820 0 1
3089 0 1
3107 0 1
3209 1 0
3227 1 0
3272 0 1
3324 0 1
3473 0 1
4089 1 0
4113 1 0
4199 1 0
4201 0 1
4229 0 1
4490 0 1
4603 0 1
4784 0 1
4897 1 0
4912 0 1
4990 0 1
5021 0 1
5042 1 0
5149 0 1
5192 0 1
5226 0 1
5250 1 0
5252 0 1
5340 1 0
5420 1 0
5480 0 1
5636 1 0
5702 0 1
5707 0 1
5796 0 1
5877 0 1
5942 0 1
5948 0 1
5965 0 1
5982 0 1
6088 1 0
6190 0 1
6529 1 0
6911 0 1
6946 0 1
7037 0 1
7113 1 0
7136 0 1
7148 1 0
7334 0 1
7361 0 1
7499 0 1
7521 0 1
7556 1 0
7559 1 0
7579 0 1
7630 0 1
7754 0 1
7767 0 1
7853 0 1
7921 0 1
7974 1 0
8149 0 1
8156 1 0
8206 0 1
8239 1 0
8288 0 1
8370 0 1
8395 1 0
8425 0 1
8667 1 0
8722 0 1
8771 1 0
8841 0 1
8853 0 1
8856 0 1
8964 1 0
9025 1 0
9117 0 1
9208 0 1
9230 0 1
9269 0 1


85505 1 0
85621 1 0
85707 0 1
85728 0 1
85942 0 1
86081 0 1
86203 0 1
86338 1 0
86559 0 1
86626 0 1
86675 0 1
86690 1 0
86706 0 1
86782 0 1
86828 1 0
86846 1 0
86931 0 1
86935 1 0
86938 1 0
87011 0 1
87040 0 1
87157 0 1
87212 0 1
87250 0 1
87526 0 1
87615 1 0
87721 0 1
87801 0 1
87835 1 0
88251 0 1
88255 1 0
88261 1 0
88295 0 1
88425 1 0
88459 1 0
88536 0 1
88559 0 1
88730 0 1
88866 1 0
88922 1 0
88994 0 1
89004 0 1
89246 1 0
89261 0 1
89400 1 0
89440 0 1
89456 0 1
89470 0 1
89774 0 1
89820 1 0
89848 1 0
89892 1 0
89929 0 1
89939 0 1
89998 1 0
90012 1 0
90015 0 1
90131 0 1
90201 0 1
90456 0 1
90582 0 1
90599 1 0
90651 0 1
90854 0 1
90890 1 0
90948 0 1
91036 0 1
91089 0 1
91244 0 1
91315 0 1
91535 1 0
91584 1 0
91686 1 0
91943 0 1
91985 0 1
92007 0 1
92189 0 1
92215 0 1
92260 1 0
92287 0 1
92332 0 1
92353 0 1
92428 0 1
92483 0 1
92537 0 1
92569 1 0
92687 1 0
92734 0 1
92836 0 1
92952 0 1
93373 0 1
93432 1 0
93446 1 0
93495 0 1
93621 0 1
93649 1 0
93827 1 0
93859 0 1
93900 0 1
93949 0 1


175962 0 1
176020 1 0
176174 0 1
176183 0 1
176267 0 1
176323 0 1
176422 1 0
176578 0 1
176936 0 1
176938 0 1
177052 1 0
177093 0 1
177129 0 1
177177 0 1
177287 0 1
177380 0 1
177422 0 1
177456 0 1
177463 0 1
177492 0 1
177576 0 1
177685 0 1
177753 1 0
177775 0 1
177835 0 1
177957 1 0
177999 0 1
178013 0 1
178091 1 0
178140 1 0
178276 0 1
178375 1 0
178491 0 1
178513 0 1
178894 0 1
178996 0 1
179087 1 0
179091 1 0
179168 1 0
179176 0 1
179281 0 1
179387 0 1
179547 1 0
179563 0 1
179603 1 0
179765 1 0
179831 0 1
179836 1 0
179874 1 0
180032 1 0
180106 1 0
180153 0 1
180213 0 1
180248 1 0
180266 1 0
180283 0 1
180295 0 1
180351 0 1
180459 0 1
180552 1 0
180563 0 1
180725 0 1
180825 0 1
180883 0 1
180966 1 0
181019 1 0
181319 0 1
181486 0 1
181757 0 1
181800 0 1
182019 1 0
182193 1 0
182243 1 0
182262 0 1
182269 1 0
182496 0 1
182567 0 1
182568 0 1
182604 1 0
182677 0 1
182694 0 1
182880 1 0
182946 0 1
182980 0 1
182984 1 0
183171 1 0
183194 0 1
183292 0 1
183423 0 1
183515 1 0
183559 0 1

257617 0 1
257722 0 1
257770 1 0
257824 0 1
257931 0 1
258242 0 1
258315 1 0
258388 0 1
258442 0 1
258456 1 0
258473 0 1
258575 0 1
258586 1 0
258676 0 1
258680 1 0
258720 0 1
258818 0 1
258853 1 0
258993 0 1
259044 1 0
259051 0 1
259063 1 0
259064 1 0
259095 0 1
259177 1 0
259216 0 1
259232 1 0
259265 1 0
259276 1 0
259336 1 0
259340 0 1
259411 0 1
259443 0 1
259455 1 0
259517 0 1
259550 1 0
259686 0 1
260086 0 1
260136 0 1
260170 1 0
260261 0 1
260352 0 1
260540 0 1
260623 1 0
260630 0 1
260697 1 0
260749 1 0
260874 1 0
260888 0 1
261044 0 1
261163 0 1
261213 0 1
261236 0 1
261350 0 1
261681 0 1
261771 0 1
262081 0 1
262216 1 0
262252 0 1
262374 0 1
262562 1 0
262591 0 1
262672 0 1
262793 1 0
262958 0 1
263124 0 1
263131 1 0
263150 0 1
263289 0 1
263491 0 1
263520 1 0
263529 0 1
263663 1 0
263743 0 1
263897 1 0
263904 0 1
263989 0 1
264128 0 1
264222 1 0
264455 1 0
264570 1 0
264615 0 1
264669 1 0
264746 0 1
264827 0 1
264977 0 1
265038 1 0
265071 1 0
265216 0 1
265263 0 1
265566 0 1

346756 1 0
346831 1 0
346848 1 0
347007 0 1
347045 0 1
347178 0 1
347317 0 1
347392 0 1
347443 1 0
347527 0 1
347593 0 1
347613 1 0
347637 0 1
347653 0 1
347696 1 0
347805 0 1
347894 0 1
347988 0 1
348019 1 0
348198 0 1
348230 0 1
348276 0 1
348315 0 1
348316 1 0
348525 1 0
348690 1 0
348751 1 0
348800 0 1
348844 1 0
348924 1 0
349145 0 1
349305 1 0
349424 0 1
349600 0 1
349623 0 1
349660 0 1
349748 0 1
349805 0 1
350127 0 1
350184 0 1
350216 1 0
350247 0 1
350519 0 1
350648 1 0
350684 0 1
350868 1 0
351041 0 1
351128 0 1
351170 0 1
351229 0 1
351232 1 0
351242 0 1
351440 0 1
351478 0 1
351529 1 0
351629 0 1
351703 0 1
351762 1 0
351954 0 1
351956 1 0
351957 1 0
351978 1 0
351980 1 0
352104 0 1
352143 0 1
352147 0 1
352185 0 1
352267 0 1
352313 0 1
352325 1 0
352369 1 0
352377 0 1
352792 0 1
352895 0 1
352912 0 1
352939 0 1
352985 0 1
353047 0 1
353067 0 1
353069 1 0
353090 1 0
353125 0 1
353255 0 1
353375 0 1
353436 1 0
353443 0 1
353485 1 0
353506 1 0
353546 0 1
353639 1 0
353643 0 1

434152 1 0
434196 0 1
434315 1 0
434373 0 1
434546 0 1
434726 0 1
434731 0 1
434821 1 0
434985 0 1
435027 0 1
435148 1 0
435247 1 0
435447 0 1
435534 0 1
435705 0 1
435732 1 0
435740 1 0
435776 0 1
435809 0 1
436197 0 1
436203 0 1
436276 0 1
436440 0 1
436481 0 1
436624 0 1
436681 0 1
436705 1 0
436833 0 1
436854 0 1
436906 0 1
437020 0 1
437270 0 1
437410 0 1
437576 0 1
437591 0 1
437614 0 1
438097 0 1
438099 0 1
438301 1 0
438349 1 0
438412 1 0
438440 0 1
438603 0 1
438681 0 1
438723 0 1
438783 1 0
438877 0 1
438950 0 1
438998 0 1
439141 1 0
439175 0 1
439221 0 1
439245 1 0
439366 0 1
439379 0 1
439795 0 1
439917 1 0
439934 0 1
440002 1 0
440026 0 1
440129 0 1
440225 1 0
440284 0 1
440290 0 1
440297 0 1
440343 0 1
440354 0 1
440369 0 1
440577 1 0
440598 0 1
440741 1 0
440796 0 1
440797 0 1
440823 0 1
440833 1 0
441093 0 1
441117 1 0
441321 1 0
441334 0 1
441364 0 1
441537 0 1
441619 1 0
441663 0 1
441689 1 0
441765 0 1
441790 1 0
441863 0 1
441940 0 1
442051 0 1
442127 1 0
442205 0 1

In [31]:
# Number of test rows on which the two prediction sets disagree.
len(not_equal_preds)

5741

In [23]:
# Inspect the features of test row 289, one of the rows listed above where we
# predict 0 and the reference prediction is 1.
X_test_tmp.iloc[289]

trustLevel                                                                           1.000000
totalScanTimeInSeconds                                                            1104.000000
grandTotal                                                                           6.580000
lineItemVoids                                                                        1.000000
scansWithoutRegistration                                                             0.000000
quantityModifications                                                                3.000000
scannedLineItemsPerSecond                                                            0.026268
valuePerSecond                                                                       0.005960
lineItemVoidsPerPosition                                                             0.034483
totalScanTimeInSecondsStdNorm                                                        0.356280
totalScanned                                                