In [691]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.preprocessing import Normalizer
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score
import pandas as pd
import numpy as np
import sys
sys.path.append('..')
from utils.ScoreFunction import score_function

random_seed = 42
np.random.seed(random_seed)

## Пример работы скоринговой функции

In [692]:
y_true = [1, 0, 0, 1, 1, 1, 0, 0, 0, 0]
y_pred = [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

In [693]:
(tp,fp,fn,tn)

(1, 2, 3, 4)

In [694]:
score_function(tp,fp,fn,tn)

-60

## Импорт данных

In [716]:
data = pd.read_csv("../data/train.csv",sep='|')
test = pd.read_csv("../data/test.csv",sep='|')
X = data.drop(["fraud"],axis=1)
y = data["fraud"]

X['totalScanned'] = X['scannedLineItemsPerSecond']*X['totalScanTimeInSeconds']
X['avgTimePerScan'] = 1/X['scannedLineItemsPerSecond']
X['avgValuePerScan'] = X['avgTimePerScan']*X['valuePerSecond']
X['withoutRegisPerPosition'] = X['scansWithoutRegistration']*X['totalScanned']
X['quantityModsPerPosition'] = X['quantityModifications']/X['totalScanned']

In [717]:
from sklearn.decomposition import PCA,TruncatedSVD,FactorAnalysis

pca = PCA(n_components=8)
pca.fit(data.drop(["fraud"],axis=1).append(test, ignore_index = True))

PCA(copy=True, iterated_power='auto', n_components=8, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [720]:
data.drop(["fraud"],axis=1)

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,5,1054,54.70,7,0,3,0.027514,0.051898,0.241379
1,3,108,27.36,5,2,4,0.129630,0.253333,0.357143
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111
5,1,770,11.09,11,5,2,0.033766,0.014403,0.423077
6,3,294,55.63,2,7,1,0.037415,0.189218,0.181818
7,2,1545,22.80,0,8,4,0.006472,0.014757,0.000000
8,6,962,65.44,7,0,2,0.028067,0.068025,0.259259
9,2,725,41.08,10,2,4,0.037241,0.056662,0.370370


In [718]:
pca_ = pca.transform(data.drop(["fraud"],axis=1))
pca_.shape

(1879, 8)

In [697]:
X = np.array(X)
X = np.concatenate((X,pca_),axis=1)
y = np.array(y)

## Создание и обучение линейной регрессии, валидация

In [704]:
nfolds = [5]

In [705]:
score_results, f1_results, recall_results = [], [], []

In [706]:
for n_folds in nfolds:
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    for train_index, test_index in skf.split(X, y):
        clf = LogisticRegression(random_state=random_seed)
        clf.fit(X[train_index], y[train_index])
        y_pred = clf.predict(X[test_index])
        tn, fp, fn, tp = confusion_matrix(y[test_index], y_pred).ravel()
        score_results.append(score_function(tp,fp,fn,tn))
        f1_results.append(f1_score(y[test_index], y_pred))
        recall_results.append(recall_score(y[test_index], y_pred))

In [707]:
np.array(score_results).mean()

61.0

In [708]:
np.array(f1_results).mean()

0.9311497011213404

In [709]:
np.array(recall_results).mean()

0.9142857142857143