In [691]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.preprocessing import Normalizer
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score
import pandas as pd
import numpy as np
import sys
sys.path.append('..')
from utils.ScoreFunction import score_function

random_seed = 42
np.random.seed(random_seed)

## Пример работы скоринговой функции

In [692]:
y_true = [1, 0, 0, 1, 1, 1, 0, 0, 0, 0]
y_pred = [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

In [693]:
(tp,fp,fn,tn)

(1, 2, 3, 4)

In [694]:
score_function(tp,fp,fn,tn)

-60

## Импорт данных

In [695]:
data = pd.read_csv("../data/train.csv",sep='|')
X = data.drop(["fraud"],axis=1)
y = data["fraud"]

X['totalScanned'] = X['scannedLineItemsPerSecond']*X['totalScanTimeInSeconds']
X['avgTimePerScan'] = 1/X['scannedLineItemsPerSecond']
X['avgValuePerScan'] = X['avgTimePerScan']*X['valuePerSecond']
X['withoutRegisPerPosition'] = X['scansWithoutRegistration']*X['totalScanned']
X['quantityModsPerPosition'] = X['quantityModifications']/X['totalScanned']

In [696]:
from sklearn.decomposition import PCA,TruncatedSVD,FactorAnalysis

pca = PCA(n_components=8)
pca_ = pca.fit_transform(data.drop(["fraud"],axis=1))

In [697]:
X = np.array(X)
X = np.concatenate((X,pca_),axis=1)
y = np.array(y)

## Создание и обучение линейной регрессии, валидация

In [704]:
nfolds = [5]

In [705]:
score_results, f1_results, recall_results = [], [], []

In [706]:
for n_folds in nfolds:
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    for train_index, test_index in skf.split(X, y):
        clf = LogisticRegression(random_state=random_seed)
        clf.fit(X[train_index], y[train_index])
        y_pred = clf.predict(X[test_index])
        tn, fp, fn, tp = confusion_matrix(y[test_index], y_pred).ravel()
        score_results.append(score_function(tp,fp,fn,tn))
        f1_results.append(f1_score(y[test_index], y_pred))
        recall_results.append(recall_score(y[test_index], y_pred))

In [707]:
np.array(score_results).mean()

61.0

In [708]:
np.array(f1_results).mean()

0.9311497011213404

In [709]:
np.array(recall_results).mean()

0.9142857142857143