In [1]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.preprocessing import Normalizer
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score
import pandas as pd
import numpy as np
import sys
sys.path.append('..')
from utils.ScoreFunction import score_function, other_score_function

random_seed = 42
np.random.seed(random_seed)
pd.options.mode.chained_assignment = None

## Импорт данных

In [14]:
data = pd.read_csv("../data/train.csv",sep='|')
test = pd.read_csv("../data/test.csv",sep='|')
X = data.drop(["fraud"],axis=1)
y = data["fraud"]

#выбираем такие трешхолды в соотв. со статистиками из data_exploring
THRESHOLDS = {'scannedLineItemsPerSecond':1, 'valuePerSecond':1, 'lineItemVoidsPerPosition':1}
for f in list(THRESHOLDS.keys()):
    indicator = X[f] > THRESHOLDS[f]
    X[f].loc[indicator] = X[f].mean()

X['totalScanned'] = X['scannedLineItemsPerSecond']*X['totalScanTimeInSeconds']
X['avgTimePerScan'] = 1/X['scannedLineItemsPerSecond']
X['avgValuePerScan'] = X['avgTimePerScan']*X['valuePerSecond']
X['withoutRegisPerPosition'] = X['scansWithoutRegistration']*X['totalScanned']
X['quantityModsPerPosition'] = X['quantityModifications']/X['totalScanned']

In [15]:
from sklearn.decomposition import PCA,TruncatedSVD,FactorAnalysis

pca = PCA(n_components=2)
pca.fit(data.drop(["fraud"],axis=1).append(test, ignore_index = True))

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [16]:
pca_ = pca.transform(data.drop(["fraud"],axis=1))

In [17]:
X = np.array(X)
X = np.concatenate((X,pca_),axis=1)
y = np.array(y)

## Создание и обучение линейной регрессии, валидация

In [18]:
nfolds = [3, 5, 7, 10]

In [19]:
score_results, f1_results, recall_results = [], [], []

In [20]:
for n_folds in nfolds:
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    for train_index, test_index in skf.split(X, y):
        clf = LogisticRegression(random_state=random_seed)
        clf.fit(X[train_index], y[train_index])
        y_pred = clf.predict(X[test_index])
        tn, fp, fn, tp = confusion_matrix(y[test_index], y_pred).ravel()
        score_results.append(score_function(tp,fp,fn,tn))
        f1_results.append(f1_score(y[test_index], y_pred))
        recall_results.append(recall_score(y[test_index], y_pred))

In [21]:
np.array(score_results).mean()

40.0

In [22]:
np.array(f1_results).mean()

0.9183750294297465

In [23]:
np.array(recall_results).mean()

0.9025118411000764

In [24]:
other_score_function(X,y,LogisticRegression(random_state=random_seed))

(29.0, False)

### Результаты бейзлайна следующие (кросс-валидация на 3,5,7,10 фолдах):
### - Наш скор: 41 евро
### - Не наш скор: 29 евро
### - F1: 0.919
### - recall: 0.902

### Чтобы модель прошла проверку необходимо побить ВСЕ эти параметры