In [196]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.preprocessing import Normalizer
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score
import pandas as pd
import numpy as np
import sys
sys.path.append('..')
from utils.ScoreFunction import score_function, other_score_function

random_seed = 42
np.random.seed(random_seed)
pd.options.mode.chained_assignment = None

## Импорт данных

In [759]:
data = pd.read_csv("../data/train.csv",sep='|')
test = pd.read_csv("../data/test.csv",sep='|')
X = data.drop(["fraud"],axis=1)
y = data["fraud"]

#выбираем такие трешхолды в соотв. со статистиками из data_exploring
THRESHOLDS = {'scannedLineItemsPerSecond':1, 'valuePerSecond':1, 'lineItemVoidsPerPosition':1}
for x in list(THRESHOLDS.keys()):
    data[x] = data[x].clip(lower=data[x].quantile(0.05),upper=data[x].quantile(0.95))
    #indicator = X[f] > THRESHOLDS[f]
    #X[f].loc[indicator] = X[f].mean()

X['totalScanned'] = X['scannedLineItemsPerSecond']*X['totalScanTimeInSeconds']
X['avgTimePerScan'] = 1/X['scannedLineItemsPerSecond']
X['avgValuePerScan'] = X['avgTimePerScan']*X['valuePerSecond']
X['withoutRegisPerPosition'] = X['scansWithoutRegistration']*X['totalScanned']
X['quantityModsPerPosition'] = X['quantityModifications']/X['totalScanned']

X['grandTotalCat'] = pd.cut(X['grandTotal'], 10,labels =[1,2,3,4,5,6,7,8,9,10])
X['totalScanTimeInSecondsCat'] = pd.cut(X['totalScanTimeInSeconds'], 2,labels =[1,2])
X['lineItemVoidsPerPositionCat'] = pd.cut(X['lineItemVoidsPerPosition'], 10,labels =[1,2,3,4,5,6,7,8,9,10])
X['avgTimePerScan'] = pd.cut(X['avgTimePerScan'], 4,labels =[1,2,3,4])

In [760]:
X.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalScanned,avgTimePerScan,avgValuePerScan,withoutRegisPerPosition,quantityModsPerPosition,grandTotalCat,totalScanTimeInSecondsCat,lineItemVoidsPerPositionCat
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,29.0,1,1.886207,0.0,0.103448,6,2,1
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,14.0,1,1.954286,28.0,0.285714,3,1,1
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,13.0,1,4.781538,130.0,0.384615,7,2,1
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,29.0,1,3.183103,116.0,0.137931,10,2,1
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,27.0,1,3.01963,189.0,0.074074,9,1,1


In [761]:
from sklearn.decomposition import PCA,TruncatedSVD,FactorAnalysis

pca = PCA(n_components=2)
pca.fit(data.drop(["fraud"],axis=1).append(test, ignore_index = True))
pca_ = pca.transform(data.drop(["fraud"],axis=1))

In [762]:
X = np.array(X)
X = np.concatenate((X,pca_),axis=1)
y = np.array(y)

In [763]:
def profit_scorer(y, y_pred):
        profit_matrix = {(0,0): 0, (0,1): -5, (1,0): -25, (1,1): 5}
        return sum(profit_matrix[(pred, actual)] for pred, actual in zip(y_pred, y))

## Создание и обучение лог регрессии, валидация

In [764]:
nfolds = [3, 5, 7, 10]

In [765]:
score_results, f1_results, recall_results = [], [], []

In [766]:
for n_folds in nfolds:
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    for train_index, test_index in skf.split(X, y):
        clf = LogisticRegression(random_state=random_seed)
        clf.fit(X[train_index], y[train_index])
        y_pred = clf.predict(X[test_index])
        tn, fp, fn, tp = confusion_matrix(y[test_index], y_pred).ravel()
        score_results.append(score_function(tp,fp,fn,tn))
        f1_results.append(f1_score(y[test_index], y_pred))
        recall_results.append(recall_score(y[test_index], y_pred))

In [767]:
(np.array(score_results).mean(),np.array(f1_results).mean(),np.array(recall_results).mean())

(51.4, 0.9382007683261583, 0.9247975553857907)

In [758]:
other_score_function(X,y,LogisticRegression(random_state=random_seed))

(32.5, True)

### Результаты бейзлайна следующие (кросс-валидация на 3,5,7,10 фолдах):
### - Наш скор: 41 евро
### - Не наш скор: 29 евро
### - F1: 0.919
### - recall: 0.902

### Чтобы модель прошла проверку необходимо побить ВСЕ эти параметры