In [1]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.preprocessing import Normalizer
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

import sys
sys.path.append('..')
from utils.ScoreFunction import score_function, other_score_function

# Single seed reused for NumPy and for every splitter/estimator in this notebook.
random_seed = 42
np.random.seed(random_seed)
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None

# Polynomial-expansion columns that are appended to the base feature frame
# further down. Each entry has to match a generated PolynomialFeatures column
# name exactly ("a b" is the product of a and b, "a^2" is a squared).
# NOTE(review): presumably the outcome of the disabled greedy selection cell
# later in the notebook - confirm before editing the list by hand.
selected_features = [
    "trustLevel",
    "trustLevel quantityModifications",
    "trustLevel withoutRegisPerPosition",
    "totalScanTimeInSeconds quantityModifications",
    "lineItemVoids^2",
    "lineItemVoids valuePerSecond",
    "lineItemVoids totalScanned",
    "scansWithoutRegistration avgValuePerScan",
    "valuePerSecond avgValuePerScan withoutRegisPerPosition",
    "lineItemVoidsPerPosition totalScanTimeInSecondsStdNorm^2",
    "lineItemVoidsPerPosition totalScanTimeInSecondsStdNorm totalScanned",
    "lineItemVoidsPerPosition totalScanTimeInSecondsStdNorm quantityModsPerPosition"
]

def dmc_profit(y_true, y_pred):
    """Return the profit score of binary predictions.

    Payoff per sample: +5 for a true positive, -25 for a false positive,
    -5 for a false negative and 0 for a true negative.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 / 1.
    y_pred : array-like
        Predicted labels, encoded as 0 / 1, same length as ``y_true``.

    Returns
    -------
    int
        Total profit over all samples (0 for empty input).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Count inside NumPy; the builtin sum() would iterate the boolean mask
    # one element at a time in Python.
    tp = np.count_nonzero((y_true == 1) & (y_pred == 1))
    fp = np.count_nonzero((y_true == 0) & (y_pred == 1))
    fn = np.count_nonzero((y_true == 1) & (y_pred == 0))
    # True negatives contribute nothing to the profit, so they are not counted.
    return int(5 * tp - 25 * fp - 5 * fn)

## Импорт данных

In [2]:
data = pd.read_csv("../data/train.csv", sep='|')
X_test = pd.read_csv("../data/test.csv", sep='|')
X = data.drop(["fraud"], axis=1)
y = data["fraud"]

# Columns whose tails get clipped; only the keys are used in this cell.
THRESHOLDS = {'scannedLineItemsPerSecond':1, 'valuePerSecond':1, 'lineItemVoidsPerPosition':1}


def _add_engineered_features(frame):
    """Clip the THRESHOLDS columns and append the derived columns to ``frame``.

    The frame is modified in place and also returned. All statistics
    (quantiles, mean, std) are computed on ``frame`` itself, so train and test
    are each normalised against their own distribution. The derived columns
    are added in a fixed order because the polynomial feature names generated
    later depend on column order.
    """
    # This worsens the validation score, BUT aligns the train and test
    # distributions of these features.
    for col in THRESHOLDS:
        frame[col] = frame[col].clip(lower=frame[col].quantile(0.01),
                                     upper=frame[col].quantile(0.97))

    scan_time = frame['totalScanTimeInSeconds']
    frame['totalScanTimeInSecondsStdNorm'] = (scan_time - scan_time.mean()) / scan_time.std()

    frame['totalScanned'] = frame['scannedLineItemsPerSecond'] * scan_time
    frame['avgTimePerScan'] = 1 / frame['scannedLineItemsPerSecond']
    frame['avgValuePerScan'] = frame['avgTimePerScan'] * frame['valuePerSecond']
    # NOTE(review): this is a product, while the other "...PerPosition" column
    # below divides by totalScanned. Left unchanged because the selected
    # polynomial features were chosen against this definition - confirm intent.
    frame['withoutRegisPerPosition'] = frame['scansWithoutRegistration'] * frame['totalScanned']
    frame['quantityModsPerPosition'] = frame['quantityModifications'] / frame['totalScanned']
    return frame


# One shared routine for both frames so the two pipelines cannot drift apart.
X = _add_engineered_features(X)
X_test = _add_engineered_features(X_test)

cols = list(X.columns)+[1,2]

### Polynomial features

In [None]:
# Expand the feature frame into all polynomial terms up to degree 3
# (no rescaling happens in this cell).
polyFeatures = PolynomialFeatures(3, interaction_only=False)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way.
polyFeatures.fit(pd.concat([X, X_test], ignore_index=True))

X_poly = polyFeatures.transform(X)
X_test_poly = polyFeatures.transform(X_test)

# remove the first var because it is the constant term
X_poly = X_poly[:,1:]
X_test_poly = X_test_poly[:,1:]

# get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; fall back to the old name on older installations.
if hasattr(polyFeatures, "get_feature_names_out"):
    _poly_names = polyFeatures.get_feature_names_out(list(X.columns))
else:
    _poly_names = polyFeatures.get_feature_names(input_features=X.columns)
# Drop the name of the constant term to stay aligned with the sliced arrays.
features = list(_poly_names)[1:]

In [None]:
X_poly = pd.DataFrame(X_poly, columns=features)
X_test_poly = pd.DataFrame(X_test_poly, columns=features)

# Append all selected polynomial columns with a single concat per frame.
# Growing the frame inside a loop re-copies it on every iteration; one concat
# yields the same columns in the same order. A name in selected_features that
# is also a base column of X ends up duplicated, exactly as before.
X_tmp = pd.concat([X, X_poly[selected_features]], axis=1)
X_test_tmp = pd.concat([X_test, X_test_poly[selected_features]], axis=1)

### Процесс отбора полиномиальных фичей

In [None]:
def fast_evaluate(X,y):
    """Score a feature set with a quick cross-validated logistic regression.

    Runs a shuffled 10-fold StratifiedKFold (seeded with ``random_seed``),
    fits a default LogisticRegression on each training split and feeds the
    confusion-matrix counts of the held-out split to ``score_function``.

    Parameters
    ----------
    X : array-like, one row per sample.
    y : array-like of 0 / 1 labels.

    Returns
    -------
    The sum of the per-fold ``score_function`` values.
    """
    X = np.array(X)
    y = np.array(y)
    score_results = []
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed)
    for train_index, test_index in skf.split(X, y):
        clf = LogisticRegression(random_state=random_seed)
        clf.fit(X[train_index], y[train_index])
        y_pred = clf.predict(X[test_index])
        # labels=[0, 1] pins the matrix to 2x2, so the four-way unpacking
        # cannot fail when a fold happens to contain a single class only.
        tn, fp, fn, tp = confusion_matrix(y[test_index], y_pred, labels=[0, 1]).ravel()
        score_results.append(score_function(tp,fp,fn,tn))
    return sum(score_results)

In [None]:
"""score = -10000
last_score = -10000
selected_features = []

for f in features:
    X_check = pd.concat([X_tmp,pd.Series(X_poly[f])], axis=1)
    score = fast_evaluate(X_check, y)
   
    if score > last_score:
        X_tmp = pd.concat([X_tmp,pd.Series(X_poly[f])], axis=1)
        selected_features.append(f)
        last_score = score    
        print(last_score,f)"""

In [None]:
# Preview the first rows of the assembled training frame (base + selected polynomial columns).
X_tmp.head()

In [None]:
# Switch to plain NumPy arrays so the fold indices below can be used for
# positional indexing. Note that X is rebound from the base frame to the
# assembled feature matrix here.
X, y = np.array(X_tmp), np.array(y)

In [None]:
def profit_scorer(y, y_pred):
    """Return the total profit of predictions ``y_pred`` against labels ``y``.

    Payoff per sample, keyed by (predicted, actual): correct fraud call +5,
    false alarm -25, missed fraud -5, correct non-fraud 0. Pairs are formed
    positionally; a label outside {0, 1} raises KeyError.
    """
    payoff = {(1, 1): 5, (1, 0): -25, (0, 1): -5, (0, 0): 0}
    total = 0
    for actual, predicted in zip(y, y_pred):
        total += payoff[(predicted, actual)]
    return total

## Создание и обучение лог регрессии, валидация

In [None]:
# Fold counts to cross-validate with; metrics from all settings are pooled together below.
nfolds = [3,5,7,10]

In [None]:
# Per-fold metric accumulators filled by the cross-validation loop below.
score_results, score2_results, f1_results, recall_results = [], [], [], []

In [None]:
# Start from empty accumulators so that re-running this cell does not append a
# second copy of every fold's metrics to lists created in an earlier cell.
score_results, score2_results, f1_results, recall_results = [], [], [], []

# Cross-validate a default logistic regression once per fold count and pool
# the per-fold metrics of all settings into the same lists.
for n_folds in nfolds:
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    for train_index, test_index in skf.split(X, y):
        clf = LogisticRegression(random_state=random_seed)
        clf.fit(X[train_index], y[train_index])
        y_pred = clf.predict(X[test_index])
        # labels=[0, 1] pins the matrix to 2x2, so the four-way unpacking
        # cannot fail when a fold happens to contain a single class only.
        tn, fp, fn, tp = confusion_matrix(y[test_index], y_pred, labels=[0, 1]).ravel()
        score_results.append(score_function(tp,fp,fn,tn))
        score2_results.append(dmc_profit(y[test_index], y_pred))
        f1_results.append(f1_score(y[test_index], y_pred))
        recall_results.append(recall_score(y[test_index], y_pred))

In [None]:
# Mean of each metric over every fold of every fold-count setting, in the
# order: score_function, dmc_profit, F1, recall.
tuple(
    np.array(fold_values).mean()
    for fold_values in (score_results, score2_results, f1_results, recall_results)
)

In [None]:
# Evaluate the same default logistic regression with the second scoring helper
# from utils.ScoreFunction.
# NOTE(review): the helper's implementation is not visible here; presumably it
# produces the alternative score quoted in the baseline summary - confirm.
other_score_function(X,y,LogisticRegression(random_state=random_seed))

### Результаты бейзлайна следующие (кросс-валидация на 3,5,7,10 фолдах):
### - Наш скор: 41 евро
### - Не наш скор: 29 евро
### - F1: 0.919
### - recall: 0.902

### Чтобы модель прошла проверку, необходимо побить ВСЕ эти параметры