In [12]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torchvision import transforms, datasets
from torch.utils.data import Dataset, DataLoader
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, recall_score, confusion_matrix, matthews_corrcoef
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
import xgboost as xgb
from utils import score
import pickle

In [13]:
label = pd.read_csv('E:/Datasets/Fintech/TrainingDataset_first/train_y_answer.csv')
custinfo = pd.read_csv('E:/Datasets/Fintech/TrainingDataset_first/public_train_x_custinfo_full_hashed.csv')
ds = pd.read_csv(f"E:/Datasets/Fintech/AggregatedData/timeSeriesStats.csv")
custs = custinfo.cust_id.unique()

In [14]:
train_ds = ds[ds.alert_key.isin(label.alert_key)]
pub_ds = ds[~ds.alert_key.isin(label.alert_key)]

Add label

In [15]:
sar_flags = []
for i in range(len(train_ds)):
    ak = train_ds.iloc[i]['alert_key']
    sar_flag = label[label.alert_key==ak].sar_flag.item()
    sar_flags.append(sar_flag)
train_ds['sar_flag'] = sar_flags

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
X = train_ds.iloc[:,:-2].values
y = train_ds.iloc[:,-1].values

In [17]:
pca = PCA()
sc = StandardScaler()
sc.fit(X)
X_std = sc.transform(X)
pca.fit(X_std)

PCA()

RFE Feature selection

In [18]:
skf = StratifiedKFold(n_splits=5)
feat_freq = np.zeros(train_ds.iloc[:,:-2].values.shape[1])
X = train_ds.iloc[:,:-2].values
y = train_ds.iloc[:,-1].values
for idx, (train_idx, val_idx) in enumerate(skf.split(X,y)):
    train_ds_cv = train_ds.iloc[train_idx,:]
    val_ds_cv = train_ds.iloc[val_idx,:]
    X_train = train_ds_cv.iloc[:,:-2].values
    X_test = val_ds_cv.iloc[:,:-2].values
    y_train = train_ds_cv.iloc[:,-1].values
    y_test = val_ds_cv.iloc[:,-1].values
    
    #'''
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)
    X_test = sc.transform(X_test)
    #'''

    for i in range(5):
        estimator = ExtraTreesClassifier()
        selector = RFE(estimator=estimator, step=10, n_features_to_select=100)
        selector.fit(X_train, y_train)
        feat_freq = feat_freq + (0+selector.support_)
top_feats = np.argsort(feat_freq)[-100:]

In [19]:
with open('./tsStats_result/top_feats', 'wb') as f:
    pickle.dump(top_feats, f)

with open('./tsStats_result/top_feats', 'rb') as f:
    top_feats = pickle.load(f)

In [20]:
skf = StratifiedKFold(n_splits=5)

y_true = []
y_pred = []
fold_result = []
for idx, (train_idx, val_idx) in enumerate(skf.split(X,y)):
    train_ds_cv = train_ds.iloc[train_idx,:]
    val_ds_cv = train_ds.iloc[val_idx,:]
    X_train = train_ds_cv.iloc[:,:-2].values
    X_val = val_ds_cv.iloc[:,:-2].values
    y_train = train_ds_cv.iloc[:,-1].values
    y_val = val_ds_cv.iloc[:,-1].values
    ak_train = train_ds_cv.iloc[:,-2:]
    ak_val = val_ds_cv.iloc[:,-2:]
    
    #'''
    sc = StandardScaler()
    sc.fit(X)
    X_train = sc.transform(X_train)
    X_val = sc.transform(X_val)
    #'''

    # X_train = X_train[:,top_feats]
    # X_val = X_val[:,top_feats]
    pca = PCA(n_components=30)
    pca.fit(X_std)
    X_train = pca.transform(X_train)
    X_val = pca.transform(X_val)
     
    if idx==0:
        print(X_train.shape)
        print(X_val.shape)
    
    #clf = xgb.XGBClassifier(max_depth=10, sample_weight=99, verbosity=0)
    #clf = SVC(C=1e+2, class_weight='balanced', kernel='rbf', probability=True)
    clf = RandomForestClassifier(class_weight='balanced')
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_val)
    y_pred = ak_val.iloc[:,0:1] # alert_key
    y_pred['prob'] = y_prob[:,1]
    print(f"Fold-{idx+1}, score: {score(ak_val, y_pred)}")



(17528, 30)
(4382, 30)
Fold-1, score: (0.012605042016806723, 45, 3569)
Fold-2, score: (0.012077294685990338, 45, 3725)
Fold-3, score: (0.01322219028456453, 46, 3478)
Fold-4, score: (0.010856738258201559, 46, 4236)
Fold-5, score: (0.013513513513513514, 46, 3403)


In [11]:
X_train.shape

(17528, 100)

### Upload

In [44]:
X_train = train_ds.iloc[:,:-2].values
y_train = train_ds.iloc[:,-1].values
X_test = pub_ds.iloc[:,:-1].values
y_test = pub_ds.iloc[:,-1].values
ak_test = pub_ds.iloc[:,-1:]

sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

X_train = X_train[:,top_feats]
X_test = X_test[:,top_feats]

clf = SVC(C=1e+0, class_weight='balanced', kernel='rbf', probability=True)
clf.fit(X_train, y_train)
y_prob = clf.predict_proba(X_test)
y_pred = ak_test.iloc[:,0:1] # alert_key
y_pred['prob'] = y_prob[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [45]:
y_pred

Unnamed: 0,alert_key,prob
9,352249.0,0.004724
10,356565.0,0.004971
11,357209.0,0.004791
12,358001.0,0.005006
13,359220.0,0.005022
...,...,...
7136,364977.0,0.005173
7137,364978.0,0.026288
7139,364983.0,0.007697
7140,364986.0,0.025867


In [19]:
output = pd.read_csv('./sample.csv')
output.loc[:,'probability'] = 0

In [21]:
for ak, prob in y_pred.values:
    idx = output[output.alert_key==ak].index
    output.loc[idx, 'probability'] = prob

In [23]:
output.to_csv('./upload/tsStatsV1.csv', index=False)

In [36]:
pd.read_csv('./sample.csv')

Unnamed: 0,alert_key,probability
0,357307,0.000017
1,376329,0.000324
2,373644,0.000372
3,357668,0.000489
4,354443,0.000526
...,...,...
3845,364485,0.997702
3846,363155,0.998987
3847,368710,0.999694
3848,358067,0.999821
