In [None]:
%%capture
# Install LightAutoML; the %%capture cell magic above hides pip's output.
# NOTE(review): the version is not pinned, and `%pip` would target the
# running kernel's environment more reliably than `!pip`.
!pip install lightautoml

In [None]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task


In [None]:
# Read the training table from the mounted drive.
TRAIN_CSV_PATH = '/content/drive/MyDrive/Pochta/train_dataset_train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Display the training frame as loaded, before any encoding.
df

In [None]:
# Replace `is_in_yandex` with its integer category codes.
# The original cell also evaluated `.astype('category').cat.codes` and threw
# the result away; that dead statement is removed. The stored column is the same.
df["is_in_yandex"] = pd.Categorical(df["is_in_yandex"]).codes

In [None]:
# Replace `is_return` with its integer category codes.
# The original cell also evaluated `.astype('category').cat.codes` and threw
# the result away; that dead statement is removed. The stored column is the same.
df["is_return"] = pd.Categorical(df["is_return"]).codes

In [None]:
# Display the training frame again, now with the two flag columns encoded.
df

In [None]:
def op_t(x):
    """Return the part of ``x`` that precedes the first underscore.

    Parameters
    ----------
    x : str
        Combined value of the form ``'<type>_<attr>'``.

    Returns
    -------
    str
        Text before the first ``'_'``. If ``x`` has no underscore, the whole
        string is returned unchanged.
    """
    # The previous slice `x[:x.find('_')]` became `x[:-1]` when no underscore
    # was present and silently dropped the last character.
    return x.partition('_')[0]
def op_a(x):
    """Return the part of ``x`` that follows the first underscore.

    If ``x`` contains no underscore, the whole string is returned unchanged.
    """
    _, separator, tail = x.partition('_')
    if separator:
        return tail
    return x

In [None]:
# Split the combined 'oper_type + oper_attr' text into two integer columns:
# the part before the first underscore and the part after it.
train_oper = df['oper_type + oper_attr']
df['op_t'] = train_oper.map(op_t).astype('int64')
df['op_a'] = train_oper.map(op_a).astype('int64')

In [None]:
# Run configuration shared by the cells below.
N_THREADS = 32  # thread count for torch, AutoML cpu_limit and reader n_jobs
N_FOLDS = 5  # passed to the reader as `cv`
RANDOM_STATE = 42  # seed for numpy, the train/test split and the reader
TEST_SIZE = 0.2  # fraction of rows held out by train_test_split
TIMEOUT = 900*4 # AutoML time budget: 3600 seconds, i.e. 60 minutes
TARGET_NAME = 'label'  # target column name used in `roles`

In [None]:
# Seed every random number generator this notebook has imported so that a
# Restart & Run All reproduces the same results. torch was previously left
# unseeded although it is imported and configured here.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Cap the number of threads torch may use.
torch.set_num_threads(N_THREADS)

In [None]:
# Hold out TEST_SIZE of the rows for local evaluation.
# Stratifying on the binary target keeps the class ratio identical in both
# parts, so hold-out metrics are not skewed by a chance imbalance in the split.
tr_data, te_data = train_test_split(
    df,
    test_size=TEST_SIZE,
    stratify=df[TARGET_NAME],
    random_state=RANDOM_STATE,
)

print(f'Data splitted. Parts sizes: tr_data = {tr_data.shape}, te_data = {te_data.shape}')

tr_data.head()

In [None]:
# Binary classification task, optimised and scored with log-loss.
task = Task(
    'binary',
    loss='logloss',
    metric='logloss',
)

In [None]:
# Column roles for AutoML: the target column and the columns to drop.
roles = dict(
    target=TARGET_NAME,
    drop=['id', 'name_mfi', 'oper_type + oper_attr', 'index_oper'],
)

In [None]:
# Tabular AutoML preset configured from the constants defined above.
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params=dict(n_jobs=N_THREADS, cv=N_FOLDS, random_state=RANDOM_STATE),
)

In [None]:
%%time 
# Fit the AutoML pipeline on the training part (the cell is timed by %%time).
# `oof_pred` presumably holds out-of-fold predictions for tr_data, going by its
# name; it is not used by any later cell shown here.
oof_pred = automl.fit_predict(tr_data, roles = roles, verbose = 1)

In [None]:
# Display the training part after fitting.
tr_data

In [None]:
# NOTE(review): scratch arithmetic; the result is not assigned or used by the
# pipeline. Candidate for removal.
5+5

In [None]:
# Score the held-out part with the fitted model.
te_pred = automl.predict(te_data)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC of the hold-out scores (first prediction column) against the labels.
roc_auc_score(
    y_true=te_data['label'].values,
    y_score=te_pred.data[:, 0],
)

In [None]:
# Class counts in the hold-out labels.
te_data['label'].value_counts()

In [None]:
# NOTE(review): hard-coded counts, presumably copied from a value_counts()
# output. Confirm their source or compute this from the data instead.
1 - 34330/1165670

In [None]:
%%time

# Fast feature importances calculation
fast_fi = automl.get_feature_scores('fast')
fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (30, 10), grid = True)

In [None]:
# NOTE(review): scratch arithmetic; the result is not assigned or used by the
# pipeline. Candidate for removal.
5**9

In [None]:
# Read the test table from the mounted drive.
TEST_CSV_PATH = '/content/drive/MyDrive/Pochta/test_dataset_test.csv'
test_df = pd.read_csv(TEST_CSV_PATH)

In [None]:
# Display the test frame as loaded, before any encoding.
test_df

In [None]:
# Replace both flag columns of the test set with integer category codes.
# The original cell repeated the same three statements per column, one of
# which computed the codes and discarded them; the loop keeps only the
# statements that have an effect. The stored columns are the same.
for flag_col in ("is_in_yandex", "is_return"):
    test_df[flag_col] = pd.Categorical(test_df[flag_col]).codes

# NOTE(review): the codes are derived from the values present in test_df
# alone. They match the training encoding only if both files contain the
# same set of values in these columns. Verify.


In [None]:
# Split the combined 'oper_type + oper_attr' text of the test set into two
# integer columns: the part before the first underscore and the part after it.
test_oper = test_df['oper_type + oper_attr']
test_df['op_t'] = test_oper.map(op_t).astype('int64')
test_df['op_a'] = test_oper.map(op_a).astype('int64')

In [None]:
# Display the test frame after encoding and feature derivation.
test_df

In [None]:
# Score the rows of the test file with the fitted model.
end_test_pred = automl.predict(test_df)

In [None]:
# Display the prediction object returned for the test file.
end_test_pred

In [None]:
from sklearn.metrics import f1_score,recall_score,precision_score,accuracy_score


In [None]:


# Sweep decision thresholds 0.00, 0.05, ..., 0.95 over the hold-out scores
# and tabulate classification metrics for each threshold.
thresholds = np.arange(20) / 20
y_true = te_data['label'].values   # loop-invariant, so fetched once
y_score = te_pred.data[:, 0]

records = []
for alpha in thresholds:
    # Hard 0/1 prediction at this threshold.
    y_pred = (y_score >= alpha).astype(int)

    # Confusion-matrix cells.
    fp = np.sum((y_pred == 1) & (y_true == 0))
    tp = np.sum((y_pred == 1) & (y_true == 1))
    fn = np.sum((y_pred == 0) & (y_true == 1))
    tn = np.sum((y_pred == 0) & (y_true == 0))

    fpr = fp / (fp + tn)
    tpr = tp / (tp + fn)

    records.append({
        'alpha': alpha,
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'accuracy': accuracy_score(y_true, y_pred),
        # Not the ROC AUC of the scores: (1 + TPR - FPR) / 2 is the area under
        # the two-segment ROC curve of this single threshold, which equals the
        # mean of TPR and TNR.
        'roc_auc': (1 + tpr - fpr) / 2,
    })

# Build the table once instead of appending to a DataFrame row by row.
metric = pd.DataFrame(
    records,
    columns=['alpha', 'precision', 'recall', 'f1', 'accuracy', 'roc_auc'],
)



In [None]:
# Display the per-threshold metric table.
metric

In [None]:
# NOTE(review): the same `metric` display as the previous cell; duplicate,
# candidate for removal.
metric

In [None]:
# Inspect the raw hold-out prediction array.
# The original cell referenced `te_pred_data`, a name that is never defined
# and raises NameError on a fresh kernel; `te_pred.data` is the array the
# evaluation cells read.
te_pred.data

In [None]:
# Class counts in the hold-out labels (same expression as an earlier cell).
te_data['label'].value_counts()

In [None]:
# Number of test rows whose score is at least 0.1.
# The vectorised NumPy reduction replaces Python's built-in `sum`, which
# iterates the array element by element.
(end_test_pred.data[:, 0] >= 0.1).sum()

In [None]:
# Start an empty submission table with the two output columns.
SUBMISSION_COLUMNS = ['id', 'label']
submission = pd.DataFrame(columns=SUBMISSION_COLUMNS)

In [None]:
# Copy the row identifiers from the test file, in their original order.
submission['id'] = test_df['id'].values

In [None]:
# Convert scores to hard 0/1 labels: 1 where the score is at least 0.2.
# NOTE(review): 0.2 is a magic number, presumably picked from the threshold
# sweep above. Confirm and consider naming it as a constant.
submission['label'] = 1*(end_test_pred.data[:,0] >= 0.2)
# Alternative kept for reference: submit the raw scores instead of hard labels.
#submission['label'] = end_test_pred.data[:,0]

In [None]:
# Class counts among the predicted submission labels.
submission['label'].value_counts()

In [None]:
# Write the submission to the working directory, without the index column.
submission.to_csv('submission_4_11_4.csv',index = False)

In [None]:
# Display the submission table that was written.
submission

In [None]:
# Number of test rows whose score is below 0.1.
# The vectorised NumPy reduction replaces Python's built-in `sum`, which
# iterates the array element by element.
(end_test_pred.data[:, 0] < 0.1).sum()

In [None]:
# Scratch check of metric edge cases on a toy label vector: precision when
# every prediction is 0, and recall when every prediction is 1.
# NOTE(review): not part of the pipeline; candidate for removal.
precision_score([0,1,0,1,0,1,1],[0,0,0,0,0,0,0]),recall_score([0,1,0,1,0,1,1],[1,1,1,1,1,1,1])

In [None]:
# NOTE(review): bare name; the cell only displays the function object.
# Leftover from exploration, candidate for removal.
precision_score