In [1]:
import pandas as pd
import os
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
import numpy as np
def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
    maes = (y_true-y_pred).abs().groupby(groups).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()
%matplotlib inline

test = pd.read_csv('../input/test.csv')

In [13]:
# # Loop through each file. Make sure it's only the type that is saved in the results file
# for fold in tqdm(os.listdir('../type_results/')):
#     for file in os.listdir(f'../type_results/{fold}'):
#         if 'sub' in file:
#             df = df.loc[df['type'] == fold]
#             df.to_parquet(f'../type_results/{fold}/{file}')

In [14]:
def save_res_types_folder(type_, oof, sub, fi, file):
    """
    Saves the oof, sub, and fi files int he type_results folder with correct naming convention
    """
    model_name_runid = file.split('_oof')[0]
    for exist_file in os.listdir(f'../type_results/{type_}/'):
        # Check to see if results already list
        if model_name_runid in exist_file:
            if 'LMAE' is exist_file:
                print('file {exist_file} already exists')
                return

    oof_type = oof.loc[oof['type'] == type_]
    score = mean_absolute_error(oof_type['scalar_coupling_constant'],oof_type['oof_preds'])
    logscore = np.log(score)
    if score > 1:
        print(f'No predictions for {type_}')
        return
    print(f'===== running for type {type_} - mae {score} - log mae {logscore}')
    
    oof_type = oof.loc[oof['type'] == type_]
    
    sub_type = test[['id','molecule_name','type']].merge(sub, on='id')
    sub_type = sub_type.loc[sub_type['type'] == type_]
    if np.sum(sub_type['scalar_coupling_constant'] == 0) > 10:
        print('ERROR! Sub has to many zero predictions')
        return
    expected_len = len(test.loc[test['type'] == type_])
    if expected_len != len(sub_type):
        print("ERRROR LENGTHS NOT THE SAME")
        return
        
    # Name Files and save
    model_type = file.split('_oof')[1].split('_')[1]
    nfolds = [x for x in file.split('_oof')[1].split('_') if 'fold' in x][0]
    niter = [x for x in file.split('_oof')[1].split('_') if 'iter' in x][0]
    lr = [x for x in file.split('_oof')[1].split('_') if 'lr' in x][0].replace('.csv','')
    fn_template = '../type_results/{}/{}_{}_XXXXXXX_{:0.4f}MAE_{:0.4}LMAE_{}_{}_{}_{}.parquet'.format(type_,
                                                                                                      model_name_runid,
                                                                                                      type_,
                                                                                                      score,
                                                                                                      logscore,
                                                                                                      model_type,
                                                                                                      nfolds,
                                                                                                       niter,
                                                                                                       lr)
    sub_name = fn_template.replace('XXXXXXX','submission')
    oof_name = fn_template.replace('XXXXXXX','oof')

    print(sub_name)
    print(oof_name)
    sub_type.to_parquet(sub_name)
    oof_type.to_parquet(oof_name)

    if fi is not None:
        fi_type = fi.loc[fi['type'] == type_]
        fi_name = fn_template.replace('XXXXXXX','fi')
        print(fi_name)
        fi_type.to_parquet(fi_name)

In [17]:
types = ['1JHC', '2JHH', '1JHN', '2JHN', '2JHC','3JHH','3JHC', '3JHN']

for file in tqdm(os.listdir('../oof/')):
    if 'M047' in file:
        print(f'====== Running for file {file}')
        if 'MERGE' not in file:
            print(file)
            oof = pd.read_csv(f'../oof/{file}')
            sub = pd.read_csv('../submissions/{}'.format(file.replace('oof','submission').replace('catboost','lgb')))
            for fi_file in os.listdir('../fi/'):
                if file.split('_oof')[0] in fi_file:
                    #print(f'found it {fi_file}')
                    try:
                        fi = pd.read_csv(f'../fi/{fi_file}')
                    except:
                        print(f'Cant read fi file {fi_file}')
            for type_ in types:
                save_res_types_folder(type_, oof, sub, fi, file)


  0%|          | 0/72 [00:00<?, ?it/s][A

M047_0714_2226_oof_catboost_3folds_-1.7577CV_500000iter_0.1lr.csv
===== running for type 1JHC - mae 0.4503699367015481 - log mae -0.7976859523827422
../type_results/1JHC/M047_0714_2226_1JHC_submission_0.4504MAE_-0.7977LMAE_catboost_3folds_500000iter_0.1lr.parquet
../type_results/1JHC/M047_0714_2226_1JHC_oof_0.4504MAE_-0.7977LMAE_catboost_3folds_500000iter_0.1lr.parquet
../type_results/1JHC/M047_0714_2226_1JHC_fi_0.4504MAE_-0.7977LMAE_catboost_3folds_500000iter_0.1lr.parquet
===== running for type 2JHH - mae 0.10975865602221349 - log mae -2.2094713597751054
../type_results/2JHH/M047_0714_2226_2JHH_submission_0.1098MAE_-2.209LMAE_catboost_3folds_500000iter_0.1lr.parquet
../type_results/2JHH/M047_0714_2226_2JHH_oof_0.1098MAE_-2.209LMAE_catboost_3folds_500000iter_0.1lr.parquet
../type_results/2JHH/M047_0714_2226_2JHH_fi_0.1098MAE_-2.209LMAE_catboost_3folds_500000iter_0.1lr.parquet
===== running for type 1JHN - mae 0.3275648329377529 - log mae -1.1160692803302417
../type_results/1JHN/M047_0


  6%|▌         | 4/72 [00:15<04:29,  3.96s/it][A
100%|██████████| 72/72 [00:15<00:00,  4.55it/s][A

../type_results/3JHN/M047_0714_2226_3JHN_submission_0.0948MAE_-2.356LMAE_catboost_3folds_500000iter_0.1lr.parquet
../type_results/3JHN/M047_0714_2226_3JHN_oof_0.0948MAE_-2.356LMAE_catboost_3folds_500000iter_0.1lr.parquet
../type_results/3JHN/M047_0714_2226_3JHN_fi_0.0948MAE_-2.356LMAE_catboost_3folds_500000iter_0.1lr.parquet


In [None]:
# for folder in tqdm(os.listdir('../temp/')):
#     for file in os.listdir(f'../temp/{folder}/'):
#         if 'M047' in file:
#             if 'oof' in file:
#                 for type_ in types:
#                     oof = pd.read_csv(f'../temp/{folder}/{file}') 
#                     sub = pd.read_csv('../temp/{}/{}'.format(folder, file.replace('oof','submission')))
#                     try:
#                         fi = pd.read_csv('../temp/{}/{}'.format(folder, file.replace('oof','fi'))) 
#                     except:
#                         print('No feature importance')
#                         fi = None
#                     file_new = '_'.join(file.split('_')[1:])
#                     save_res_types_folder(type_, oof, sub, fi, file_new)

In [15]:
# # types = ['1JHC', '2JHH', '1JHN', '2JHN', '2JHC','3JHH','3JHC', '3JHN']
# types = ['3JHN']
# # Pick up where we left off
# for folder in tqdm(os.listdir('../temp/')):
#     for file in os.listdir(f'../temp/{folder}/'):
#         if 'M047' in file:
#             if 'oof' in file:
#                 for type_ in types:
#                     oof = pd.read_csv(f'../temp/{folder}/{file}') 
#                     sub = pd.read_csv('../temp/{}/{}'.format(folder, file.replace('oof','submission')))
#                     try:
#                         fi = pd.read_csv('../temp/{}/{}'.format(folder, file.replace('oof','fi'))) 
#                     except:
#                         print('No feature importance')
#                         fi = None
#                     file_new = '_'.join(file.split('_')[1:])
#                     save_res_types_folder(type_, oof, sub, fi, file_new)

  0%|          | 0/44 [00:00<?, ?it/s]

No predictions for 3JHN
No predictions for 3JHN
No predictions for 3JHN
No predictions for 3JHN
No predictions for 3JHN
No predictions for 3JHN


100%|██████████| 44/44 [00:18<00:00,  2.37it/s]

No predictions for 3JHN





In [6]:
# types = ['1JHC', '2JHH', '1JHN', '2JHN', '2JHC','3JHH','3JHC', '3JHN']

# for file in tqdm(os.listdir('../oof/')):
#     if 'M046' in file:
#         print(f'====== Running for file {file}')
#         if 'MERGE' not in file:
#             print(file)
#             oof = pd.read_csv(f'../oof/{file}')
#             sub = pd.read_csv('../submissions/{}'.format(file.replace('oof','submission').replace('catboost','lgb')))
#             for fi_file in os.listdir('../fi/'):
#                 if file.split('_oof')[0] in fi_file:
#                     #print(f'found it {fi_file}')
#                     try:
#                         fi = pd.read_csv(f'../fi/{fi_file}')
#                     except:
#                         print(f'Cant read fi file {fi_file}')
#             for type_ in types:
#                 save_res_types_folder(type_, oof, sub, fi, file)