# BLEND020
- Automate the Blend Weight Selection
- Only Use 3 Folds

In [2]:
import pandas as pd
import numpy as np
import os
import random
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error

def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
    maes = (y_true-y_pred).abs().groupby(groups).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

def log_mean_absolute_error(y_true, y_pred):
    return np.log(mean_absolute_error(y_true, y_pred))
%matplotlib inline

In [3]:
types = ['1JHC', '2JHH', '1JHN', '2JHN', '2JHC','3JHH','3JHC', '3JHN']

def get_sub_oof(t):
    """
    Grab the sub and oof files for a give type. 
    """
    oof = []
    sub = []
    model_ids = []
    for file in sorted(os.listdir(f'../type_results/{t}/')):
        if ('sub' in file) and ('3folds' in file):
            model_id = file[:4] + file.split('MAE_')[1].replace('L','')
            model_ids.append(model_id)
            df = pd.read_parquet(f'../type_results/{t}/{file}')
            if len(sub) == 0:
                sub = df.rename(columns={'scalar_coupling_constant': model_id})
            else:
                sub[model_id] = df['scalar_coupling_constant']
        elif ('oof' in file) and ('3folds' in file):
            model_id = file[:4] + file.split('MAE_')[1].replace('L','')
            #print(model_id)
            df = pd.read_parquet(f'../type_results/{t}/{file}')
            if len(oof) == 0:
                oof = df.rename(columns={'oof_preds': model_id})
            else:
                oof[model_id] = df['oof_preds']
    sub = sub.loc[sub['type'] == t]
    oof = oof.loc[oof['type'] == t]
    return sub, oof, model_ids

def get_best_weights(oof, l5, t):
    if len(l5) == 1:
        return [1]
    lmae_best = 0
    for x in tqdm(range(0, 5000)):
        ws = []
        for x in range(0, len(l5)):
            if x == len(l5)-1:
                ws.append(random.randint(1, 50))
            else:
                ws.append(random.randint(0, 50))
        idx = 0
        blend = np.zeros(len(oof[l5[0]]))
        for w in ws:
            blend += w * oof[l5[idx]]
            idx += 1
        blend = blend / np.sum(ws)
        lmae = log_mean_absolute_error(oof['scalar_coupling_constant'],
                            blend)
        if lmae < lmae_best:
            lmae_best = lmae
            bws = ws
    print(f'{t}: Best score {lmae:0.5f} - with weights {bws}')
    return bws # Return the best weights

def create_best_blends():
    oofs = []
    subs = []
    for t in types:
        print(f'{t}: Running for type {t}')
        tsub, toof, tmodel_ids = get_sub_oof(t)
        print(f'{t}: Has model names {tmodel_ids}')
        l5 = tmodel_ids[-5:] # Last 5 models
        bws = get_best_weights(toof, l5, t)
        tot_weight = np.sum(bws)
        tsub['scalar_coupling_constant'] = 0
        toof['oof_blend'] = 0
        idx = 0
        for w in bws:
            tsub['scalar_coupling_constant'] += (w * tsub[l5[idx]])
            toof['oof_blend'] += (w * toof[l5[idx]])
            idx += 1
        tsub['scalar_coupling_constant'] /= tot_weight
        toof['oof_blend'] /= tot_weight
        tsub = tsub[['id','molecule_name','type','scalar_coupling_constant']].copy()
        toof = toof[['id','type','scalar_coupling_constant','oof_blend']].copy()
        subs.append(tsub)
        oofs.append(toof)
    final_oof = pd.concat(oofs).sort_values('id').reset_index(drop=True)
    final_sub = pd.concat(subs).sort_values('id').reset_index(drop=True)
    return final_oof, final_sub
    
final_oof, final_sub = create_best_blends()

1JHC: Running for type 1JHC


  0%|          | 3/5000 [00:00<03:11, 26.16it/s]

1JHC: Has model names ['M047-0.7977', 'M048-0.8258', 'M049-0.791']


100%|██████████| 5000/5000 [02:44<00:00, 30.48it/s]


1JHC: Best score -0.93463 - with weights [30, 33, 31]
2JHH: Running for type 2JHH


  0%|          | 5/5000 [00:00<02:02, 40.87it/s]

2JHH: Has model names ['M031-1.869', 'M032-1.824', 'M032-1.823', 'M047-2.209', 'M048-2.282', 'M049-2.155']


100%|██████████| 5000/5000 [02:05<00:00, 39.73it/s]


2JHH: Best score -2.32185 - with weights [3, 0, 24, 33, 19]
1JHN: Running for type 1JHN


  0%|          | 11/5000 [00:00<00:45, 108.76it/s]

1JHN: Has model names ['M047-1.116', 'M048-1.089', 'M049-1.36', 'M049-1.361']


100%|██████████| 5000/5000 [00:44<00:00, 112.10it/s]


1JHN: Best score -1.37869 - with weights [31, 17, 50, 47]
2JHN: Running for type 2JHN


  0%|          | 8/5000 [00:00<01:09, 72.32it/s]

2JHN: Has model names ['M031-2.04', 'M032-1.962', 'M047-2.12', 'M048-2.082', 'M049-2.117']


100%|██████████| 5000/5000 [01:08<00:00, 73.09it/s]


2JHN: Best score -2.25627 - with weights [38, 13, 41, 28, 49]
2JHC: Running for type 2JHC


  0%|          | 3/5000 [00:00<03:34, 23.26it/s]

2JHC: Has model names ['M047-1.75', 'M048-1.737', 'M049-1.723']


100%|██████████| 5000/5000 [03:24<00:00, 24.44it/s]


2JHC: Best score -1.83025 - with weights [50, 41, 48]
3JHH: Running for type 3JHH


  0%|          | 4/5000 [00:00<02:30, 33.18it/s]

3JHH: Has model names ['M047-2.1', 'M048-2.086', 'M049-2.098']


100%|██████████| 5000/5000 [02:24<00:00, 34.53it/s]


3JHH: Best score -2.22940 - with weights [22, 18, 25]
3JHC: Running for type 3JHC
3JHC: Has model names ['M047-1.612']
3JHN: Running for type 3JHN


  0%|          | 8/5000 [00:00<01:02, 79.86it/s]

3JHN: Has model names ['M047-2.356', 'M048-2.321', 'M049-2.156']


100%|██████████| 5000/5000 [01:03<00:00, 78.85it/s]


3JHN: Best score -2.43677 - with weights [49, 36, 20]


# Score

In [11]:
for i, d in final_oof.groupby('type'):
    score = mean_absolute_error(d['scalar_coupling_constant'], d['oof_blend'])
    lscore = np.log(mean_absolute_error(d['scalar_coupling_constant'], d['oof_blend']))
    print(i,'\t {:0.5f} \t {:0.5f}'.format(score, lscore))
glmae = group_mean_log_mae(final_oof['scalar_coupling_constant'], final_oof['oof_blend'], final_oof['type'])
print('\nGroup LMAE')
print('{:0.5f}'.format(glmae))

1JHC 	 0.38888 	 -0.94447
1JHN 	 0.23374 	 -1.45353
2JHC 	 0.15515 	 -1.86339
2JHH 	 0.08927 	 -2.41613
2JHN 	 0.09968 	 -2.30575
3JHC 	 0.19953 	 -1.61178
3JHH 	 0.10607 	 -2.24364
3JHN 	 0.08742 	 -2.43704

Group LMAE
-1.90947


# Validate Sub

In [5]:
sub_good = pd.read_csv('../submissions/BLEND016_sub_-1.90795CV.csv')
final_sub['good_scc'] = sub_good['scalar_coupling_constant']
final_sub[['scalar_coupling_constant','good_scc']].corr()

Unnamed: 0,scalar_coupling_constant,good_scc
scalar_coupling_constant,1.0,0.999997
good_scc,0.999997,1.0


# Create Submission and OOF Files

In [6]:
import json
import os.path
import re
import ipykernel
import requests

#try:  # Python 3
#    from urllib.parse import urljoin
#except ImportError:  # Python 2
#    from urlparse import urljoin

# Alternative that works for both Python 2 and 3:
from requests.compat import urljoin

try:  # Python 3 (see Edit2 below for why this may not work in Python 2)
    from notebook.notebookapp import list_running_servers
except ImportError:  # Python 2
    import warnings
    from IPython.utils.shimmodule import ShimWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ShimWarning)
        from IPython.html.notebookapp import list_running_servers


def get_notebook_name():
    """
    Return the full path of the jupyter notebook.
    """
    kernel_id = re.search('kernel-(.*).json',
                          ipykernel.connect.get_connection_file()).group(1)
    servers = list_running_servers()
    for ss in servers:
        response = requests.get(urljoin(ss['url'], 'api/sessions'),
                                params={'token': ss.get('token', '')})
        for nn in json.loads(response.text):
            if nn['kernel']['id'] == kernel_id:
                relative_path = nn['notebook']['path']
                return os.path.join(ss['notebook_dir'], relative_path)

In [7]:
BLEND_NUMBER = get_notebook_name().split('/')[-1].replace('.ipynb','').replace('-','')
print(f'NAME TO SAVE {BLEND_NUMBER}')

NAME TO SAVE BLEND020


In [8]:
# Save the Results
final_sub[['id','scalar_coupling_constant']].to_csv(f'../submissions/{BLEND_NUMBER}_sub_{glmae:0.5f}CV.csv', index=False)
final_oof.to_csv(f'../oof/{BLEND_NUMBER}_oof_{glmae:0.5f}CV.csv', index=False)