# BLEND017
July 21st Best

In [1]:
import pandas as pd
import numpy as np
import os
import random
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error

def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
    maes = (y_true-y_pred).abs().groupby(groups).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

def log_mean_absolute_error(y_true, y_pred):
    return np.log(mean_absolute_error(y_true, y_pred))
%matplotlib inline

In [23]:
types = ['1JHC', '2JHH', '1JHN', '2JHN', '2JHC','3JHH','3JHC', '3JHN']

def get_sub_oof(t):
    oof = []
    sub = []
    model_ids = []
    for file in sorted(os.listdir(f'../type_results/{t}/')):
        if 'sub' in file:
            model_id = file[:4] + file.split('MAE_')[1].replace('L','')
            model_ids.append(model_id)
            df = pd.read_parquet(f'../type_results/{t}/{file}')
            if len(sub) == 0:
                sub = df.rename(columns={'scalar_coupling_constant': model_id})
            else:
                sub[model_id] = df['scalar_coupling_constant']
        elif 'oof' in file:
            model_id = file[:4] + file.split('MAE_')[1].replace('L','')
            #print(model_id)
            df = pd.read_parquet(f'../type_results/{t}/{file}')
            if len(oof) == 0:
                oof = df.rename(columns={'oof_preds': model_id})
            else:
                oof[model_id] = df['oof_preds']
    return sub, oof, model_ids

def get_best_weights(oof, l5):
    lmae_best = 0
    for x in tqdm(range(0, 1000)):
        w1 = random.randint(0, 50)
        w2 = random.randint(0, 50)
        w3 = random.randint(0, 50)
        w4 = random.randint(0, 50)
        w5 = random.randint(1, 50)
        blend = (\
            w1 * oof[l5[0]] + \
            w2 * oof[l5[1]] + \
            w3 * oof[l5[2]] + \
            w4 * oof[l5[3]] + \
            w5 * oof[l5[4]]) / (w1 + w2 + w3 + w4 + w5)
        lmae = log_mean_absolute_error(oof['scalar_coupling_constant'],
                            blend)
        if lmae < lmae_best:
            lmae_best = lmae
            bw1 = w1
            bw2 = w2
            bw3 = w3
            bw4 = w4
            bw5 = w5
    print(f'Best score {lmae:0.5f} - with weights {w1} {w2} {w3} {w4} {w5}')
    return bw1, bw2, bw3, bw4, bw5 # Return the best weights

def create_best_blends():
    oofs = []
    subs = []
    for t in types:
        print(f'Running for type {t}')
        tsub, toof, tmodel_ids = get_sub_oof(t)
        print(f'Has model names {tmodel_ids}')
        l5 = tmodel_ids[-5:] # Last 5 models
        bw1, bw2, bw3, bw4, bw5 = get_best_weights(toof, l5)
        tot_weight = np.sum([bw1, bw2, bw3, bw4, bw5])
        tsub['scalar_coupling_constant'] = (bw1 * tsub[l5[0]] + bw2 * tsub[l5[1]]
                                           + bw3 * tsub[l5[2]] + bw4 * tsub[l5[3]]
                                           + bw5 * tsub[l5[4]]) / tot_weight 
        toof['oof_blend'] = (bw1 * toof[l5[0]] + bw2 * toof[l5[1]]
                                           + bw3 * toof[l5[2]] + bw4 * toof[l5[3]]
                                           + bw5 * toof[l5[4]]) / tot_weight
        tsub = tsub[['id','molecule_name','type','scalar_coupling_constant']] 
        toof = toof[['id','type','scalar_coupling_constant','oof_blend']]
        subs.append(tsub)
        oofs.append(toof)
    final_oof = pd.concat(oofs).sort_values('id').reset_index(drop=True)
    final_sub = pd.concat(subs).sort_values('id').reset_index(drop=True)
    return final_oof, final_sub
    
final_oof, final_sub = create_best_blends()

Running for type 1JHC


  0%|          | 3/1000 [00:00<00:36, 27.20it/s]

Has model names ['M037-0.5176', 'M038-0.532', 'M040-0.5477', 'M044-0.6206', 'M045-0.6455', 'M046-0.698', 'M047-0.7977', 'M048-0.8258']


100%|██████████| 1000/1000 [00:33<00:00, 29.60it/s]


Best score -0.87014 - with weights 12 10 19 42 2
Running for type 2JHH


  0%|          | 5/1000 [00:00<00:23, 42.14it/s]

Has model names ['M031-1.869', 'M032-1.824', 'M032-1.823', 'M036-1.898', 'M037-1.938', 'M038-1.963', 'M039-1.938', 'M040-1.951', 'M042-1.946', 'M046-2.078', 'M047-2.209', 'M048-2.282', 'M049-2.155']


100%|██████████| 1000/1000 [00:23<00:00, 43.56it/s]


Best score -2.40627 - with weights 18 0 28 43 50
Running for type 1JHN


  1%|▏         | 13/1000 [00:00<00:08, 120.16it/s]

Has model names ['M035-0.9231', 'M035-0.9557', 'M036-0.1603', 'M036-0.9489', 'M037-0.8456', 'M038-0.9475', 'M039-0.8116', 'M040-0.9207', 'M042-0.9519', 'M046-1.031', 'M047-1.116', 'M048-1.089', 'M049-1.36', 'M049-1.361']


100%|██████████| 1000/1000 [00:08<00:00, 120.62it/s]


Best score -1.39648 - with weights 17 23 29 0 25
Running for type 2JHN


  1%|          | 8/1000 [00:00<00:12, 78.64it/s]

Has model names ['M031-2.04', 'M032-1.962', 'M036-1.928', 'M037-1.953', 'M037-1.953', 'M038-1.979', 'M039-1.949', 'M039-1.941', 'M040-1.942', 'M042-1.926', 'M046-2.04', 'M047-2.12', 'M048-2.082', 'M049-2.117']


100%|██████████| 1000/1000 [00:12<00:00, 78.91it/s]


Best score -2.26036 - with weights 3 21 46 46 11
Running for type 2JHC


  0%|          | 2/1000 [00:00<00:52, 18.84it/s]

Has model names ['M037-1.49', 'M039-1.452', 'M040-1.518', 'M041-1.449', 'M046-1.631', 'M047-1.75', 'M048-1.737']


100%|██████████| 1000/1000 [00:49<00:00, 20.08it/s]


Best score -1.77910 - with weights 24 21 16 31 1
Running for type 3JHH


  0%|          | 4/1000 [00:00<00:30, 32.18it/s]

Has model names ['M037-1.873', 'M038-1.899', 'M039-1.872', 'M040-1.897', 'M042-1.881', 'M046-1.978', 'M047-2.1', 'M048-2.086', 'M049-2.098']


100%|██████████| 1000/1000 [00:31<00:00, 32.04it/s]


Best score -2.24877 - with weights 25 2 35 20 39
Running for type 3JHC


  0%|          | 2/1000 [00:00<00:58, 17.14it/s]

Has model names ['M037-1.328', 'M039-1.32', 'M040-1.403', 'M042-1.403', 'M045-1.423', 'M046-1.507', 'M047-1.612']


100%|██████████| 1000/1000 [00:53<00:00, 18.80it/s]


Best score -1.62928 - with weights 38 39 36 43 28
Running for type 3JHN


  1%|          | 8/1000 [00:00<00:13, 71.84it/s]

Has model names ['M037-2.14', 'M038-2.165', 'M039-2.152', 'M039-2.155', 'M040-2.156', 'M041-2.087', 'M046-2.25', 'M047-2.356', 'M048-2.321', 'M049-2.156']


100%|██████████| 1000/1000 [00:14<00:00, 69.00it/s]


Best score -2.37631 - with weights 21 2 7 10 8


# Score

In [26]:
for i, d in final_oof.groupby('type'):
    print(i, '{:0.5f}'.format(mean_absolute_error(d['scalar_coupling_constant'], d['oof_blend'])))
glmae = group_mean_log_mae(final_oof['scalar_coupling_constant'], final_oof['oof_blend'], final_oof['type'])
print('\nGroup LMAE')
print('{:0.5f}'.format(glmae))

1JHC 0.39598
1JHN 0.22932
2JHC 0.15638
2JHH 0.08843
2JHN 0.10080
3JHC 0.18470
3JHH 0.10389
3JHN 0.08475

Group LMAE
-1.92452


# Validate Sub

In [27]:
sub_good = pd.read_csv('../submissions/BLEND014_sub_-1.85550CV.csv')
final_sub['good_scc'] = sub_good['scalar_coupling_constant']
final_sub[['scalar_coupling_constant','good_scc']].corr()

Unnamed: 0,scalar_coupling_constant,good_scc
scalar_coupling_constant,1.0,0.999997
good_scc,0.999997,1.0


In [28]:
%%javascript
IPython.notebook.kernel.execute(`notebookName = '${window.document.getElementById("notebook_name").innerHTML}'`);

<IPython.core.display.Javascript object>

# Create Submission and OOF Files

In [34]:
import json
import os.path
import re
import ipykernel
import requests

#try:  # Python 3
#    from urllib.parse import urljoin
#except ImportError:  # Python 2
#    from urlparse import urljoin

# Alternative that works for both Python 2 and 3:
from requests.compat import urljoin

try:  # Python 3 (see Edit2 below for why this may not work in Python 2)
    from notebook.notebookapp import list_running_servers
except ImportError:  # Python 2
    import warnings
    from IPython.utils.shimmodule import ShimWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ShimWarning)
        from IPython.html.notebookapp import list_running_servers


def get_notebook_name():
    """
    Return the full path of the jupyter notebook.
    """
    kernel_id = re.search('kernel-(.*).json',
                          ipykernel.connect.get_connection_file()).group(1)
    servers = list_running_servers()
    for ss in servers:
        response = requests.get(urljoin(ss['url'], 'api/sessions'),
                                params={'token': ss.get('token', '')})
        for nn in json.loads(response.text):
            if nn['kernel']['id'] == kernel_id:
                relative_path = nn['notebook']['path']
                return os.path.join(ss['notebook_dir'], relative_path)

In [39]:
BLEND_NUMBER = get_notebook_name().split('/')[-1].replace('.ipynb','').replace('-','')
print(f'NAME TO SAVE {BLEND_NUMBER}')

NAME TO SAVE BLEND018


In [41]:
# Save the Results
final_sub[['id','scalar_coupling_constant']].to_csv(f'../submissions/{BLEND_NUMBER}_sub_{glmae:0.5f}CV.csv', index=False)
final_oof.to_csv(f'../oof/{BLEND_NUMBER}_oof_{glmae:0.5f}CV.csv', index=False)