# BLEND051
- Add keras model

In [2]:
import pandas as pd
import numpy as np
import os
import random
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error

def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
    maes = (y_true-y_pred).abs().groupby(groups).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

def log_mean_absolute_error(y_true, y_pred):
    return np.log(mean_absolute_error(y_true, y_pred))
%matplotlib inline

In [3]:
types = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']

def get_sub_oof(t):
    """
    Grab the sub and oof files for a give type. 
    """
    oof = []
    sub = []
    model_ids = []
    for file in sorted(os.listdir(f'../type_results/{t}/')):
        if ('sub' in file) and ('3folds' in file):
            model_id = file[:4] + file.split('MAE_')[1].replace('L','')
            model_ids.append(model_id)
            df = pd.read_parquet(f'../type_results/{t}/{file}')
            df.index = df.id.values
            if len(sub) == 0:
                sub = df.rename(columns={'scalar_coupling_constant': model_id})
            else:
                sub[model_id] = df['scalar_coupling_constant']
        elif ('oof' in file) and ('3folds' in file):
            model_id = file[:4] + file.split('MAE_')[1].replace('L','')
            #print(model_id)
            df = pd.read_parquet(f'../type_results/{t}/{file}')
            df.index = df.id.values
            if len(oof) == 0:
                oof = df.rename(columns={'oof_preds': model_id})
            else:
                oof[model_id] = df['oof_preds']
    for file in sorted(os.listdir(f'../type_results/{t}/old/')):
        if ('sub' in file) and ('3folds' in file):
            model_id = file[:4] + file.split('MAE_')[1].replace('L','')
            model_ids.append(model_id)
            df = pd.read_parquet(f'../type_results/{t}/old/{file}')
            df.index = df.id.values
            if len(sub) == 0:
                sub = df.rename(columns={'scalar_coupling_constant': model_id})
            else:
                sub[model_id] = df['scalar_coupling_constant']
        elif ('oof' in file) and ('3folds' in file):
            model_id = file[:4] + file.split('MAE_')[1].replace('L','')
            #print(model_id)
            df = pd.read_parquet(f'../type_results/{t}/old/{file}')
            df.index = df.id.values
            if len(oof) == 0:
                oof = df.rename(columns={'oof_preds': model_id})
            else:
                oof[model_id] = df['oof_preds']

    sub = sub.loc[sub['type'] == t]
    oof = oof.loc[oof['type'] == t]
    return sub, oof, model_ids

def get_best_weights(oof, l5, t):
    if len(l5) == 1:
        return [1]
    lmae_best = 0
    for x in tqdm(range(0, 10000)):
        ws = []
        for x in range(0, len(l5)):
            if x == len(l5)-1:
                ws.append(random.randint(1, 20))
            else:
                ws.append(random.randint(0, 20))
        idx = 0
        blend = np.zeros(len(oof[l5[0]]))
        for w in ws:
            blend += w * oof[l5[idx]]
            idx += 1
        blend = blend / np.sum(ws)
        #print(blend.head())
        lmae = log_mean_absolute_error(oof['scalar_coupling_constant'],
                            blend)
        if lmae < lmae_best:
            lmae_best = lmae
            bws = ws
    print(f'{t}: ===== Best score {lmae_best:0.5f} - with weights {bws}')
    return bws # Return the best weights

def create_best_blends():
    oofs = []
    subs = []
    for t in types:
        print(f'{t}: Running for type {t}')
        tsub, toof, tmodel_ids = get_sub_oof(t)
        print(f'{t}: Has model names {tmodel_ids}')
        #l5 = tmodel_ids[-5:] # Last 5 models
        bws = get_best_weights(toof, tmodel_ids, t)
        tot_weight = np.sum(bws)
        tsub['scalar_coupling_constant'] = 0
        toof['oof_blend'] = 0
        idx = 0
        for w in bws:
            tsub['scalar_coupling_constant'] += (w * tsub[tmodel_ids[idx]])
            toof['oof_blend'] += (w * toof[tmodel_ids[idx]])
            idx += 1
        tsub['scalar_coupling_constant'] /= tot_weight
        toof['oof_blend'] /= tot_weight
        tsub = tsub[['id','type','scalar_coupling_constant']].copy()
        toof = toof[['id','type','scalar_coupling_constant','oof_blend']].copy()
        subs.append(tsub)
        oofs.append(toof)
    final_oof = pd.concat(oofs).sort_values('id').reset_index(drop=True)
    final_sub = pd.concat(subs).sort_values('id').reset_index(drop=True)
    return final_oof, final_sub

final_oof, final_sub = create_best_blends()
print('Done')

1JHC: Running for type 1JHC


  0%|          | 1/10000 [00:00<18:47,  8.87it/s]

1JHC: Has model names ['K004-0.7783', 'M048-0.8258', 'M053-0.8639', 'M055-0.8641', 'M058-0.8972', 'M059-0.9326', 'M047-0.7977', 'M049-0.791']


100%|██████████| 10000/10000 [04:46<00:00, 34.74it/s]


1JHC: ===== Best score -1.00995 - with weights [1, 6, 10, 9, 1, 17, 11, 9]
1JHN: Running for type 1JHN


  0%|          | 7/10000 [00:00<02:22, 69.97it/s]

1JHN: Has model names ['K004-1.214', 'M049-1.361', 'M056-1.424', 'M058-1.214', 'M501-1.424', 'M502-1.46', 'M047-1.116', 'M053-1.265', 'M055-1.26']


100%|██████████| 10000/10000 [02:22<00:00, 69.99it/s]


1JHN: ===== Best score -1.63135 - with weights [7, 7, 15, 2, 10, 15, 2, 5, 4]
2JHC: Running for type 2JHC


  0%|          | 3/10000 [00:00<06:09, 27.07it/s]

2JHC: Has model names ['K004-1.668', 'M047-1.75', 'M048-1.737', 'M049-1.723', 'M055-1.803', 'M501-1.837', 'M054-1.443']


100%|██████████| 10000/10000 [05:59<00:00, 27.85it/s]


2JHC: ===== Best score -1.93970 - with weights [2, 13, 2, 9, 18, 18, 1]
2JHH: Running for type 2JHH


  0%|          | 4/10000 [00:00<05:00, 33.29it/s]

2JHH: Has model names ['K004-2.145', 'M048-2.282', 'M052-2.301', 'M055-2.355', 'M058-2.42', 'M059-2.458', 'M501-2.377', 'M031-1.869', 'M032-1.824', 'M032-1.823', 'M047-2.209', 'M054-2.121']


100%|██████████| 10000/10000 [04:52<00:00, 34.66it/s]


2JHH: ===== Best score -2.55182 - with weights [1, 7, 1, 4, 18, 20, 19, 0, 1, 0, 13, 1]
2JHN: Running for type 2JHN


  0%|          | 5/10000 [00:00<03:37, 45.90it/s]

2JHN: Has model names ['K004-2.105', 'M053-2.118', 'M055-2.197', 'M056-2.173', 'M501-2.246', 'M031-2.04', 'M032-1.962', 'M047-2.12', 'M048-2.082', 'M049-2.117', 'M050-2.09', 'M054-2.075']


100%|██████████| 10000/10000 [03:29<00:00, 47.68it/s]


2JHN: ===== Best score -2.40411 - with weights [11, 11, 18, 2, 20, 15, 3, 3, 6, 2, 15, 3]
3JHC: Running for type 3JHC


  0%|          | 4/10000 [00:00<04:56, 33.69it/s]

3JHC: Has model names ['M047-1.612', 'M049-1.606', 'M055-1.7', 'M501-1.753', 'M054-1.322']


100%|██████████| 10000/10000 [04:59<00:00, 30.61it/s]


3JHC: ===== Best score -1.83560 - with weights [6, 9, 13, 20, 1]
3JHH: Running for type 3JHH


  0%|          | 4/10000 [00:00<04:30, 36.93it/s]

3JHH: Has model names ['K004-1.937', 'M047-2.1', 'M053-2.1', 'M055-2.208', 'M501-2.257', 'M048-2.086', 'M049-2.098', 'M054-1.905']


100%|██████████| 10000/10000 [04:17<00:00, 38.93it/s]


3JHH: ===== Best score -2.36420 - with weights [5, 2, 4, 11, 18, 3, 6, 1]
3JHN: Running for type 3JHN


  0%|          | 6/10000 [00:00<02:48, 59.46it/s]

3JHN: Has model names ['K004-2.284', 'M050-2.383', 'M052-2.39', 'M056-2.4', 'M501-2.416', 'M047-2.356', 'M048-2.321', 'M054-2.277', 'M055-2.363']


100%|██████████| 10000/10000 [02:57<00:00, 56.30it/s]


3JHN: ===== Best score -2.57323 - with weights [7, 17, 0, 14, 19, 6, 1, 0, 6]
Done


## Score

In [5]:
for i, d in final_oof.groupby('type'):
    score = mean_absolute_error(d['scalar_coupling_constant'], d['oof_blend'])
    lscore = np.log(mean_absolute_error(d['scalar_coupling_constant'], d['oof_blend']))
    print(i,'\t {:0.5f} \t {:0.5f}'.format(score, lscore))
glmae = group_mean_log_mae(final_oof['scalar_coupling_constant'], final_oof['oof_blend'], final_oof['type'])
print('\nGroup LMAE')
print('{:0.5f}'.format(glmae))

1JHC 	 0.36424 	 -1.00995
1JHN 	 0.19566 	 -1.63135
2JHC 	 0.14375 	 -1.93970
2JHH 	 0.07794 	 -2.55182
2JHN 	 0.09035 	 -2.40411
3JHC 	 0.15952 	 -1.83560
3JHH 	 0.09402 	 -2.36420
3JHN 	 0.07629 	 -2.57323

Group LMAE
-2.03874


# Validate Sub

In [8]:
sub_good = pd.read_csv('../submissions/BLEND044_sub_-2.03355CV.csv')
final_sub['good_scc'] = sub_good['scalar_coupling_constant']
final_sub[['scalar_coupling_constant','good_scc']].corr()

Unnamed: 0,scalar_coupling_constant,good_scc
scalar_coupling_constant,1.0,1.0
good_scc,1.0,1.0


In [9]:
print('MAE vs Good Sub')
for i, d in final_sub.groupby('type'):
    mae_t = mean_absolute_error(d['good_scc'], d['scalar_coupling_constant'])
    corr_t = np.corrcoef(d['good_scc'], d['scalar_coupling_constant'])[1][0]
    print(f'{i} - {mae_t:0.5f} - {corr_t:0.5f}')

MAE vs Good Sub
1JHC - 0.00886 - 1.00000
1JHN - 0.02600 - 0.99999
2JHC - 0.00899 - 0.99999
2JHH - 0.01232 - 0.99999
2JHN - 0.00865 - 0.99999
3JHC - 0.00654 - 0.99999
3JHH - 0.00997 - 0.99999
3JHN - 0.00476 - 0.99998


# Create Submission and OOF Files

In [10]:
import json
import os.path
import re
import ipykernel
import requests

#try:  # Python 3
#    from urllib.parse import urljoin
#except ImportError:  # Python 2
#    from urlparse import urljoin

# Alternative that works for both Python 2 and 3:
from requests.compat import urljoin

try:  # Python 3 (see Edit2 below for why this may not work in Python 2)
    from notebook.notebookapp import list_running_servers
except ImportError:  # Python 2
    import warnings
    from IPython.utils.shimmodule import ShimWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ShimWarning)
        from IPython.html.notebookapp import list_running_servers


def get_notebook_name():
    """
    Return the full path of the jupyter notebook.
    """
    kernel_id = re.search('kernel-(.*).json',
                          ipykernel.connect.get_connection_file()).group(1)
    servers = list_running_servers()
    for ss in servers:
        response = requests.get(urljoin(ss['url'], 'api/sessions'),
                                params={'token': ss.get('token', '')})
        for nn in json.loads(response.text):
            if nn['kernel']['id'] == kernel_id:
                relative_path = nn['notebook']['path']
                return os.path.join(ss['notebook_dir'], relative_path)

In [12]:
BLEND_NUMBER = get_notebook_name().split('/')[-1].replace('.ipynb','').replace('-','')
print(f'NAME TO SAVE {BLEND_NUMBER}')

NAME TO SAVE BLEND051


In [13]:
# Save the Results
final_sub[['id','scalar_coupling_constant']].to_csv(f'../submissions/{BLEND_NUMBER}_sub_{glmae:0.5f}CV.csv', index=False)
final_oof.to_csv(f'../oof/{BLEND_NUMBER}_oof_{glmae:0.5f}CV.csv', index=False)