In [1]:
import pandas as pd
import logging
import sys

In [2]:
df_struct=pd.read_csv('../input/structures.csv')
df_train_sub_charge=pd.read_csv('../input/mulliken_charges.csv')
df_train_sub_tensor=pd.read_csv('../input/magnetic_shielding_tensors.csv')
train = pd.read_csv('../input/train.csv')

In [3]:
def create_datasets(bond_type, filename_start, filename_end, fold):
    logger.info('Creating Datasets')
    # Read input for first fold
    X_train = pd.read_parquet(f'../type_results/{bond_type}/meta/{filename_start}_X_train_meta_fc_f{fold}{filename_end}') 
    X_test = pd.read_parquet(f'../type_results/{bond_type}/meta/{filename_start}_X_test_meta_fc_f{fold}{filename_end}') 
    X_valid = pd.read_parquet(f'../type_results/{bond_type}/meta/{filename_start}_X_valid_meta_fc_f{fold}{filename_end}') 
    X_train['split'] = 'TRAIN'
    X_test['split'] = 'TEST'
    X_valid['split'] = 'VALID'
    logger.info('Adding target to dataset')
    # Add target to train and val
    X_tr_val = pd.concat([X_train, X_valid])
    X_tr_val = X_tr_val.sort_index()
    X_tr_val['scalar_coupling_constant'] = train.loc[train['type'] == '3JHH']['scalar_coupling_constant'].tolist()
    X_tr_val['molecule_name'] = train.loc[train['type'] == '3JHH']['molecule_name'].tolist()
    X_tr_val['atom_index_0'] = train.loc[train['type'] == '3JHH']['atom_index_0'].tolist()
    X_tr_val['atom_index_1'] = train.loc[train['type'] == '3JHH']['atom_index_1'].tolist()

    # Combine all
    X_all = pd.concat([X_tr_val, X_test])
    logger.info('Adding custom target features')
    for atom_idx in [0,1]:
        X_all = map_atom_info(X_all,df_struct, atom_idx)
        X_all = map_atom_info(X_all,df_train_sub_charge, atom_idx)
        X_all = map_atom_info(X_all,df_train_sub_tensor, atom_idx)
        X_all = X_all.rename(columns={'atom': f'atom_{atom_idx}',
                                            'x': f'x_{atom_idx}',
                                            'y': f'y_{atom_idx}',
                                            'z': f'z_{atom_idx}',
                                            'mulliken_charge': f'charge_{atom_idx}',
                                            'XX': f'XX_{atom_idx}',
                                            'YX': f'YX_{atom_idx}',
                                            'ZX': f'ZX_{atom_idx}',
                                            'XY': f'XY_{atom_idx}',
                                            'YY': f'YY_{atom_idx}',
                                            'ZY': f'ZY_{atom_idx}',
                                            'XZ': f'XZ_{atom_idx}',
                                            'YZ': f'YZ_{atom_idx}',
                                            'ZZ': f'ZZ_{atom_idx}',})
    
    ys_all = X_all[['scalar_coupling_constant',"charge_0","charge_1",
                "XX_0","YY_0","ZZ_0","XX_1","YY_1","ZZ_1","YX_0","ZX_0",
                "XY_0","ZY_0","XZ_0","YZ_0","YX_1","ZX_1","XY_1","ZY_1",
                "XZ_1","YZ_1"]]
    split_all = X_all['split']
    
    X_all = X_all.drop(['scalar_coupling_constant',"charge_0","charge_1",
                "XX_0","YY_0","ZZ_0","XX_1","YY_1","ZZ_1","YX_0","ZX_0",
                "XY_0","ZY_0","XZ_0","YZ_0","YX_1","ZX_1","XY_1","ZY_1",
                "XZ_1","YZ_1"], axis=1)
    
    #Impute NA with mean
    # THIS PART TAKES A LONG TIME
    logger.info('Filling in NA vaules with the mean value (this can take some time.....)')
    from sklearn.impute import SimpleImputer
    X_all = SimpleImputer().fit_transform(X_all) 
#     MEAN = X_all.mean()
#     X_all.fillna( value=MEAN, inplace=True )
    
    splits = X_all['split']
    target_all = ys_all['scalar_coupling_constant']
    X_all = X_all.drop('split', axis=1)
    
    X_all = X_all.drop(['atom_0','atom_1','molecule_name'], axis=1)
    
    # STANDARD SCALAR STUFF
    logger.info('Applying Standard scalar to data')
    X_all[X_all.columns] = StandardScaler().fit_transform(X_all[X_all.columns])
    ys_all[ys_all.columns] = StandardScaler().fit_transform(ys_all[ys_all.columns])
    
    X_train = X_all.loc[splits == 'TRAIN']
    X_valid = X_all.loc[splits == 'VALID']
    X_test = X_all.loc[splits == 'TEST']
    
    y_train = ys_all.loc[splits == 'TRAIN']
    y_valid = ys_all.loc[splits == 'VALID']
    y_test = ys_all.loc[splits == 'TEST']
    
    target_train = target_all[splits == 'TRAIN']
    target_valid = target_all[splits == 'VALID']
    target_test = target_all[splits == 'TEST']
    
    m1=2
    m2=4
    m3=1

    train_input=X_train.values
    cv_input=X_valid.values
    train_target=target_train.values
    cv_target=target_valid.values
    train_target_1=m1 * y_train[["charge_0","charge_1"]].values
    cv_target_1=m1 * y_valid[["charge_0","charge_1"]].values
    train_target_2=m2 * y_train[["XX_0","YY_0","ZZ_0","XX_1","YY_1","ZZ_1"]].values
    cv_target_2=m2 * y_valid[["XX_0","YY_0","ZZ_0","XX_1","YY_1","ZZ_1"]].values
    train_target_3=m3 * y_train[["YX_0","ZX_0","XY_0","ZY_0","XZ_0","YZ_0","YX_1","ZX_1","XY_1","ZY_1","XZ_1","YZ_1"]].values
    cv_target_3=m3 * y_valid[["YX_0","ZX_0","XY_0","ZY_0","XZ_0","YZ_0","YX_1","ZX_1","XY_1","ZY_1","XZ_1","YZ_1"]].values
    test_input=X_test.values
    logger.info('Done creating data for model')
    return train_input, cv_input, train_target, cv_target, train_target_1, cv_target_1, train_target_2, cv_target_2, train_target_3, cv_target_3, test_input

In [4]:
def get_logger():
    """
        credits to: https://www.kaggle.com/ogrellier/user-level-lightgbm-lb-1-4480
    """
#     os.environ["TZ"] = "US/Eastern"
#     time.tzset()
    FORMAT = "[%(asctime)s] %(levelname)s : %(message)s"
    logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                     level=logging.INFO, stream=sys.stdout)
#     logging.basicConfig(format=FORMAT)
    logger = logging.getLogger("main")
    logger.setLevel(logging.DEBUG)
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter(FORMAT)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    return logger

logger = get_logger()

In [5]:
filename_start = 'M053_0725_0821_3JHH'
filename_end = '_0.1753MAE_-1.7413LMAE.parquet'
bond_type = '3JHH'
fold = 1

train_input, cv_input, train_target, cv_target, train_target_1, cv_target_1, train_target_2, cv_target_2, train_target_3, cv_target_3, test_input = create_datasets(bond_type, filename_start, filename_end, fold)


[2019-07-31 18:12:49,619] INFO : Creating Datasets
2019-07-31 18:12:49,619 | INFO : Creating Datasets


  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


[2019-07-31 18:12:51,358] INFO : Adding target to dataset
2019-07-31 18:12:51,358 | INFO : Adding target to dataset


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




[2019-07-31 18:12:57,606] INFO : Adding custom target features
2019-07-31 18:12:57,606 | INFO : Adding custom target features


NameError: name 'map_atom_info' is not defined