# QML Hackathon 



In [3]:
% matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
import os

import pandas as pd
import numpy as np

In [5]:
# Long CSV column name -> short alias. A single mapping is shared by the train
# and test tables so the two renames cannot drift apart.
feature_aliases = {
    'spacegroup': 'sg',
    'number_of_total_atoms': 'Natoms',
    'percent_atom_al': 'x_Al',
    'percent_atom_ga': 'x_Ga',
    'percent_atom_in': 'x_In',
    'lattice_vector_1_ang': 'a',
    'lattice_vector_2_ang': 'b',
    'lattice_vector_3_ang': 'c',
    'lattice_angle_alpha_degree': 'alpha',
    'lattice_angle_beta_degree': 'beta',
    'lattice_angle_gamma_degree': 'gamma',
}

# Aliases for the two target columns; these are applied to the training table only.
target_aliases = {
    'formation_energy_ev_natom': 'E',
    'bandgap_energy_ev': 'Eg',
}

# rename() returns a new frame, so re-running this cell always starts from the
# freshly read CSV instead of mutating an existing object in place.
test = pd.read_csv('./data/test.csv').rename(columns=feature_aliases)
test_id = test.id  # the 'id' column is not renamed

train = pd.read_csv('./data/train.csv').rename(
    columns=dict(feature_aliases, **target_aliases))

## Load data

In [6]:
# Read the combined feature table once and expose it under both names used in
# this notebook (`new_all_data` and `all_data` refer to the same DataFrame).
all_data = new_all_data = pd.read_csv('./data/all_data.csv')

# Show the renamed training columns as the cell output.
train.columns

Index(['id', 'sg', 'Natoms', 'x_Al', 'x_Ga', 'x_In', 'a', 'b', 'c', 'alpha',
       'beta', 'gamma', 'E', 'Eg'],
      dtype='object')

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Build the model inputs: pick the feature columns, add two PCA-derived
# columns to all_data, create scaled and unscaled train/test matrices, and
# log1p-transform the two targets.

# features to use
# Every name except 'pca_abc' and 'pca_AlGaInDensity' must already be a column
# of all_data; those two are created further down in this cell.
features = ['x_Al', 'x_Ga', 'x_In', 'a', 'b', 'c', 'alpha', 'beta',
            'gamma', 'vol', 'atomic_density', 'x_Al_avg','x_Ga_avg', 'x_In_avg', 'a_avg',
            'b_avg', 'c_avg', 'vol_avg', 'atomic_density_avg', 'pca_abc', 'pca_AlGaInDensity',
            'O_0_0','O_1_0', 'O_2_0', 'O_3_0', 'O_4_0', 'O_5_0', 'Al_0_0', 'Al_1_0', 'Al_2_0', 'Al_3_0', 'Al_4_0', 'Al_5_0', 'Ga_0_0',
            'Ga_1_0', 'Ga_2_0', 'Ga_3_0', 'Ga_4_0', 'Ga_5_0', 'In_0_0', 'In_1_0',
            'In_2_0', 'In_3_0', 'In_4_0', 'In_5_0',]

# two different vectors for pca
vector1 = all_data[['a', 'b', 'c']].values
vector2 = all_data[['x_Al', 'x_Ga', 'x_In', 'atomic_density_avg']].values

# use pca to add new features
# Only the first principal component ([:,0]) of each fit is kept.
pca = PCA()
pca.fit(vector1)
all_data['pca_abc'] = pca.transform(vector1)[:,0]

# `pca` is rebound here, so the fit on vector1 is no longer reachable afterwards.
pca = PCA()
pca.fit(vector2)
all_data['pca_AlGaInDensity'] = pca.transform(vector2)[:,0]

# scaling the data. Linear models tend to prefer more normally distributed features
# I tried training on non-scaled, with slightly worse results
# NOTE: the scaler is fitted on every row of all_data, i.e. on the rows later
# used as the training part and the test part together.
scale = StandardScaler()
scaled = scale.fit(all_data[features]).transform(all_data[features])

# Split by row position: the first train.shape[0] rows are treated as the
# training rows and the remainder as the test rows.
# assumes all_data stacks the train rows first, in the same order as `train` — TODO confirm
X_scale = scaled[:train.shape[0]]
X_scaled_test = scaled[train.shape[0]:]

# Unscaled counterparts of the two matrices above.
X_tr = all_data[:train.shape[0]][features].values
X_te = all_data[train.shape[0]:][features].values

# Targets are modelled in log1p space; the assess_* functions map predictions
# and targets back with expm1 before scoring.
y1 = np.log1p(train['E'])
y2 = np.log1p(train['Eg'])

# Both targets side by side, one column per target.
y12 = np.column_stack((y1, y2))

# Shape sanity check, shown as the cell output.
X_tr.shape, y1.shape, y2.shape, y12.shape, X_scaled_test.shape



((2400, 45), (2400,), (2400,), (2400, 2), (600, 45))

## Performance Metric

In [8]:
# performance metric
def rmsle(h, y):
    """
    Compute the Root Mean Squared Log Error for hypothesis h and targets y.

    Both inputs must be on the original (untransformed) scale; callers that
    hold log1p-space values have to apply np.expm1 before calling.

    Args:
        h - array-like containing predictions with shape (n_samples, n_targets)
        y - array-like containing targets with shape (n_samples, n_targets)

    Returns:
        float - sqrt(mean((log(h + 1) - log(y + 1)) ** 2)), averaged over all
        elements
    """
    # Plain arrays accept lists as input and make the subtraction strictly
    # positional (pandas objects would otherwise be aligned by index label).
    h = np.asarray(h, dtype=float)
    y = np.asarray(y, dtype=float)

    # log1p keeps precision for values close to zero, where log(x + 1) rounds
    # x + 1 to exactly 1.0 and returns 0.
    return np.sqrt(np.square(np.log1p(h) - np.log1p(y)).mean())

## Gradient Boosting

In [9]:
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor

In [10]:
# One gradient-boosting model per target variable: grad_1 is evaluated against
# y1 and grad_2 against y2 in the assess_grad call.
#
# The `loss` argument is intentionally not passed. Least squares is the default
# loss of GradientBoostingRegressor under both its old name ('ls') and its
# current name ('squared_error'), and the 'ls' spelling is rejected by
# scikit-learn >= 1.2, so relying on the default works on old and new versions.
#
# NOTE(review): no random_state is set, so repeated runs are not expected to
# give identical scores.

grad_1 = GradientBoostingRegressor(
    learning_rate=0.0035,
    max_depth=7,
    n_estimators=1120,
    max_features=7,
    min_samples_leaf=43,
    min_samples_split=14,
    min_weight_fraction_leaf=0.01556)

grad_2 = GradientBoostingRegressor(
    learning_rate=0.0035,
    max_depth=6,
    n_estimators=3275,
    max_features=2,
    min_samples_leaf=2,
    min_samples_split=2,
    min_weight_fraction_leaf=0.08012)

def assess_grad(X, y_list, model_list, random_state=None):
    """Cross-validate one model per target and summarise the RMSLE scores.

    For each target in ``y_list`` the rows of ``X`` are split into 10 shuffled
    folds; the model at the same position in ``model_list`` is refit on every
    training fold and scored on the held-out fold. Each fold score is printed.

    Args:
        X - array-like of features with shape (n_samples, n_features)
        y_list - sequence of target vectors in log1p space, each with
            n_samples entries (predictions and targets are mapped back with
            expm1 before scoring)
        model_list - sequence of estimators exposing fit/predict, one per
            entry of y_list
        random_state - optional seed forwarded to KFold; the default None
            keeps the original unseeded shuffling

    Returns:
        tuple (mean, std) computed over the per-target mean fold scores, so
        the std measures the spread between targets, not between folds.
    """
    # KFold yields positional row numbers. Plain arrays make `[...]` positional
    # too; on a pandas Series the same syntax is a label lookup, which only
    # matches when the index happens to be 0..n-1.
    X = np.asarray(X)
    final = []
    for idx, y in enumerate(y_list):
        y = np.asarray(y)
        model = model_list[idx]
        kfold = KFold(n_splits=10, shuffle=True, random_state=random_state)
        out = []
        for train_index, test_index in kfold.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train)
            h = model.predict(X_test)
            # Score on the original scale: undo the log1p applied to the targets.
            e = rmsle(np.expm1(h), np.expm1(y_test))
            print(e)
            out.append(e)
        final.append(np.array(out).mean())

    return (np.array(final).mean(), np.array(final).std())

In [11]:
# Cross-validate the two gradient-boosting models on the unscaled features;
# `model` holds the returned (mean, std) pair.
model = assess_grad(X_tr, [y1, y2], [grad_1, grad_2])
print("Model RMSLE: {}, std: {}".format(*model))

0.031235830292461682
0.030213516769038162
0.03733221272908664
0.03709529431758591
0.0345533741913991
0.030420868905986463
0.03491026564659911
0.028142406384270787
0.03277225539180304
0.029440832491352067
0.08524644686741777
0.0895614640931458
0.08201402621060525
0.0803850039770066
0.08982694365722622
0.09538026848156189
0.09421868598431501
0.10259336822400718
0.10205821036502735
0.08889803711063618
Model RMSLE: 0.061814965604526616, std: 0.029203279892568317


In [12]:
from xgboost import XGBRegressor
# Settings that are identical for both XGBoost regressors.
xgb_shared = dict(learning_rate=0.005, n_jobs=3, gamma=0.0, silent=True)

# Regressor evaluated against the first target (y1).
xgb_1 = XGBRegressor(
    n_estimators=1804,
    subsample=0.222159,
    colsample_bytree=0.5359,
    colsample_bylevel=0.19958,
    max_delta_step=64,
    max_depth=28,
    min_child_weight=10,
    reg_lambda=0.33038,
    **xgb_shared
)

# Regressor evaluated against the second target (y2).
xgb_2 = XGBRegressor(
    n_estimators=2386,
    subsample=0.90919,
    colsample_bytree=0.59049,
    colsample_bylevel=0.59404,
    max_delta_step=99,
    max_depth=58,
    min_child_weight=85,
    reg_lambda=0.031165789070644215,
    **xgb_shared
)
def assess_xgb(X, y_list, model_num, random_state=None):
    """Cross-validate one model per target and summarise the RMSLE scores.

    For each target in ``y_list`` the rows of ``X`` are split into 10 shuffled
    folds; the model at the same position in ``model_num`` is refit on every
    training fold and scored on the held-out fold. Each fold score is printed.

    Args:
        X - array-like of features with shape (n_samples, n_features)
        y_list - sequence of target vectors in log1p space, each with
            n_samples entries (predictions and targets are mapped back with
            expm1 before scoring)
        model_num - sequence of estimators exposing fit/predict, one per
            entry of y_list
        random_state - optional seed forwarded to KFold; the default None
            keeps the original unseeded shuffling

    Returns:
        tuple (mean, std) computed over the per-target mean fold scores, so
        the std measures the spread between targets, not between folds.
    """
    # KFold yields positional row numbers. Plain arrays make `[...]` positional
    # too; on a pandas Series the same syntax is a label lookup, which only
    # matches when the index happens to be 0..n-1.
    X = np.asarray(X)
    final = []
    for idx, y in enumerate(y_list):
        y = np.asarray(y)
        model = model_num[idx]
        kfold = KFold(n_splits=10, shuffle=True, random_state=random_state)
        out = []
        for train_index, test_index in kfold.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train)
            h = model.predict(X_test)
            # Score on the original scale: undo the log1p applied to the targets.
            e = rmsle(np.expm1(h), np.expm1(y_test))
            print('RMSLE: {}'.format(e))
            out.append(e)
        final.append(np.array(out).mean())
    return (np.array(final).mean(), np.array(final).std())

# Cross-validate the two XGBoost models on the unscaled features;
# `model` holds the returned (mean, std) pair.
model = assess_xgb(X_tr, [y1, y2], [xgb_1, xgb_2])
print("Model RMSLE: {}, std: {}".format(*model))

RMSLE: 0.03522487062519504
RMSLE: 0.030835353908031056
RMSLE: 0.03797183373498328
RMSLE: 0.028746763609099585
RMSLE: 0.03291541187926484
RMSLE: 0.03539575577980152
RMSLE: 0.03305602119743682
RMSLE: 0.0328463260280417
RMSLE: 0.02685749455724478
RMSLE: 0.02984019691176692
RMSLE: 0.08246473826912223
RMSLE: 0.08031301343166705
RMSLE: 0.08314740596951575
RMSLE: 0.08390854213531491
RMSLE: 0.1008357469735944
RMSLE: 0.09576484297164943
RMSLE: 0.08011960504208634
RMSLE: 0.08718077261497703
RMSLE: 0.08297010778211764
RMSLE: 0.07966571893585024
Model RMSLE: 0.05900302611783803, std: 0.026634023294751476


In [13]:
from catboost import CatBoostRegressor



# I found these parameters worked for both y variables
cat_params = dict(
    iterations=2300,
    learning_rate=0.020,
    depth=5,
    loss_function='RMSE',
    eval_metric='RMSE',
    od_type='Iter',
    od_wait=50,
)
cat_1 = CatBoostRegressor(**cat_params)

def assess_cat(X, y_list, model_num, random_state=None):
    """Cross-validate one model per target and summarise the RMSLE scores.

    For each target in ``y_list`` the rows of ``X`` are split into 10 shuffled
    folds; the model at the same position in ``model_num`` is refit (with
    ``verbose=False``) on every training fold and scored on the held-out
    fold. Each fold score is printed.

    Args:
        X - array-like of features with shape (n_samples, n_features)
        y_list - sequence of target vectors in log1p space, each with
            n_samples entries (predictions and targets are mapped back with
            expm1 before scoring)
        model_num - sequence of estimators whose fit accepts ``verbose``, one
            per entry of y_list; the same object may appear more than once
        random_state - optional seed forwarded to KFold; the default None
            keeps the original unseeded shuffling

    Returns:
        tuple (mean, std) computed over the per-target mean fold scores, so
        the std measures the spread between targets, not between folds.
    """
    # KFold yields positional row numbers. Plain arrays make `[...]` positional
    # too; on a pandas Series the same syntax is a label lookup, which only
    # matches when the index happens to be 0..n-1.
    X = np.asarray(X)
    final = []
    for idx, y in enumerate(y_list):
        y = np.asarray(y)
        model = model_num[idx]
        kfold = KFold(n_splits=10, shuffle=True, random_state=random_state)
        out = []
        for train_index, test_index in kfold.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train, verbose=False)
            h = model.predict(X_test)
            # Score on the original scale: undo the log1p applied to the targets.
            e = rmsle(np.expm1(h), np.expm1(y_test))
            print('RMSLE: {}'.format(e))
            out.append(e)
        final.append(np.array(out).mean())
    return (np.array(final).mean(), np.array(final).std())

In [14]:
# Cross-validate CatBoost on the unscaled features; the same cat_1 object is
# used for both targets. `model` holds the returned (mean, std) pair.
model = assess_cat(X_tr, [y1, y2], [cat_1, cat_1])
print("Model RMSLE: {}, std: {}".format(*model))

RMSLE: 0.03265148892049478
RMSLE: 0.032966912284922316
RMSLE: 0.03270096213658705
RMSLE: 0.03609359217071158
RMSLE: 0.028385102571945736
RMSLE: 0.03069123434926073
RMSLE: 0.03827922222160731
RMSLE: 0.028664024544775023
RMSLE: 0.036294420153251586
RMSLE: 0.027907790008408744
RMSLE: 0.10289044143355489
RMSLE: 0.0784031133204579
RMSLE: 0.09678459741400268
RMSLE: 0.09154133403194552
RMSLE: 0.0977357574782428
RMSLE: 0.08557513678252765
RMSLE: 0.07492530340325988
RMSLE: 0.08571380810644491
RMSLE: 0.08211000238925993
RMSLE: 0.0754606515438452
Model RMSLE: 0.059788744763275316, std: 0.02732526982707883


In [15]:
from sklearn.model_selection import train_test_split

# CatBoost configuration used by assess_cv_catboost, which fits it with a
# separate validation split.
catboost_cv_params = dict(
    iterations=1200,
    learning_rate=0.03,
    depth=5,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=99,
    od_type='Iter',
    od_wait=50,
)
catboost_cv = CatBoostRegressor(**catboost_cv_params)
    
def assess_cv_catboost(X, y_list, random_state=None):
    """Cross-validate the module-level ``catboost_cv`` model with a validation split.

    For each target in ``y_list`` the rows of ``X`` are split into 10 shuffled
    folds. Within every fold the training part is split again 70/30 into
    train and validation sets; ``catboost_cv`` is fitted on the train set with
    the validation set passed as ``eval_set`` and ``use_best_model=True``,
    then scored on the held-out fold. Each fold score is printed.

    Args:
        X - array-like of features with shape (n_samples, n_features)
        y_list - sequence of target vectors in log1p space, each with
            n_samples entries (predictions and targets are mapped back with
            expm1 before scoring)
        random_state - optional seed forwarded to KFold and train_test_split;
            the default None keeps the original unseeded splitting

    Returns:
        tuple (mean, std) computed over the per-target mean fold scores, so
        the std measures the spread between targets, not between folds.
    """
    # KFold yields positional row numbers. Plain arrays make `[...]` positional
    # too; on a pandas Series the same syntax is a label lookup, which only
    # matches when the index happens to be 0..n-1.
    X = np.asarray(X)
    model = catboost_cv  # one shared estimator, refitted for every fold
    final = []
    for y in y_list:
        y = np.asarray(y)
        kfold = KFold(n_splits=10, shuffle=True, random_state=random_state)
        out = []
        for train_index, test_index in kfold.split(X):
            # splitting the data up into train, test, and valid sets
            X_iter, X_test = X[train_index], X[test_index]
            y_iter, y_test = y[train_index], y[test_index]
            X_train, X_valid, y_train, y_valid = train_test_split(
                X_iter, y_iter, test_size=0.3, random_state=random_state)
            model.fit(X_train, y_train,
                      eval_set=(X_valid, y_valid),
                      use_best_model=True,
                      verbose=False)
            h = model.predict(X_test)
            # Score on the original scale: undo the log1p applied to the targets.
            e = rmsle(np.expm1(h), np.expm1(y_test))
            print('RMSLE: {}'.format(e))
            out.append(e)
        final.append(np.array(out).mean())
    return (np.array(final).mean(), np.array(final).std())

In [16]:
# Cross-validate CatBoost with an inner validation split on the unscaled
# features; `model` holds the returned (mean, std) pair.
model = assess_cv_catboost(X_tr, [y1, y2])
print("Model RMSLE: {}, std: {}".format(*model))

RMSLE: 0.03506162050857025
RMSLE: 0.03577422553226561
RMSLE: 0.030901495220155003
RMSLE: 0.03254313256986995
RMSLE: 0.03161603188585777
RMSLE: 0.027491057083018765
RMSLE: 0.03207401960035554
RMSLE: 0.029544733541181572
RMSLE: 0.03383616987424214
RMSLE: 0.03538066670311648
RMSLE: 0.07930661019841984
RMSLE: 0.09894086270512083
RMSLE: 0.08022644595999424
RMSLE: 0.08160873103687519
RMSLE: 0.08223125819927198
RMSLE: 0.0867955337440799
RMSLE: 0.09383290322618729
RMSLE: 0.09923622601054577
RMSLE: 0.08611985621795983
RMSLE: 0.08554798098910255
Model RMSLE: 0.05990347804030953, std: 0.027481162788446218
