# QML Hackathon 



In [27]:
% matplotlib inline

import matplotlib.pyplot as plt

import os

import pandas as pd
import numpy as np
import seaborn as sns

In [28]:
# Read the raw train / test splits from the local data directory and keep a
# separate handle on the test set's `id` column.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)
test_id = test.id

## Load data

In [72]:
# Load the pre-built feature table from disk, then display its column index
# so the available feature names are visible in the notebook.
all_data_csv = 'all_data.csv'
new_all_data = pd.read_csv(all_data_csv)
new_all_data.columns

Index([u'Unnamed: 0', u'E', u'Eg', u'a', u'alpha', u'b', u'beta', u'c',
       u'gamma', u'id', u'x_Al', u'x_Ga', u'x_In', u'avg_rs_max',
       u'avg_electronegativity', u'avg_rp_max', u'avg_LUMO', u'avg_IP',
       u'avg_rd_max', u'avg_EA', u'avg_HOMO', u'avg_mass', u'x_Al_avg',
       u'x_Ga_avg', u'x_In_avg', u'a_avg', u'b_avg', u'c_avg',
       u'avg_rs_max_avg', u'avg_electronegativity_avg', u'avg_rp_max_avg',
       u'avg_LUMO_avg', u'avg_IP_avg', u'avg_rd_max_avg', u'avg_EA_avg',
       u'avg_HOMO_avg', u'avg_mass_avg', u'vol_avg', u'alpha_r', u'beta_r',
       u'gamma_r', u'vol', u'atomic_density', u'atomic_density_avg', u'Al_0_0',
       u'Al_1_0', u'Al_2_0', u'Al_3_0', u'Al_4_0', u'Al_5_0', u'Ga_0_0',
       u'Ga_1_0', u'Ga_2_0', u'Ga_3_0', u'Ga_4_0', u'Ga_5_0', u'In_0_0',
       u'In_1_0', u'In_2_0', u'In_3_0', u'In_4_0', u'In_5_0', u'Natoms_10.0',
       u'Natoms_20.0', u'Natoms_30.0', u'Natoms_40.0', u'Natoms_60.0',
       u'Natoms_80.0', u'O_0_0', u'O_1_0', u'O_2_0', u'O

In [73]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Columns fed to the models (order matters: it fixes the column order of
# X_tr / X_te and of the scaled matrices). 'pca_abc' and 'pca_AlGaInDensity'
# are created further down in this cell.
features = ['x_Al', 'x_Ga', 'x_In', 'a', 'b', 'c', 'alpha', 'beta',
            'gamma', 'vol', 'atomic_density', 'x_Al_avg', 'x_Ga_avg', 'x_In_avg', 'a_avg',
            'b_avg', 'c_avg', 'vol_avg', 'atomic_density_avg', 'pca_abc', 'pca_AlGaInDensity',
            'O_0_0', 'O_1_0', 'O_2_0', 'O_3_0', 'O_4_0', 'O_5_0',
            'Al_0_0', 'Al_1_0', 'Al_2_0', 'Al_3_0', 'Al_4_0', 'Al_5_0',
            'Ga_0_0', 'Ga_1_0', 'Ga_2_0', 'Ga_3_0', 'Ga_4_0', 'Ga_5_0',
            'In_0_0', 'In_1_0', 'In_2_0', 'In_3_0', 'In_4_0', 'In_5_0']

# `all_data` was only ever defined by hidden kernel state, so this cell raised
# NameError after "Restart & Run All". Work on a copy of the frame loaded from
# all_data.csv so the loaded frame itself is not mutated by the PCA columns.
# NOTE(review): assumes `new_all_data` is the intended combined frame with the
# train rows first, followed by the test rows -- confirm.
all_data = new_all_data.copy()

# Number of training rows; used to split the combined frame back into
# train / test parts.
n_train = train.shape[0]

# two different vectors for pca
vector1 = all_data[['a', 'b', 'c']].values
vector2 = all_data[['x_Al', 'x_Ga', 'x_In', 'atomic_density_avg']].values

# use pca to add new features: keep only the first principal component of each
pca = PCA()
pca.fit(vector1)
all_data['pca_abc'] = pca.transform(vector1)[:, 0]

pca = PCA()
pca.fit(vector2)
all_data['pca_AlGaInDensity'] = pca.transform(vector2)[:, 0]

# Standardize the features. Linear models tend to work better on standardized
# inputs; training on unscaled data gave slightly worse results.
# The scaler is fitted on the train and test rows together.
scale = StandardScaler()
scaled = scale.fit_transform(all_data[features])

X_scale = scaled[:n_train]
X_scaled_test = scaled[n_train:]

# Unscaled feature matrices, split the same way.
X_tr = all_data.iloc[:n_train][features].values
X_te = all_data.iloc[n_train:][features].values

# Targets are modelled in log1p space.
# NOTE(review): assumes `train` has columns 'E' and 'Eg' -- confirm against
# the cell that loads train.csv.
y1 = np.log1p(train['E'])
y2 = np.log1p(train['Eg'])

y12 = np.column_stack((y1, y2))

X_tr.shape, y1.shape, y2.shape, y12.shape, X_scaled_test.shape



((2400, 45), (2400,), (2400,), (2400, 2), (600, 45))

## Performance Metric

In [74]:
# performance metric
def rmsle(h, y):
    """
    Compute the Root Mean Squared Log Error for hypothesis h and targets y.

    Both inputs are expected on the original (non-log) scale; the log1p
    transform is applied inside this function.

    Args:
        h - array-like containing predictions with shape (n_samples, n_targets)
        y - array-like containing targets with shape (n_samples, n_targets)

    Returns:
        sqrt(mean((log(1 + h) - log(1 + y)) ** 2)), averaged over all elements
    """
    # np.log1p is accurate for values close to zero, where log(x + 1) loses
    # precision, and it also accepts plain Python lists.
    return np.sqrt(np.square(np.log1p(h) - np.log1p(y)).mean())

## Gradient Boosting

In [75]:
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor

In [76]:
# One gradient-boosting model per target variable.
#
# `loss` is intentionally left at its default: the default is the
# least-squares loss in every scikit-learn release, whereas the explicit
# spelling 'ls' was removed in scikit-learn 1.2 and raises there.
# `random_state` is fixed because `max_features` < n_features makes each fit
# stochastic; without a seed the scores change on every run.

grad_1 = GradientBoostingRegressor(
    learning_rate=0.0035,
    max_depth=7,
    n_estimators=1120,
    max_features=7,
    min_samples_leaf=43,
    min_samples_split=14,
    min_weight_fraction_leaf=0.01556,
    random_state=0,
)

grad_2 = GradientBoostingRegressor(
    learning_rate=0.0035,
    max_depth=6,
    n_estimators=3275,
    max_features=2,
    min_samples_leaf=2,
    min_samples_split=2,
    min_weight_fraction_leaf=0.08012,
    random_state=0,
)

def assess_grad(X, y_list, model_list, n_splits=10, random_state=None):
    """Assess model performance with shuffled K-fold cross-validation.

    For each target in `y_list`, the model at the same position in
    `model_list` is refitted on every training fold and scored on the
    held-out fold. Predictions and targets are passed through `np.expm1`
    before scoring, so the targets are expected in log1p space. Each fold
    score is printed as it is computed.

    Args:
        X - array-like feature matrix, indexed by row position
        y_list - list of target vectors, one per model
        model_list - list of estimators with `fit` / `predict`; they are
            fitted in place, so after the call each one holds the fit from
            its last fold
        n_splits - number of cross-validation folds (default 10)
        random_state - seed for the fold shuffling; None keeps the folds
            random on every call

    Returns:
        (mean, std) of the per-target mean fold RMSLE. Note that the std is
        taken across targets, not across folds.
    """
    # Convert to arrays so that fold indices are always positional; indexing a
    # pandas Series with an integer array is label-based and breaks as soon as
    # the index is not 0..n-1.
    X = np.asarray(X)
    final = []
    for idx, y in enumerate(y_list):
        y = np.asarray(y)
        model = model_list[idx]
        kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        out = []
        for train_index, test_index in kfold.split(X):
            model.fit(X[train_index], y[train_index])
            h = model.predict(X[test_index])
            e = rmsle(np.expm1(h), np.expm1(y[test_index]))
            print(e)
            out.append(e)
        final.append(np.array(out).mean())

    final = np.array(final)
    return (final.mean(), final.std())

In [77]:
# Cross-validate both gradient-boosting models on the unscaled training
# features. `model` holds the (mean, std) score tuple returned by assess_grad.
targets = [y1, y2]
regressors = [grad_1, grad_2]
model = assess_grad(X_tr, targets, regressors)
mean_score, std_score = model[0], model[1]
print("Model RMSLE: {}, std: {}".format(mean_score, std_score))

0.03711007430767033
0.03233878998633526
0.02968880176799654
0.03175961694995181
0.03373580313148603
0.030356804408733874
0.026152474625486935
0.03277486103052283
0.036101985420334784
0.03506747847915937
0.09423129499402542
0.09500576452238402
0.08950503984695904
0.08624886933637849
0.08438881979985043
0.10400580146274513
0.10049768957256199
0.08472378532859268
0.10305272256190769
0.06353467023091135
Model RMSLE: 0.0615140573882, std: 0.0290053883774
