In [None]:
## import

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RFR
from bayes_opt import BayesianOptimization as BO
from IPython.display import display
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
import xgboost as XGB
from sklearn.ensemble import GradientBoostingRegressor as GBR
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

%matplotlib inline
plt.style.use('ggplot')

In [None]:
# Load the training table, using the 'id' column as the row index, and
# preview the first rows.
df = pd.read_csv('train_new_features.csv', index_col='id')
df.head()

In [None]:
def rmsle(actual, predicted):
    """Root mean squared logarithmic error between two sequences.

    Computes sqrt(mean((log(1 + predicted) - log(1 + actual)) ** 2)).

    Parameters
    ----------
    actual, predicted : array-like of numbers
        Values are paired by position. Lists, tuples, NumPy arrays and pandas
        Series are accepted; Series are compared positionally rather than
        being aligned on their index.

    Returns
    -------
    numpy.float64
        The RMSLE. NaN if any value is below -1 or the inputs are empty.
    """
    # Convert up front so plain Python sequences work and pandas objects are
    # not index-aligned during the subtraction.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    # log1p(x) == log(1 + x) but keeps precision for values close to zero.
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(log_diff)))
# Wrap rmsle as a scikit-learn scorer. greater_is_better=False makes the
# scorer report the negated RMSLE, so a search that maximises the score is
# minimising the error (the scores in cv_results_ are therefore <= 0).
my_score = make_scorer(rmsle, greater_is_better=False)

In [None]:
def _bayes_search(estimator, space, X, y, n_iter=50, cv=5, random_state=None):
    """Fit a BayesSearchCV over `space` scored with `my_score` and return it."""
    search = BayesSearchCV(estimator,
                           space,
                           scoring=my_score,
                           cv=cv,
                           n_iter=n_iter,
                           n_jobs=-1,
                           verbose=10,
                           random_state=random_state)
    search.fit(X, y)
    return search


def select_model(data, model, n_iter=50, cv=5, random_state=None):
    """Tune one regressor per target ('Eg' and 'Ef') with Bayesian search.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows. Must contain the targets 'Eg' and 'Ef' and the other
        columns listed in `exclude_feature`; every remaining column is used as
        a feature.
    model : {'RFR', 'GBR'}
        Which estimator to tune: RandomForestRegressor or
        GradientBoostingRegressor.
    n_iter : int, default 50
        Number of parameter settings sampled by each search.
    cv : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default None
        Passed to BayesSearchCV; set it to make the search repeatable.

    Returns
    -------
    dict
        'results_g' / 'results_f'          cv_results_ as DataFrames, sorted by
                                           rank and then max_depth
        'best_params_g' / 'best_params_f'  best parameter sets
        'best_g' / 'best_f'                refit best estimators
        'rmsle_g' / 'rmsle_f'              absolute value of the best mean CV score
        'std_g' / 'std_f'                  CV score std of the top-ranked row
        'avg_rmsle'                        mean of 'rmsle_g' and 'rmsle_f'

    Raises
    ------
    KeyError
        If `model` is not 'RFR' or 'GBR'.
    """
    exclude_feature = ['sg', 'Eg', 'Ef', 'alpha_r', 'beta_r', 'gamma_r', 'x_In', 'x_Al',]
    X = data.drop(exclude_feature, axis=1).values
    y_g = data['Eg'].values
    y_f = data['Ef'].values

    if model == 'RFR':
        # max_features is fixed on the estimator instead of being searched.
        estimator = RFR(max_features="sqrt")
        space = {
            "n_estimators": Integer(50, 2000),
            'max_depth': Integer(2, 40),
            'min_samples_split': Integer(2, 15),
        }
    elif model == 'GBR':
        estimator = GBR()
        space = {
            'learning_rate': Real(0.01, 0.5),
            'n_estimators': Integer(1000, 4000),
            'max_depth': Integer(2, 40),
            'min_samples_split': Integer(2, 15),
            'min_samples_leaf': Integer(2, 50),
            'min_weight_fraction_leaf': Real(0., .5),
            # An integer max_features larger than the number of columns is
            # rejected by scikit-learn, so cap the upper bound at X's width.
            'max_features': Integer(2, min(15, X.shape[1])),
        }
    else:
        raise KeyError(model)

    opt_g = _bayes_search(estimator, space, X, y_g,
                          n_iter=n_iter, cv=cv, random_state=random_state)
    opt_f = _bayes_search(estimator, space, X, y_f,
                          n_iter=n_iter, cv=cv, random_state=random_state)

    sort_keys = ['rank_test_score', 'param_max_depth']
    res_g = pd.DataFrame(opt_g.cv_results_).sort_values(sort_keys)
    res_f = pd.DataFrame(opt_f.cv_results_).sort_values(sort_keys)

    output = {}
    output["results_g"] = res_g
    output["results_f"] = res_f

    output["best_params_g"] = opt_g.best_params_
    output["best_params_f"] = opt_f.best_params_

    output["best_g"] = opt_g.best_estimator_
    output["best_f"] = opt_f.best_estimator_

    # Scores are negated RMSLE values, so the maximum is the best one and its
    # absolute value is the RMSLE itself.
    output["rmsle_g"] = abs(res_g['mean_test_score'].max())
    output["std_g"] = res_g.iloc[0]['std_test_score']

    output["rmsle_f"] = abs(res_f['mean_test_score'].max())
    output["std_f"] = res_f.iloc[0]['std_test_score']

    output["avg_rmsle"] = (output["rmsle_g"] + output["rmsle_f"])/2

    return output

In [None]:
# Tune a random forest separately for every space group.
sg = np.sort(df['sg'].unique())
train = {}
out_rfr = {}

# Group once; get_group then only looks up the rows of each space group.
df_by_sg = df.groupby('sg')
for s in sg:
    train[s] = df_by_sg.get_group(s)
    out_rfr[s] = select_model(train[s], 'RFR')

In [None]:
# Per-space-group average RMSLE of the tuned random forests, in `sg` order,
# followed by their unweighted mean.
avg_rmsle_rfr = np.array([out_rfr[s]["avg_rmsle"] for s in sg], dtype=float)

print(avg_rmsle_rfr, avg_rmsle_rfr.mean())

In [None]:
# Print every rank-1 parameter set for 'Ef' and rebuild the top candidates.
# Ties are ordered so that the largest n_estimators comes first.
for s in sg:
    res_f = out_rfr[s]["results_f"].sort_values(['rank_test_score', 'param_n_estimators'], ascending=[True,False])
    rank_f = res_f[res_f['rank_test_score'] == 1]
    for param in rank_f["params"]:
        print(param)

    res_g = out_rfr[s]["results_g"].sort_values(['rank_test_score', 'param_n_estimators'], ascending=[True,False])
    params_f = res_f["params"].iloc[0]
    params_g = res_g["params"].iloc[0]

    # The "params" entries hold only the searched hyper-parameters. Start from
    # the tuned estimator's full parameter set so that settings fixed before
    # the search (max_features="sqrt") are kept in the rebuilt model.
    fixed_f = out_rfr[s]["best_f"].get_params(deep=False)
    fixed_g = out_rfr[s]["best_g"].get_params(deep=False)
    # Unfitted estimators; after the loop only the last group's remain bound.
    best_f = RFR(**dict(fixed_f, **params_f))
    best_g = RFR(**dict(fixed_g, **params_g))

In [None]:
data_test = pd.read_csv('test_new_features.csv', index_col='id')

def predict(data):
    """Predict both targets for `data` with the tuned random forests.

    For every space group in the notebook-level `sg`, each parameter set that
    tied for rank 1 in `out_rfr[s]` is refit on that group's rows of `train`;
    the mean and standard deviation over those models are stored per test row.
    Space groups with no rows in `data` are skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Test rows, containing 'sg' and the other columns listed in
        `test_ex_feature`; the remaining columns are used as features.
        NOTE(review): features are passed positionally (.values), so the
        remaining columns are assumed to be in the same order as in `train`.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The test rows (sorted by index) with 'std_g', 'bandgap_energy_ev',
        'std_f' and 'formation_energy_ev_natom' added, and a frame holding only
        'formation_energy_ev_natom' and 'bandgap_energy_ev'.
    """
    train_ex_feature = ['sg', 'Eg', 'Ef', 'alpha_r', 'beta_r', 'gamma_r', 'x_In', 'x_Al',]
    test_ex_feature = ['sg', 'alpha_r', 'beta_r', 'gamma_r', 'x_In', 'x_Al']

    def rank1_predictions(tag, results, template, X, y, X_test):
        """Refit one model per rank-1 parameter set; return (n_models, n_test)."""
        # "params" holds only the searched hyper-parameters; the tuned
        # estimator supplies the ones fixed before the search
        # (max_features="sqrt"), which a bare RFR(**param) would drop.
        fixed = template.get_params(deep=False)
        rank1 = results.groupby('rank_test_score').get_group(1)
        stacked = []
        for i, param in enumerate(rank1["params"]):
            print(tag, i)
            fitted = type(template)(**dict(fixed, **param)).fit(X, y)
            stacked.append(fitted.predict(X_test))
        return np.array(stacked, dtype=float).reshape(-1, X_test.shape[0])

    test_groups = data.groupby('sg')
    df_test = {}

    for s in sg:
        if s not in test_groups.groups:
            # No test rows for this space group: nothing to predict.
            continue

        X = train[s].drop(train_ex_feature, axis=1).values
        y_g = train[s]['Eg'].values
        y_f = train[s]['Ef'].values

        # Copy so the prediction columns are added to an independent frame
        # rather than to a slice of `data`.
        df_test[s] = test_groups.get_group(s).copy()
        X_test = df_test[s].drop(test_ex_feature, axis=1).values

        print(s)
        y_pred_g = rank1_predictions("g", out_rfr[s]["results_g"],
                                     out_rfr[s]["best_g"], X, y_g, X_test)
        display(pd.DataFrame(y_pred_g))
        df_test[s]['std_g'] = np.std(y_pred_g, axis=0)
        df_test[s]['bandgap_energy_ev'] = np.mean(y_pred_g, axis=0)

        y_pred_f = rank1_predictions("f", out_rfr[s]["results_f"],
                                     out_rfr[s]["best_f"], X, y_f, X_test)
        display(pd.DataFrame(y_pred_f))
        df_test[s]['std_f'] = np.std(y_pred_f, axis=0)
        df_test[s]['formation_energy_ev_natom'] = np.mean(y_pred_f, axis=0)

    df_pred = pd.concat([df_test[s] for s in sg if s in df_test]).sort_index()
    return df_pred, df_pred[['formation_energy_ev_natom', 'bandgap_energy_ev']]


pred = predict(data_test)
display(pred[1])


In [None]:
# Tune a gradient-boosting regressor separately for every space group.
train = {}
out_gbr = {}

# Group once; get_group then only looks up the rows of each space group.
df_by_sg = df.groupby('sg')
for s in sg:
    train[s] = df_by_sg.get_group(s)
    out_gbr[s] = select_model(train[s], 'GBR')

# Per-group average RMSLE in `sg` order, built once instead of growing an
# array with np.append on every iteration.
avg_rmsle_gbr = np.array([out_gbr[s]["avg_rmsle"] for s in sg], dtype=float)

In [None]:
# Per-space-group average RMSLE of the tuned GBR models and their unweighted mean.
print(avg_rmsle_gbr, avg_rmsle_gbr.mean())

In [None]:
# For each space group: the five top-ranked 'Eg' search results, then the
# refit best 'Eg' estimator.
for s in sg:
    group_out = out_gbr[s]
    display(group_out["results_g"].head())
    print(s, group_out["best_g"])

In [None]:
data_test = pd.read_csv('test_new_features.csv', index_col='id')

def predict(data):
    """Predict both targets for `data` with the tuned gradient-boosting models.

    This cell rebinds the name `predict`, replacing the random-forest version
    defined in an earlier cell.

    For every space group in the notebook-level `sg`, each parameter set that
    tied for rank 1 in `out_gbr[s]` is refit on that group's rows of `train`;
    the mean and standard deviation over those models are stored per test row.
    Space groups with no rows in `data` are skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Test rows, containing 'sg' and the other columns listed in
        `test_ex_feature`; the remaining columns are used as features.
        NOTE(review): features are passed positionally (.values), so the
        remaining columns are assumed to be in the same order as in `train`.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The test rows (sorted by index) with 'std_g', 'bandgap_energy_ev',
        'std_f' and 'formation_energy_ev_natom' added, and a frame holding only
        'formation_energy_ev_natom' and 'bandgap_energy_ev'.
    """
    train_ex_feature = ['sg', 'Eg', 'Ef', 'alpha_r', 'beta_r', 'gamma_r', 'x_In', 'x_Al',]
    test_ex_feature = ['sg', 'alpha_r', 'beta_r', 'gamma_r', 'x_In', 'x_Al']

    def rank1_predictions(tag, results, template, X, y, X_test):
        """Refit one model per rank-1 parameter set; return (n_models, n_test)."""
        # Start from the tuned estimator's full parameter set so the refit
        # model is configured exactly like the one that was cross-validated,
        # then apply the searched values of this particular candidate.
        fixed = template.get_params(deep=False)
        rank1 = results.groupby('rank_test_score').get_group(1)
        stacked = []
        for i, param in enumerate(rank1["params"]):
            print(tag, i)
            fitted = type(template)(**dict(fixed, **param)).fit(X, y)
            stacked.append(fitted.predict(X_test))
        return np.array(stacked, dtype=float).reshape(-1, X_test.shape[0])

    test_groups = data.groupby('sg')
    df_test = {}

    for s in sg:
        if s not in test_groups.groups:
            # No test rows for this space group: nothing to predict.
            continue

        X = train[s].drop(train_ex_feature, axis=1).values
        y_g = train[s]['Eg'].values
        y_f = train[s]['Ef'].values

        # Copy so the prediction columns are added to an independent frame
        # rather than to a slice of `data`.
        df_test[s] = test_groups.get_group(s).copy()
        X_test = df_test[s].drop(test_ex_feature, axis=1).values

        print(s)
        y_pred_g = rank1_predictions("g", out_gbr[s]["results_g"],
                                     out_gbr[s]["best_g"], X, y_g, X_test)
        display(pd.DataFrame(y_pred_g))
        df_test[s]['std_g'] = np.std(y_pred_g, axis=0)
        df_test[s]['bandgap_energy_ev'] = np.mean(y_pred_g, axis=0)

        y_pred_f = rank1_predictions("f", out_gbr[s]["results_f"],
                                     out_gbr[s]["best_f"], X, y_f, X_test)
        display(pd.DataFrame(y_pred_f))
        df_test[s]['std_f'] = np.std(y_pred_f, axis=0)
        df_test[s]['formation_energy_ev_natom'] = np.mean(y_pred_f, axis=0)

    df_pred = pd.concat([df_test[s] for s in sg if s in df_test]).sort_index()
    return df_pred, df_pred[['formation_energy_ev_natom', 'bandgap_energy_ev']]


pred = predict(data_test)
display(pred[1])


In [None]:
# NOTE(review): `val_g` is not defined in any cell visible in this notebook, so
# this cell fails on a fresh kernel — confirm where it is supposed to come from.
# The values are reshaped to 6 rows of 10 and averaged per row; presumably one
# row per space group (np.average below needs len(val) == len(sg)) — TODO confirm.
val = np.mean(val_g.reshape(6,10), axis=1)
print(val)
# `avg` is never used in this cell.
avg = []
w = []
for i, s in enumerate(sg):
    print(i, s)
    print(train[s].shape)
    print(val[i])
    # Weight of each row mean = number of training rows in that space group.
    w = np.append(w, train[s].shape[0])
print(w)
# Row means averaged with the group sizes as weights.
print(np.average(val, weights=w))

In [None]:
# Unweighted mean of the 6 per-row means of `val_g`.
# NOTE(review): `val_g` is not defined in any visible cell — confirm its origin.
np.mean(val_g.reshape(6,10), axis=1).mean()