### Load in the Pickle Data File and Drop Duplicate Smple

In [1]:
import numpy as np
import pandas as pd
import pickle
import warnings
warnings.filterwarnings("ignore") 

with open('feature.pkl', 'rb') as f:
    data = pickle.load(f)

data = data.drop_duplicates(subset = 'material_id')
data.head(5)

Unnamed: 0,Formula,Mag_atom,Near_Distance,SGR,Tc_avg,Tc_std,material_id,Tol_DOS,Fe_dDOS,Cr_dDOS,...,Co_dDOS,Coordnate_Number,Magnetic_moment,Bound_Length,lattice_a,lattice_b,lattice_c,lattice_alpha,lattice_beta,lattice_gamma
0,AlAu2Mn,Mn,4.495784914784067,225,216.5,16.5,mp-5491,0.206274,0.0,0.0,...,0.0,6.0,0.005,4.528144,4.528144,4.528144,4.528144,60.0,60.0,60.0
3,AlB2Fe2,Fe,2.685837244036948,65,288.0,13.735599,mp-3805,0.132829,0.099305,0.0,...,0.0,4.0,-0.013,2.785664,2.863114,2.915233,5.695537,104.828257,90.0,90.0
5,AlCCr2,Cr,2.7526898308246373,194,73.0,0.0,mp-9956,0.158591,0.0,0.130603,...,0.0,4.5,0.0,2.751682,2.843004,2.843004,12.707956,90.0,90.0,120.0
8,AlCMn3,Mn,2.738058878110549,221,294.25,9.120718,mp-4593,0.127507,0.0,0.0,...,0.0,8.0,-0.407333,2.691674,3.806601,3.806601,3.806601,90.0,90.0,90.0
16,Al2Ce2Co15,Co,2.436418136336501,166,751.0,0.0,mp-16484,1.120249,0.0,0.0,...,0.823275,8.0,1.18,2.475784,6.325907,6.325907,6.325907,82.507456,82.507456,82.507456


### Only Keep Important Features in Table: 
- Density of state related: 
    - Tol_DOS, Fe_dDOS, Cr_dDOS, Mn_dDOS, Co_dDOS
- Crystal structure related: 
    - Coordnate_Number, Magnetic_moment,  Bound_Length

In [2]:
## get rid off nuisance features and drop NAN
tab = data.drop(['Formula', 'Mag_atom', 'Near_Distance', 'Tc_std','material_id','SGR'],axis = 1)
tab = tab.dropna(axis=0)
print("The dimension of this tab is", tab.shape)

The dimension of this tab is (317, 15)


### Feature Scaling
- Apply standard feature **(StandardScaler)** scaling
    - z = (x - u)/s
    - x is the data point to be scale and u is the mean of data set and s is the standard deviation **(std)
    - The default behavior of is centering the data first and then scaling the data to unit std

In [3]:
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler(copy = True, with_mean = True, with_std = True)

for i in range(tab.shape[1]):
    feature = np.array(tab.iloc[:,i]).reshape(tab.shape[0],1)
    tab.iloc[:,i] = scalar.fit_transform(feature)

### Compute Pair Correlation Between Features###
- High correlcation between 'Tol_DOS' and 'Fe_dDOS' and 'Co_dDOS' 
- Shows that there is more samples contains Fe and Co element

In [4]:
import seaborn as sn
import matplotlib.pyplot as plt

plt.figure(figsize = (7, 7), dpi = 100)
sn.heatmap(tab.corr(), annot = True, annot_kws = dict(fontsize = 11), fmt = '.1f', cmap = 'YlGnBu', linewidths=.5)

<matplotlib.axes._subplots.AxesSubplot at 0x1a1cfb6940>

### Shuffle the Orders of Sample, and Separate the Table to Input "X" and Lable "Y"

In [5]:
tab = tab.sample(frac = 1, random_state = 5).reset_index(drop = True)
X = tab.drop('Tc_avg', axis = 1)
Y = pd.DataFrame(tab['Tc_avg'])

### Import models: ###
- Support Vector Regression
- Kernel Ridge Regression 
- Random Forest 
- Gradient Boosting

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR

### Apply Grid Search with 5 Folds Cross Validation
- Using R square and and explained_variance_score as metric for this regression model 
- reference for Train/Test Split and Cross Validation
    - https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6

### Define The Functions For Implementing Grid Search and Report Results

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, make_scorer
from tabulate import tabulate
from IPython.display import display, HTML

pd.options.display.max_colwidth = 500

def report_grid_search(search_result, parameters, show_number):
    print("best_socres = %.3f "% search_result.best_score_)
    print("best_parameters = %r" %search_result.best_params_)
    
    means = search_result.cv_results_['mean_test_score']
    stds = search_result.cv_results_['std_test_score']
    params = search_result.cv_results_['params']
    
    results = pd.DataFrame({'means': means, "stds": stds, "params": params})
    results = results.sort_values(by = ['means'], ascending = False)
    display(HTML(results.head(show_number).to_html()))

def grid_search(model, parameters, X, Y):
    scorer = make_scorer(r2_score, greater_is_better = True)
    grid_search = GridSearchCV(model, parameters, cv = 5, scoring = scorer)
    grid_search.fit(X, Y)
    return grid_search

### Fine Tuned Parameters for Support Vector Regression (SVR) Model
- **Gamma**: Kernel coefficient for rbf, must be greater than 0.
- **C**: Penalty parameter C of the error term.
- **Epsilon**: a margin of tolerance (epsilon).
- **Url**: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

In [24]:
parameters = [{'C': np.arange(3, 6, 0.1), 'epsilon': np.arange(0, 1, 0.05), 'gamma': np.arange(0.03, 0.07, 0.005) }]
model = SVR(kernel = 'rbf')
SVR_grid_search = grid_search(model, parameters, X, Y)
report_grid_search(SVR_grid_search, parameters, show_number = 5)

best_socres = 0.406 
best_parameters = {'C': 5.000000000000002, 'epsilon': 0.2, 'gamma': 0.03}


Unnamed: 0,means,stds,params
3636,0.406148,0.15182,"{'C': 5.000000000000002, 'epsilon': 0.2, 'gamma': 0.03}"
3456,0.406034,0.152226,"{'C': 4.900000000000002, 'epsilon': 0.2, 'gamma': 0.03}"
3816,0.40595,0.151862,"{'C': 5.100000000000001, 'epsilon': 0.2, 'gamma': 0.03}"
3276,0.405868,0.152611,"{'C': 4.800000000000002, 'epsilon': 0.2, 'gamma': 0.03}"
3096,0.405544,0.153146,"{'C': 4.700000000000001, 'epsilon': 0.2, 'gamma': 0.03}"


### Fine Tuned Parameters for Kernel Ridge Regression (KRR) Model
- **Alpha**: Penalty parameter to improve the conditioning of the problem and reduce the variance of the estimates.
- **Gamma**: Coefficient for the kernel.
-  **Url**:https://scikitlearn.org/stable/modules/generated/sklearn.kernel_ridge.KernelRidge.html#sklearn.kernel_ridge.KernelRidge

In [40]:
parameters = [{'alpha': np.arange(0, 0.1, 0.01), 'gamma': np.arange(0.01, 0.03, 0.001) }]
model = KernelRidge(kernel = 'laplacian')
KRR_grid_search = grid_search(model, parameters, X, Y)
report_grid_search(KRR_grid_search, parameters, show_number = 5)

best_socres = 0.481 
best_parameters = {'alpha': 0.07, 'gamma': 0.02299999999999999}




Unnamed: 0,means,stds,params
153,0.480595,0.070253,"{'alpha': 0.07, 'gamma': 0.02299999999999999}"
152,0.480572,0.070286,"{'alpha': 0.07, 'gamma': 0.021999999999999992}"
154,0.480565,0.070217,"{'alpha': 0.07, 'gamma': 0.023999999999999987}"
130,0.480523,0.069365,"{'alpha': 0.06, 'gamma': 0.01999999999999999}"
176,0.48051,0.071138,"{'alpha': 0.08, 'gamma': 0.02599999999999999}"


### Fine Tuned Parameters for Random Forest Regression Model (RFR)
- **n_estimators**: The number of trees in the forest.
- **max_depth**: The maximum depth of the tree. 
- **min_samples_split**: The minimum number of samples required to split an internal node.
- **url**: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html 

In [39]:
parameters = {'max_depth': [10], 'n_estimators': [1000]}
model = RandomForestRegressor(random_state = 3)
RFG_grid_search = grid_search(model, parameters, X, Y)
report_grid_search(RFG_grid_search, parameters, show_number = 20)

best_socres = 0.482 
best_parameters = {'max_depth': 10, 'n_estimators': 1000}


Unnamed: 0,means,stds,params
0,0.482233,0.103917,"{'max_depth': 10, 'n_estimators': 1000}"


## Ensemble Learning ##
- **Bagging:** Average the outputs of above four model to produce the prediction
    - **url:** https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
- **Stacking:** combine above four models by training a linear regression model

## Define Functions For Ensemble Learning ##

In [60]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble.voting import VotingRegressor

def model_predictions(models, data):
    X_test, y_test, X_train, y_train = data
    y_preds = []
    for model in models:
        model.fit(X_train, y_train) 
        y_pred = model.predict(X_test)
        # to handle the dimensional issue for KRR
        y_pred = y_pred.reshape(len(y_pred))
        y_preds.append(y_pred)
    return y_preds

def avg_model_prediction(models, data):
    y_preds = model_predictions(models, data)
    y_preds = np.array([y_preds]).squeeze(axis = 0)
    y_preds = y_preds.mean(axis = 0)
    return y_preds

def split_test_train(Data, indexes):
    X, Y = Data
    train_index, test_index = indexes
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
    return (X_test, y_test, X_train, y_train)

def evalute_model_score(model):
    scorer = make_scorer(r2_score, greater_is_better = True)
    scores = cross_val_score(model, X, Y, cv = 5, scoring = scorer) 
    return scores

def ensemble_bagging_R2(models, Data, k_fold = 5):
    R_squares = []
    kf = KFold(n_splits = k_fold)
    for train_index, test_index in kf.split(X):
        split_data = split_test_train(Data, (train_index, test_index))
        y_pred = avg_model_prediction(models, split_data)
        y_test = split_data[1]
        R_squares.append(r2_score(y_test, y_pred))
    return np.array([R_squares]).squeeze(axis=0)


def ensemble_stacking_R2(models, Data, k_fold = 5):
    R_squares = []
    kf = KFold(n_splits = k_fold)
    for train_index, test_index in kf.split(X):
        split_data = split_test_train(Data, (train_index, test_index))
        y_pred = avg_model_prediction(models, split_data)
        y_test = split_data[1]
        R_squares.append(r2_score(y_test, y_pred))
    return np.array([R_squares]).squeeze(axis=0)

## Define Models ##

In [57]:
SVR_model = SVR(kernel = 'rbf', C = 5, epsilon = 0.2, gamma = 0.03)
KRR_model = KernelRidge(kernel = 'laplacian', alpha = 0.07, gamma = 0.03)
RFG_model = RandomForestRegressor(max_depth = 10, min_samples_split= 2, n_estimators = 50, random_state = 3)
GBR_model = GradientBoostingRegressor(learning_rate = 0.1, loss = 'lad', max_depth = 30, min_samples_split = 6, n_estimators = 1000, random_state = 3)
models = [SVR_model, KRR_model, GBR_model, RFG_model]

## Evaluate Perfomance of Each Model ##

In [62]:
model_name = ['Support Vector Regression', 'Kernel Ridge Regression'
              , 'Random Forest Regression', 'Gradient Boosting Regression']
for i, model in enumerate(models):
    scores = evalute_model_score(model)
    print('The R2 of %s model is %.2f with std %.2f' %(model_name[i] ,scores.mean(), scores.std()))

The R2 of Support Vector Regression model is 0.41 with std 0.15
The R2 of Kernel Ridge Regression model is 0.48 with std 0.07
The R2 of Random Forest Regression model is 0.46 with std 0.10
The R2 of Gradient Boosting Regression model is 0.48 with std 0.12


## Bagging ##

In [63]:
Data = (X, Y)
models = [SVR_model, KRR_model, GBR_model, RFG_model]
bagging_scores = ensemble_bagging_R2(models, Data, k_fold = 5)
print('the R2 of bagging method is %.2f with std %.2f' %(bagging_scores.mean(), bagging_scores.std())) 

the R2 of bagging method is 0.51 with std 0.11


## Stacking ##

In [None]:
Data = (X, Y)
models = [SVR_model, KRR_model, GBR_model, RFG_model]
stacking_scores = ensemble_stacking_R2(models, stack_model, Data, k_fold = 5)
print('the R2 of stacking method is %.2f with std %.2f' %(stacking_scores.mean(), stacking_scores.std())) 