## import modules

In [1]:
from compname import ChemFormula
from feature import CreateFeature
from mlops import WrapperMethod
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt

## Creating features

In [2]:
# loading csv
path = "./data/learning.csv"
df = pd.read_csv(path)
comp, sigma = df["X"].values, df["Y"].values

# creating moler ratio
path = "./data/atom.csv"
cn = ChemFormula(path)

# all, only F, not F
all_molratio = cn.get_molratio(comp)
f_molratio = cn.get_molratio(comp, obj_atoms=["F"])
cation_molratio = cn.get_molratio(comp, exc_atoms=["F"])

# creating molratio(dict)
dict_molratio = {
    "All" : all_molratio,
    "F" : f_molratio,
    "Cation" : cation_molratio
    }

# creating features of composition
cf = CreateFeature()

# get average features
dict_feature = cf.get_ave_features(dict_molratio)

# adding std features
dict_feature.update(cf.get_std_features(dict_molratio, exc=["F", "All"]))

# adding max features
dict_feature.update(cf.get_max_features(dict_molratio, exc=["F"]))

# adding min features
dict_feature.update(cf.get_min_features(dict_molratio, exc=["F"]))

# creating a new feature
dict_feature["Cation(Ave)/F(Ave)"] = dict_feature["Cation(Ave)"] / dict_feature["F(Ave)"]
dict_feature["All(Max)/F(Ave)"] = dict_feature["All(Max)"] / dict_feature["F(Ave)"]
dict_feature["All(Min)/F(Ave)"] = dict_feature["All(Min)"] / dict_feature["F(Ave)"]

# deleting a feature
del dict_feature["All(Max)"], dict_feature["All(Min)"], dict_feature["F(Ave)"]

# creating dataframe of learnning
df = cf.get_df_learning(dict_feature, comp)

# feature engneering using Wrapper Method
wm = WrapperMethod(X = df.values, y=sigma, feature_names=df.columns.values)
# Lasso
hyper_params = {
    'alpha':[0, 0.2, 0.4, 0.6, 0.8,1.0]
}
model = Lasso()
best_params_idx, cv_scores, train_scores, test_scores, best_model = wm.calc_forward(model, hyper_params)

100%|██████████| 146/146 [00:04<00:00, 32.47it/s]
100%|██████████| 146/146 [00:04<00:00, 32.03it/s]
100%|██████████| 146/146 [00:04<00:00, 31.22it/s]
100%|██████████| 146/146 [00:04<00:00, 31.42it/s]
100%|██████████| 146/146 [00:04<00:00, 30.09it/s]
100%|██████████| 146/146 [00:05<00:00, 29.12it/s]
100%|██████████| 146/146 [00:05<00:00, 27.77it/s]
100%|██████████| 146/146 [00:05<00:00, 28.77it/s]
100%|██████████| 146/146 [00:05<00:00, 28.94it/s]
100%|██████████| 146/146 [00:05<00:00, 27.16it/s]
100%|██████████| 146/146 [00:05<00:00, 28.52it/s]
100%|██████████| 146/146 [00:05<00:00, 27.50it/s]
100%|██████████| 146/146 [00:05<00:00, 26.83it/s]
100%|██████████| 146/146 [00:05<00:00, 27.02it/s]
100%|██████████| 146/146 [00:05<00:00, 26.59it/s]
100%|██████████| 146/146 [00:05<00:00, 26.46it/s]
100%|██████████| 146/146 [00:05<00:00, 25.81it/s]
100%|██████████| 146/146 [00:05<00:00, 26.39it/s]
100%|██████████| 146/146 [00:05<00:00, 26.27it/s]
100%|██████████| 146/146 [00:05<00:00, 26.01it/s]


In [3]:
df.columns.values[best_params_idx]

array(['Cation(Ave)@polar', 'Cation(Ave)@ea', 'All(Min)/F(Ave)@MDS_2',
       'Cation(Max)@melt_p', 'Cation(Max)@MDS_1',
       'Cation(Ave)/F(Ave)@MDS_3', 'All(Min)/F(Ave)@Cp',
       'Cation(Max)@vale_e_num', 'All(Ave)@polar', 'Cation(Std)@boil_p',
       'Cation(Ave)@MDS_1', 'Cation(Ave)@melt_p', 'Cation(Ave)@MDS_2',
       'Cation(Std)@ion_rad', 'Cation(Std)@valence',
       'All(Max)/F(Ave)@polar', 'All(Max)/F(Ave)@MDS_1', 'Cation(Ave)@ie',
       'All(Min)/F(Ave)@polar', 'Cation(Std)@MDS_3',
       'Cation(Ave)/F(Ave)@Cp', 'Cation(Ave)@rd', 'Cation(Ave)@en',
       'Cation(Std)@at_num', 'Cation(Std)@at_wt', 'Cation(Max)@MDS_2',
       'Cation(Max)@polar', 'All(Ave)@ion_rad', 'Cation(Ave)@rco',
       'Cation(Min)@ea-ion_e', 'Cation(Ave)@MDS_3',
       'Cation(Ave)/F(Ave)@ea', 'Cation(Ave)/F(Ave)@MDS_2'], dtype=object)

In [4]:
import pickle
pickle.dump(best_model, open(".data/lasso.sav", 'wb'))

FileNotFoundError: [Errno 2] No such file or directory: '.data/lasso.sav'