In [1]:
from approx.approximate import ApprOXimate
from approx.feature_engineering import MaterialFeatureExtractor
from mendeleev.fetch import fetch_table

In [2]:
import pandas as pd

# Load the raw crystal-structure table from a CSV in the current working directory.
# NOTE(review): the displayed output shows '-' in the v(A), v(B) and τ columns;
# presumably a missing-value placeholder — confirm before using those columns numerically.
df = pd.read_csv('Crystal_structure.csv')

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Compound,A,B,In literature,v(A),v(B),r(AXII)(Å),r(AVI)(Å),r(BVI)(Å),EN(A),EN(B),l(A-O)(Å),l(B-O)(Å),ΔENR,tG,τ,μ,Lowest distortion
0,Ac2O3,Ac,Ac,False,0,0,1.12,1.12,1.12,1.10,1.10,0.00000,0.000000,-3.248000,0.707107,-,0.800000,cubic
1,AcAgO3,Ac,Ag,False,0,0,1.12,1.12,0.95,1.10,1.93,0.00000,2.488353,-2.565071,0.758259,-,0.678571,orthorhombic
2,AcAlO3,Ac,Al,False,0,0,1.12,1.12,0.54,1.10,1.61,0.00000,1.892894,-1.846714,0.918510,-,0.385714,cubic
3,AcAsO3,Ac,As,False,0,0,1.12,1.12,0.52,1.10,2.18,0.00000,1.932227,-1.577429,0.928078,-,0.371429,orthorhombic
4,AcAuO3,Ac,Au,False,0,0,1.12,1.12,0.93,1.10,2.54,0.00000,2.313698,-2.279786,0.764768,-,0.664286,orthorhombic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5324,ZrWO3,Zr,W,False,1,5,0.89,0.72,0.62,1.33,2.36,2.38342,1.745600,-1.572214,0.801621,5.228952455,0.442857,cubic
5325,ZrYO3,Zr,Y,False,-,-,0.89,0.72,0.90,1.33,1.22,2.38342,2.235124,-2.489571,0.704032,-,0.642857,cubic
5326,ZrYbO3,Zr,Yb,False,-,-,0.89,0.72,0.95,1.33,1.10,2.38342,2.223981,-2.626821,0.689053,-,0.678571,orthorhombic
5327,ZrZnO3,Zr,Zn,False,-,-,0.89,0.72,0.74,1.33,1.65,2.38342,2.096141,-2.035750,0.756670,-,0.528571,cubic


# Data Cleaning

In [3]:
# Keep only the formula column and the 'Lowest distortion' column.
# This rebinds `df`, so the other raw columns are no longer reachable
# without re-running the loading cell.
df = df[['Compound', 'Lowest distortion']]
df

Unnamed: 0,Compound,Lowest distortion
0,Ac2O3,cubic
1,AcAgO3,orthorhombic
2,AcAlO3,cubic
3,AcAsO3,orthorhombic
4,AcAuO3,orthorhombic
...,...,...
5324,ZrWO3,cubic
5325,ZrYO3,cubic
5326,ZrYbO3,orthorhombic
5327,ZrZnO3,cubic


# Feature Engineering

In [5]:
# Disabled single-formula check of the extractor, kept for reference only.
# NOTE(review): this snippet passes mode="both", whereas the full featurization
# run further down passes mode="all" — confirm which mode is intended.
# approx = ApprOXimate()
# ptable = fetch_table("elements")

# extractor = MaterialFeatureExtractor(approx, ptable, mode="both")
# extractor.get_features("Li2MnO3")

In [6]:
from tqdm import tqdm

def featurize_df(df, extractor, formula_col="Compound"):
    """Append extractor-generated features to ``df``, one row per formula.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the formulas to featurize. It is not modified.
    extractor : object
        Any object exposing ``get_features(formula)``. Each returned value
        becomes one row of the feature table (presumably a mapping of
        feature name -> value; verify against the extractor in use).
    formula_col : str, default "Compound"
        Name of the column in ``df`` holding the formula strings.

    Returns
    -------
    pandas.DataFrame
        A new frame: the columns of ``df`` followed by the feature columns,
        with the same index as ``df``.
    """
    feature_rows = [
        extractor.get_features(formula)
        for formula in tqdm(df[formula_col], desc="Feat compounds")
    ]

    # Build the feature table on the caller's index. ``DataFrame.join`` aligns
    # on index labels, so a default 0..n-1 index here would attach features to
    # the wrong rows (or produce NaN) whenever ``df`` has been filtered,
    # sampled, or otherwise lacks a default RangeIndex.
    feat_df = pd.DataFrame(feature_rows, index=df.index)
    return df.join(feat_df)


# Initialize modules
# The extractor is built from an ApprOXimate instance and the "elements" table
# fetched via mendeleev; mode="all" presumably corresponds to the `all_`-prefixed
# feature columns seen in the output — TODO confirm against MaterialFeatureExtractor.
approx = ApprOXimate()
ptable = fetch_table("elements")
extractor = MaterialFeatureExtractor(approx, ptable, mode="all")

# Run featurization
# NOTE: the recorded run of this cell took about 44 minutes for 5329 compounds
# (~2 it/s), and the result only lives in kernel memory until it is written out.
df_feat = featurize_df(df, extractor)
df_feat.head()

Feat compounds: 100%|██████████| 5329/5329 [44:20<00:00,  2.00it/s]  


Unnamed: 0,Compound,Lowest distortion,formula,all_valence_s_sum,all_valence_s_avg,all_valence_s_dev,all_valence_s_min,all_valence_s_max,all_valence_s_range,all_valence_s_mode,...,all_gordy_en_max,all_gordy_en_range,all_gordy_en_mode,all_mb_en_sum,all_mb_en_avg,all_mb_en_dev,all_mb_en_min,all_mb_en_max,all_mb_en_range,all_mb_en_mode
0,Ac2O3,cubic,Ac2O3,6.0,1.2,0.979796,0.0,2.0,2.0,0.0,...,0.631476,0.590432,0.631476,0.142857,0.028571,0.023328,0.0,0.047619,0.047619,0.0
1,AcAgO3,orthorhombic,AcAgO3,6.0,1.2,0.979796,0.0,2.0,2.0,0.0,...,0.631476,0.590432,0.631476,0.22619,0.045238,0.026513,0.0,0.083333,0.083333,0.0
2,AcAlO3,cubic,AcAlO3,6.0,1.2,0.979796,0.0,2.0,2.0,0.0,...,0.631476,0.590432,0.631476,0.142857,0.028571,0.023328,0.0,0.047619,0.047619,0.0
3,AcAsO3,orthorhombic,AcAsO3,8.0,1.6,0.8,0.0,2.0,2.0,2.0,...,0.631476,0.590432,0.631476,0.309524,0.061905,0.055533,0.0,0.166667,0.166667,0.0
4,AcAuO3,orthorhombic,AcAuO3,6.0,1.2,0.979796,0.0,2.0,2.0,0.0,...,0.7082,0.667156,0.631476,0.223665,0.044733,0.025797,0.0,0.080808,0.080808,0.0


In [None]:
# Persist the featurized table to CSV in the current working directory;
# index=False leaves the DataFrame index out of the file.
df_feat.to_csv('featurised_data.csv', index=False)

: 