In [1]:
import os
import subprocess

import pandas as pd
from pymatgen.core import Composition

In [2]:
##PUL (PU learning) takes its input as pymatgen-formatted composition strings.
##Because it does not depend on any prior calculations,
##it can be run directly on the full list of SMACT-allowed compositions.

In [3]:
##Capture the current working directory at the time this cell runs.
##NOTE(review): no other visible cell reads working_dir -- confirm it is still needed.
working_dir = os.getcwd()

In [4]:
##Load the SMACT-allowed compositions, one table per anion family.
##header=None: the files are read without a header row, so the columns
##receive the integer labels 0, 1, 2 that the later cells rely on.
(
    df_oxide_smact_low,
    df_sulfide_smact_low,
    df_selenide_smact_low,
    df_telluride_smact_low,
) = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        '../../data/SMACT/smact_allowed_oxide_comps_low.csv',
        '../../data/SMACT/smact_allowed_sulfide_comps_low.csv',
        '../../data/SMACT/smact_allowed_selenide_comps_low.csv',
        '../../data/SMACT/smact_allowed_telluride_comps_low.csv',
    )
)

In [5]:
def smact_formula(df):
    """Turn a table of element triples into a table of AB2X4 formula strings.

    For every row the values of columns ``0``, ``1`` and ``2`` are joined as
    ``<col0><col1>2<col2>4`` (e.g. ``'Ag', 'Al', 'O'`` -> ``'AgAl2O4'``).

    Args:
        df: DataFrame holding element symbols (strings) in the columns
            labelled 0, 1 and 2. Any additional columns are carried over.

    Returns:
        A new DataFrame without columns 0, 1 and 2 and with a ``'Formula'``
        column appended. The input frame is left untouched, so the function
        can be called repeatedly on the same raw table.

    Raises:
        KeyError: if column 0, 1 or 2 is absent.
        ValueError: if any of the three element columns contains a missing
            value (string concatenation would otherwise yield NaN formulas).
    """
    symbols = df[[0, 1, 2]]
    if symbols.isna().any().any():
        raise ValueError("element columns 0, 1 and 2 must not contain missing values")

    ##Column-wise string concatenation: same result as a row-wise apply,
    ##but it also yields a proper (empty) column for an empty table.
    formula = symbols[0] + symbols[1] + '2' + symbols[2] + '4'
    return df.drop(columns=[0, 1, 2]).assign(Formula=formula)

##Rebind every anion table to the frame returned by smact_formula,
##i.e. the version that carries the 'Formula' column.
(
    df_oxide_smact_low,
    df_sulfide_smact_low,
    df_selenide_smact_low,
    df_telluride_smact_low,
) = map(
    smact_formula,
    (
        df_oxide_smact_low,
        df_sulfide_smact_low,
        df_selenide_smact_low,
        df_telluride_smact_low,
    ),
)

In [6]:
##Store pymatgen's formula string for every entry
##(e.g. 'AgAl2O4' becomes 'Al2 Ag1 O4').
##Series.map visits only the 'Formula' column, so no per-row Series has to be
##built as with DataFrame.apply(axis=1), and an empty table still gets a column.
for smact_frame in (
    df_oxide_smact_low,
    df_sulfide_smact_low,
    df_selenide_smact_low,
    df_telluride_smact_low,
):
    smact_frame['PMG_Composition'] = smact_frame['Formula'].map(
        lambda formula: Composition(formula).formula
    )

In [7]:
##Preview the oxide table. Entries whose first two elements coincide collapse
##when parsed: 'AgAg2O4' is reported as 'Ag3 O4' in the output below.
df_oxide_smact_low

Unnamed: 0,Formula,PMG_Composition
0,AgAg2O4,Ag3 O4
1,AgAl2O4,Al2 Ag1 O4
2,AgAs2O4,Ag1 As2 O4
3,AgAu2O4,Ag1 Au2 O4
4,AgB2O4,Ag1 B2 O4
...,...,...
5052,ZrW2O4,Zr1 W2 O4
5053,ZrY2O4,Y2 Zr1 O4
5054,ZrYb2O4,Yb2 Zr1 O4
5055,ZrZn2O4,Zr1 Zn2 O4


In [8]:
##Shape every table into the input layout used for the PUL calculations:
##add a constant 'CLscore' column and remove 'Formula', which PUL does not need.
for smact_frame in (
    df_oxide_smact_low,
    df_sulfide_smact_low,
    df_selenide_smact_low,
    df_telluride_smact_low,
):
    smact_frame['CLscore'] = 0
    ##errors='ignore' keeps the cell re-runnable: on a second execution
    ##'Formula' is already gone and a plain drop would raise KeyError.
    smact_frame.drop(columns=['Formula'], inplace=True, errors='ignore')

In [9]:
##Preview the oxide table after 'CLscore' was added and 'Formula' was dropped.
df_oxide_smact_low

Unnamed: 0,PMG_Composition,CLscore
0,Ag3 O4,0
1,Al2 Ag1 O4,0
2,Ag1 As2 O4,0
3,Ag1 Au2 O4,0
4,Ag1 B2 O4,0
...,...,...
5052,Zr1 W2 O4,0
5053,Y2 Zr1 O4,0
5054,Yb2 Zr1 O4,0
5055,Zr1 Zn2 O4,0


In [10]:
##Save to CSV files for PUL calculations.
##The target folder is created first so that a fresh checkout does not fail
##with a missing-directory error on the first write.
os.makedirs('data_for_running', exist_ok=True)

##index=False / header=False: write the data rows only, without the index
##column and without a header line.
for smact_frame, csv_path in (
    (df_oxide_smact_low, 'data_for_running/smact_allowed_oxide_comps_for_PUL.csv'),
    (df_sulfide_smact_low, 'data_for_running/smact_allowed_sulfide_comps_for_PUL.csv'),
    (df_selenide_smact_low, 'data_for_running/smact_allowed_selenide_comps_for_PUL.csv'),
    (df_telluride_smact_low, 'data_for_running/smact_allowed_telluride_comps_for_PUL.csv'),
):
    smact_frame.to_csv(csv_path, index=False, header=False)

In [None]:
##To run PUL calculations, you need to download the PUL model from https://github.com/kaist-amsg/Synthesizability-stoi-CGNF
##and then use the CSV files saved above as input.
##NOTE(review): this cell assumes predict_PU_learning.py, the embedding JSON and
##./models are reachable from the current working directory -- confirm before running.
##In some cases, you might need to split the input files into smaller chunks to avoid issues.
for csv_file in (
    'data_for_running/smact_allowed_oxide_comps_for_PUL.csv',
    'data_for_running/smact_allowed_sulfide_comps_for_PUL.csv',
    'data_for_running/smact_allowed_selenide_comps_for_PUL.csv',
    'data_for_running/smact_allowed_telluride_comps_for_PUL.csv',
):
    ##Argument list instead of a shell string: the path is passed verbatim.
    ##check=True raises CalledProcessError when a prediction run fails,
    ##instead of silently discarding the exit status.
    subprocess.run(
        [
            'python', 'predict_PU_learning.py',
            '--bag', '100',
            '--data', csv_file,
            '--embedding', 'cgcnn_hd_rcut4_nn8.element_embedding.json',
            '--modeldir', './models',
        ],
        check=True,
    )