In [1]:
import numpy as np
import itertools
import smact
from smact.screening import pauling_test
import signal

class TimeoutException(Exception):
    """Error raised by ``timeout_handler`` to abort a call that ran too long."""
  
def timeout_handler(signum, frame):
    """Signal handler that interrupts the running call with a ``TimeoutException``.

    Args:
        signum: signal number supplied by the ``signal`` machinery (unused).
        frame: interrupted stack frame supplied by the ``signal`` machinery (unused).

    Raises:
        TimeoutException: always.
    """
    error = TimeoutException("Function execution timed out")
    raise error

def timeout(seconds):
    """Build a decorator that aborts the decorated call after ``seconds`` seconds.

    The limit is enforced by arming ``signal.alarm`` with ``timeout_handler``
    installed for ``SIGALRM``; when the alarm fires, the handler's exception
    propagates out of the decorated call.

    Args:
        seconds: value passed straight to ``signal.alarm``.

    Returns:
        A decorator. The decorated function keeps the name and docstring of the
        original and returns whatever the original returns.
    """
    import functools  # local import so this block does not depend on the import cell

    def decorator(func):
        @functools.wraps(func)  # preserve __name__/__doc__ of the wrapped function
        def wrapper(*args, **kwargs):
            # signal.signal returns the handler that was active before; keep it
            # so the process-wide SIGALRM disposition can be put back afterwards.
            previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
            try:
                signal.alarm(seconds)
                return func(*args, **kwargs)
            finally:
                signal.alarm(0)  # Disable the alarm before touching the handler
                # None means the old handler was not installed from Python and
                # cannot be re-installed through signal.signal.
                if previous_handler is not None:
                    signal.signal(signal.SIGALRM, previous_handler)
        return wrapper
    return decorator

@timeout(1)
def smact_validity(
    comp: tuple[int, ...] | tuple[str, ...],
    count: tuple[int, ...],
    use_pauling_test: bool = True,
    include_alloys: bool = True,
    include_cutoff: bool = False,
    use_element_symbol: bool = False,
) -> bool:
    """Computes SMACT validity.

    The search stops at the first oxidation-state combination that is charge
    neutral and (optionally) passes the electronegativity test, so a valid
    composition is reported without enumerating the remaining combinations.

    Args:
        comp: Tuple of atomic number or element names of elements in a crystal.
        count: Tuple of counts of elements in a crystal.
        use_pauling_test: Whether to use electronegativity test. That is, at least in one
            combination of oxidation states, the more positive the oxidation state of a site,
            the lower the electronegativity of the element for all pairs of sites.
        include_alloys: if True, returns True without checking charge balance or electronegativity
            if the crystal is an alloy (consisting only of metals) (default: True).
        include_cutoff: assumes valid crystal if the combination of oxidation states is more
            than 10^6 (default: False).
        use_element_symbol: if True, ``comp`` is used as element symbols directly; otherwise
            every entry is converted with ``get_element_symbol`` (default: False).

    Returns:
        True if the crystal is valid, False otherwise.

    Raises:
        AssertionError: if ``comp`` and ``count`` have different lengths.
    """
    assert len(comp) == len(count)
    if use_element_symbol:
        elem_symbols = comp
    else:
        # NOTE(review): get_element_symbol is not defined in the visible part of this
        # notebook; this branch needs it to be in scope -- confirm before relying on it.
        elem_symbols = tuple([get_element_symbol(Z=elem) for elem in comp])  # type:ignore
    space = smact.element_dictionary(elem_symbols)
    # Look every element up by its symbol so the per-element lists below are
    # guaranteed to line up position-by-position with ``count``.
    smact_elems = [space[symbol] for symbol in elem_symbols]
    electronegs = [e.pauling_eneg for e in smact_elems]
    ox_combos = [e.oxidation_states for e in smact_elems]
    if len(set(elem_symbols)) == 1:
        return True
    if include_alloys and all(elem_s in smact.metals for elem_s in elem_symbols):
        return True

    threshold = np.max(count)
    n_comb = np.prod([len(ls) for ls in ox_combos])
    # If the number of possible combinations is big, it'd take too much time to run the smact checker
    # In this case, we assume that at least one of the combinations is valid
    if n_comb > 1e6 and include_cutoff:
        return True

    # The stoichiometry does not depend on the oxidation states: build it once.
    stoichs = [(c,) for c in count]
    for ox_states in itertools.product(*ox_combos):
        # Test for charge balance
        cn_e, cn_r = smact.neutral_ratios(ox_states, stoichs=stoichs, threshold=threshold)
        if not cn_e or len(cn_r) == 0:
            continue
        # Electronegativity test
        if use_pauling_test:
            try:
                electroneg_OK = pauling_test(ox_states, electronegs)
            except TypeError:
                # if no electronegativity data, assume it is okay
                electroneg_OK = True
        else:
            electroneg_OK = True
        if electroneg_OK:
            # One admissible combination is enough; no need to collect the rest.
            return True
    return False


from collections import Counter
from tqdm import tqdm
import numpy as np

def check_validity(data):
    """Count how many structures pass and how many fail ``smact_validity``.

    Args:
        data: iterable of structures; each structure is an iterable of element
            symbols with one entry per atom.

    Returns:
        ``(success, fail)``. A structure counts as a failure when
        ``smact_validity`` returns a falsy value or when any exception is raised
        while reducing the counts or inside ``smact_validity``.
    """
    n_valid = 0
    n_invalid = 0
    for structure in tqdm(data):
        tally = Counter(structure)
        symbols = tuple(tally.keys())
        raw_counts = np.array(list(tally.values()))
        try:
            # Divide by the greatest common divisor to get the reduced formula.
            reduced = (raw_counts / np.gcd.reduce(raw_counts)).astype(int)
            is_valid = smact_validity(symbols, tuple(reduced), use_element_symbol=True)
        except Exception:
            # Best effort: every error is counted as a failed structure.
            is_valid = False
        if is_valid:
            n_valid += 1
        else:
            n_invalid += 1
    return n_valid, n_invalid

In [18]:
# Unconditioned

import pickle as pkl
import re

files = [
    "/sfmdataeastus2/nlm/zekun/instruct/base1b/uncondition_summary_1w/1/material.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base8b/uncondition_summary_1w_1w/1/material.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/8x7b/uncondition_summary_1w/1/material.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/uncondition/0.75_material.txt",
]

# Pattern for the pickled generations: "<material>El El ...<sgN></material>".
pickle_pattern = r"<material> ?([A-Z][a-z]? ?)+<sg\d+> ?</material>"
# Pattern for plain-text dumps.
# NOTE(review): assumes the .txt dump uses the "<i>El <i>El ... <sgN> </material>"
# layout that the other vllm/uncondition text files in this notebook have -- TODO confirm.
text_pattern = r"(<i>[A-Z][a-z]? ?)+<sg\d+> ?</material>"

for path in files:
    print(path)
    if path.endswith(".txt"):
        # A text dump is not a pickle: read it line by line instead of unpickling it.
        with open(path, "r") as fh:
            data = fh.readlines()
        pattern = text_pattern
    else:
        # pickle.load can execute arbitrary code; only load trusted files.
        with open(path, "rb") as fh:
            data = pkl.load(fh)
        pattern = pickle_pattern
    total = len(data)
    if total == 0:
        print("no generations found, skipping")
        continue
    print(data[0])
    complete_structures = []
    for d in data:
        match = re.search(pattern, d)
        if match:
            # Element symbols, one per atom, taken from the matched material string only.
            complete_structures.append(re.findall(r"[A-Z][a-z]?", match.group(0)))
    com_total = len(complete_structures)
    print(f"complete data: {com_total}/{total}={com_total/total:.4f}")
    success, fail = check_validity(complete_structures)
    print(f"Success: {success}, Fail: {fail}, Total: {total}")
    # No generation may have matched the pattern: avoid dividing by zero.
    com_rate = success / com_total if com_total else float("nan")
    print(f"Success rate/com: {success}/{com_total}={com_rate:.4f}")
    print(f"Success rate/all: {success}/{total}={success/total:.4f}")


/sfmdataeastus2/nlm/zekun/instruct/base1b/uncondition_summary_1w/1/material.pkl


<material>MgMgMgCoOOOOOOOOSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS<sg1></material>
complete data: 33180/54488=0.6089


100%|██████████| 33180/33180 [04:12<00:00, 131.34it/s]


Success: 16326, Fail: 16854, Total: 54488
Success rate/com: 16326/33180=0.4920
Success rate/all: 16326/54488=0.2996
/sfmdataeastus2/nlm/zekun/instruct/base8b/uncondition_summary_1w_1w/1/material.pkl
<material>CrCrCrCrCr<sg225></material>.Thismaterialexhibitsaformationenergyperatomof-0.1174.Thismaterialhasanenergyabovehullof0.0152.Thematerialhasatotalnormalizedmagnetizationvolumeof0.01975.BelongingtothespacegroupFm-3m,thematerialhasauniquestructure.
complete data: 9323/12421=0.7506


100%|██████████| 9323/9323 [00:59<00:00, 156.16it/s]


Success: 5921, Fail: 3402, Total: 12421
Success rate/com: 5921/9323=0.6351
Success rate/all: 5921/12421=0.4767
/sfmdataeastus2/nlm/zekun/instruct/8x7b/uncondition_summary_1w/1/material.pkl
<material>CuCuOOO<sg71></material>.Thematerialiscomprisedoftheelementscopper,chromium,lithium,andoxygen.ThespacegroupImmmhousesthismaterial'sstructure.Thetotalnormalizedmagnetizationvolumeofthismaterialis0.003641.Anoteworthyfeatureofthismaterialisitsenergyabovehullof2.0706.
complete data: 11999/13833=0.8674


100%|██████████| 11999/11999 [00:05<00:00, 2105.20it/s]

Success: 7290, Fail: 4709, Total: 13833
Success rate/com: 7290/11999=0.6076
Success rate/all: 7290/13833=0.5270





In [20]:
# Unconditioned

import pickle as pkl
import re

files = [
    "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/uncondition/1_material.txt",
    "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/uncondition/1_material_new.txt",
]


for path in files:
    print(path)
    # One generation per line; use a separate name for the handle so the
    # path stays available inside the loop body.
    with open(path, "r") as fh:
        data = fh.readlines()
    total = len(data)
    if total == 0:
        print("no generations found, skipping")
        continue
    print(data[0])
    complete_structures = []
    for d in data:
        match = re.search(r"(<i>[A-Z][a-z]? ?)+<sg\d+> ?</material>", d)
        if match:
            # Element symbols, one per atom, taken from the matched material string only.
            complete_structures.append(re.findall(r"[A-Z][a-z]?", match.group(0)))
    com_total = len(complete_structures)
    print(f"complete data: {com_total}/{total}={com_total/total:.4f}")
    success, fail = check_validity(complete_structures)
    print(f"Success: {success}, Fail: {fail}, Total: {total}")
    # No line may have matched the pattern: avoid dividing by zero.
    com_rate = success / com_total if com_total else float("nan")
    print(f"Success rate/com: {success}/{com_total}={com_rate:.4f}")
    print(f"Success rate/all: {success}/{total}={success/total:.4f}")


/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/uncondition/1_material.txt
<i>Ac <i>Ac <i>Br <i>Br <i>Br <i>I <sg164> </material>  . This material's formation energy per atom is recorded at -1.1818. In the realm of space groups, this material belongs to P -3 m 1. The material's total normalized magnetization volume measures at 3.207e-06.</s>

complete data: 6814/7729=0.8816


100%|██████████| 6814/6814 [00:03<00:00, 2116.55it/s]


Success: 4458, Fail: 2356, Total: 7729
Success rate/com: 4458/6814=0.6542
Success rate/all: 4458/7729=0.5768
/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/uncondition/1_material_new.txt
<i>Al <i>Fe <i>Fe <i>Ho <sg225> </material> </s>

complete data: 8775/10000=0.8775


100%|██████████| 8775/8775 [00:04<00:00, 1954.49it/s]

Success: 5798, Fail: 2977, Total: 10000
Success rate/com: 5798/8775=0.6607
Success rate/all: 5798/10000=0.5798





In [None]:
import re
import numpy as np
import matplotlib.pyplot as plt

def plot_lengths_distribution(lengths):
    """Print max/min/mean of ``lengths`` and show a 100-bin histogram of them.

    Args:
        lengths: sequence of numbers to summarise.

    Returns:
        None. With an empty ``lengths`` a short message is printed and nothing
        is plotted.
    """
    if len(lengths) == 0:
        # np.max / np.min raise ValueError on an empty sequence.
        print("No lengths to plot")
        return
    print(f"Max: {np.max(lengths)}, Min: {np.min(lengths)}, Mean: {np.mean(lengths)}")
    fig, ax = plt.subplots()
    ax.hist(lengths, bins=100)
    ax.set(xlabel="length", ylabel="count")
    plt.show()

# NOTE: this cell relies on `files`, `pkl` and `tqdm` defined by cells run earlier.
for path in files:
    # pickle.load can execute arbitrary code; only load trusted files.
    with open(path, "rb") as fh:
        data = pkl.load(fh)
    # Number of capitalised tokens found in each whole generation string.
    lengths = [len(re.findall(r"[A-Z][a-z]*", structure)) for structure in tqdm(data)]
    plot_lengths_distribution(lengths)

In [6]:
# NOTE: this cell relies on `files`, `pkl`, `re` and `tqdm` defined by cells run earlier.
max_tokens = 1024  # generations with more capitalised tokens than this are flagged
print_limit = 10   # flagged generations are printed while the running count is below this

for path in files:
    # pickle.load can execute arbitrary code; only load trusted files.
    with open(path, "rb") as fh:
        data = pkl.load(fh)
    flagged = 0
    for structure in tqdm(data):
        elements = re.findall(r"[A-Z][a-z]*", structure)
        if len(elements) > max_tokens:
            flagged += 1
            if flagged < print_limit:
                print(structure)

  5%|▌         | 1495/27364 [00:00<00:01, 14948.77it/s]

 67%|██████▋   | 18213/27364 [00:01<00:00, 15113.11it/s]

<material>LiLiLiLiLiLiLiLiLiLiLiLiLiLiLiLiLiPPPPPPPPPPPPPPPPPPPPPPPPPPPPPOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO<sg1></material>.O</DNA>3′</DNA>(GAACTGGAGTTGAGCCGAGAGGTTCCAGCGCAGCGCAGCGCTCTTTTACAGTGCAGAGAGGGCAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGCCGGAGTTGAGC

100%|██████████| 27364/27364 [00:01<00:00, 15052.80it/s]
 17%|█▋        | 9277/54488 [00:00<00:01, 30871.59it/s]

<material>LuAlAlAlAlAlAlAlAlAlAlAlAlAlAlAlAlAlCoOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO<sg2></material>isthe</GENESGRGACGc=<GENE>-G</GENE>-GGCAGCAGCAGGGGGGGGCGGGCGGAG-</GENE>G</GENE>GAGG</GENE>GAGC</GENE>GAGCAG</GENE>GAGCAG</GENE>GAGCAG</GENE>GAGCAG</GENE>GAGCAG</GENE>GAGCAG</GENE>GAGGAG</GENE>GAGCAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAGGAG</GENE>GAG

 50%|█████     | 27247/54488 [00:00<00:00, 29204.42it/s]

<material>BaBaBaBaBaBaBaBaBaBaBaBaBaBaBaBaBaBaBaBaBaBiBaBaCaLaReReReReReReReReReReReReReReReReReReReReReReReReReBrBrBrBrBrBrBrBrBrBrBrBrBrBrBrBrBrBrBrBrBrBrNOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO<sg1></material>.</DNA>C</DNA><DNA>GTTCTCGCTTCCTCCAAATCGCA</DNA>TGTTCGGTTCTCGCTCCATTTCCCCTTGCCAAAGGACCAGCTACTTATCGCGGCTTTATGACGCGGGCCCCTTTCACAGGTCCAGGGGAACATTGCCACTGCCTGCCTTTGGTTCTCGTGCC</DNA>TAAACTCTCACGGTTTTTTCGCTCCGGCTTCCTCATGCCTTCTCAGATTGCTGTCCATCGATGGAGGTGCAGGATGAGGTGGGTGTCTGTCCGGTTCTAGGTCCGAGACAGACTGGGTCTAAATTATCTCCTCGGTCTACACTCACCAGCAAGTTCCGGTTTTTTGAAACTTTTTGCAACACAGGGGGTCCCACCAGAGCCTCGGGCTTCCGATGGTGACGGATTACTCTGAATGGCATGGTTTTCTTCGCTAAGGAAAGGGCACCACGGGGTCAACACACCAGGGGAGACACCAGAAATTAAACCCGGGAGACATCCACATTACGGCTGTCCTTGTCATGGTTTCACGGTTTTTCACGGTTTTGGGTGGATCCATCCACCACGCCTGTGAGCAGGGGAGACACGACAGGGGGGTGTGGTTGGGTGTGTTTCACGGCGACGGTTTTGGGTCCTTGTCATGGTTTCACGGTTTTGGGACAGAGCCGTGAGACCCCTGGGTCTGGCGGAGGGTGACGGCTTTGGGGACACCTGTGAGCAGGGGGTGGG

 61%|██████    | 33237/54488 [00:01<00:00, 29408.99it/s]

<material>LiPPPPPPPPPPPPPPPPPPPPPOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO<sg9></material>.MATERIALSCMLCKHTSPLPSKRDFAVALGVCLFAVVLLFIALFVVRRCVPRHYGSDEVGSRQGGEEAVPSVPVSTSSSGSPSETTPLTGLSAPAVFSSLPPPPELGLDVKPLIVGEQASRPLRRAGPREREHHHHYPPVVQRYSVEQGGSLALGLGSPCVGEAGVGKGSRLPPRLGVEESPAFSFEDVCPVCGDKVSGYHYGLLTCESCKGFFKRTVQNKKAYTCIAEKSCQIDKTQRKRCPFCRFQKCLEVGMKLEAVRADRMRGGRNKFGPMYKRDRARKLQTMRRRQPRGGASFMLGDTLQDAASSSSTTNTGPSLPILGLDLGFMTPEPRAEEVARVLSWQPPDLSAAFPALGPKEVHLVVFQTLLKRTSLPAGQSTELIQRAAGTRGDFQAQAVMLMALGKVLALAAHGAGPVTLGDEAAGLAGGPPGMVGGSQEHQEGARLSLGSPGGRLRVPESVLASPPEADLTVSFPASFPGAPGPPEGPLVHRPSLRRTPRPGSPSRAAAPGAGASWGATLLGPALPQPADLQAGAGGPQARLPRAHGVRSAPRVARRGSWRGAEPPDSGRSAVLAARTEAHFSAAPLLRPTPAGPPAVRRSEGLSTVSAGRAAWRGSPPQLRAAGAAALLPDGALGGLLERVKAELAAEVPVGREAAGPAARGLLAAFVRPGPGTGAGSGQAAQLPGGQLQDGERGPLGERRLREAEARLLPAGAASTESPGPPLASALGRLQQSPLSAAPVAAPPPEQPVLPPGAGQQLLARSGLEELDELLPPPPAPPAETVGQAGLGAAAEAGEETPLEEEGGSPEAEEAAARPPRSCRGPLSQQAPGRQRLHRTVSVLPPTEATASLTATAGLSGGEPLTPALEGPRGPRKGPSPG

 78%|███████▊  | 42262/54488 [00:01<00:00, 29766.55it/s]

<material>MgNdNdNdOOOOOSr<sg12></material>.METHODS:Proteinsequence.TheFASTPSETFYFRCEGEAVHFAAGSGIGGKFGYIRTQASGNQYDTVQGYEQDSRGYSVAINGTITERFTFVAPSDYLDTRPIGTFFSERQTERLEQQLEGGAVLVVGAAGMGKSTLVRSLALQWAQGTAWFAHYGWTDEEEALSEDLPQVFQTLDDTALLRQMRATGQDRISLGHGAWAAVPAHFKAASAPALTRPVDQAKLRNQLVTTLAALQQSGRGWQAVLLVDALDEIAPDGLERLAALIRGFSKAGRNLRLLISTRPEQGTPHALAEAIDGEDAEEVFFLLPPTLPTRNRSRIGAAYRQALTTLPDGASISLHREEVVPQPLLVAALGLVRRFPKSTPFIQRAALLTLEVLFEARNKLGAGEHSSLGTLDELGLAIGAVHAHHPTHGARTSSADDLAITERIAALAADMLRILPRADAVSRAGLWSDGPPPNVSVDDRHAGLALRQLRDTVNTLLASDPDSETLDTWSRPVLDDHWPAEQADLALFAAVQGFPTDPRQLVRADQSYLYHQRYLSLYWLGHHAKQGNERLALGLLAALDNDELHRTAAIATTDDERDLASAAGGALAIGFDPAAPRHRLAAWAEAIAERPGTQQAPTAELIALTRRRPRDPWRETTTAYTAALSTSLSTTSTADGHHVVFALENARDARRGDAEEAPVPGLDADQLPVAVRYLVWHLDNHWLGLATGLGSWLRNTLSDPEVPSLLDLSHRRLTSLPRPGAEHVAALGIRGRRDLALGVASALERADDPDAAALPVSAAWRDRLRIALQDHPEFDHPRDSDEPDGESDPEEPDGESDPEEPDGESDPEEPDGESDPEEPDGESDPEEPDGESDPEEPDGESDPEEPDGESDPEEPDGESDPEEPDGESDPEEPDGESDPEEPDGESDPEEPDGESDPEEPDGESDPEEPDGESDPEEPDGESDPEEPD

100%|██████████| 54488/54488 [00:01<00:00, 29707.16it/s]


<material>HfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfHfScNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiNiP<sg1></material>.</DNA><DNA>TAAC</DNA><DNA>TTAAG</DNA><DNA>TCAC</DNA></DNA></DNA><DNA>ATGCACCTGAGAGGAAGA</DNA><DNA>CTAGA</DNA><DNA>CTACGAGTGT</DNA><DNA>TTAA</DNA><DNA>CTCAAGCCTTCCT</DNA></DNA><DNA>AGGCGTCCGGT</DNA><DNA>GTAGG</DNA><DNA>AGGTTTTGCAGC</DNA><DNA>GGTCTTCCTTGG</DNA><DNA>TTAAC</DNA><DNA>CTACGAGTGA</DNA><DNA>GGTACTTGCAGC</DNA><DNA>CTACGAGCTCC</DNA><DNA>AGGTTTTGCAGC</DNA><DNA>CTACGAGTCTCCTTGT</DNA><DNA>AGGACTTCACTTGG</DNA><DNA>GGTATATACAGTCGAGT</DNA><DNA>TTCATAC</DNA><DNA>CGT</DNA><DNA>CGGCGATCC</DNA><DNA>GGCGGTTCGTAT</DNA><DNA>CTCCTCCACTACAGTTC</DNA><DNA>GGATTTGGCTCG</DNA><DNA>ATGGACTTATCGG</DNA><DNA>GGCTGCGGAACTCA</DNA><DNA>GTATATACCAGCTTG</DNA><DNA>ACTCTGTCATACCGT</DNA><DNA>AGTGCTTCACCGTT</DNA><DNA>ATGGCTGCATAC</DNA><DNA>CTACGAGGTTTTTTG</DNA><DNA>TGTGAGAACTTCACCT</DNA><DNA>CTACT

100%|██████████| 50640/50640 [00:03<00:00, 15892.70it/s]
100%|██████████| 55953/55953 [00:03<00:00, 18409.49it/s]
100%|██████████| 6048/6048 [00:00<00:00, 16496.90it/s]
100%|██████████| 15302/15302 [00:00<00:00, 30191.66it/s]
100%|██████████| 13833/13833 [00:00<00:00, 71791.05it/s]


In [17]:
import pickle as pkl
import re


# Complete list of response pickles. It is kept for reference only: `files` is
# reassigned right below, so this list is never iterated.
files = [
    "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_18382/all/test.comp_to_material.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_18382/all/test.bandgap_to_mat.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_18382/all/test.comp_bgap_to_material.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.comp_to_material.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.comp_bgap_to_material.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_13130/all/test.comp_to_material.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_13130/all/test.bandgap_to_mat.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_13130/all/test.comp_bgap_to_material.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.comp_to_material.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.comp_bgap_to_material.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/mixtral/7755/test.comp_to_material.response.pkl.0.75"
]


# Files evaluated by the loop in this cell, grouped by task file name:
# comp_to_material, bandgap_to_mat, then bulk. Commented-out entries are skipped.
files = [
    "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.comp_to_material.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.comp_to_material.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/mixtral/7755/test.comp_to_material.response.pkl.0.75",

    "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bandgap_to_mat.response.pkl",

    #"/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.comp_bgap_to_material.tsv.response.pkl",
    #"/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.comp_bgap_to_material.tsv.response.pkl",

    "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bulk.response.pkl",

    #"/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/mixtral_2585/test.bandgap_to_mat.response.pkl",
    #"/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/mixtral_2585/test.bulk.response.pkl",
]


# NOTE: `itertools` and `check_validity` come from the first cell of the notebook.
for path in files:
    print(path)
    # pickle.load can execute arbitrary code; only load trusted files.
    with open(path, "rb") as fh:
        data = pkl.load(fh)
    if len(data) == 0:
        print("no records found, skipping")
        continue
    print(data[0])
    print(len(data[0][-1]))
    complete_structures = []
    total = 0
    for d in data:
        # d[0] is the instruction text; every later entry is a list of responses.
        for response in itertools.chain(*d[1:]):
            total += 1
            match = re.search(r"<material> ?(<i>[A-Z][a-z]? ?)+<sg\d+> ?</material>", response)
            if match:
                # Element symbols, one per atom, taken from the matched material string only.
                complete_structures.append(re.findall(r"[A-Z][a-z]?", match.group(0)))
    com_total = len(complete_structures)
    # Records may carry no responses, and no response may match: avoid dividing by zero.
    complete_rate = com_total / total if total else float("nan")
    print(f"complete data: {com_total}/{total}={complete_rate:.4f}")
    success, fail = check_validity(complete_structures)
    print(f"Success: {success}, Fail: {fail}, Total: {total}")
    com_rate = success / com_total if com_total else float("nan")
    all_rate = success / total if total else float("nan")
    print(f"Success rate/com: {success}/{com_total}={com_rate:.4f}")
    print(f"Success rate/all: {success}/{total}={all_rate:.4f}")
    

/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.comp_to_material.tsv.response.pkl
('Create a substance that includes V, Pb, O', ['<material><i>V<i>V<i>V<i>V<i>Pb<i>Pb<i>Pb<i>Pb<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg14></material>', '<material><i>V<i>V<i>V<i>V<i>V<i>V<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg11></material>', '<material><i>V<i>V<i>V<i>V<i>V<i>V<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg1></material>', '<material><i>V<i>V<i>V<i>V<i>V<i>V<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg11></material>'], ['<material><i>V<i>V<i>V<i>V<i>V<i>V<i>V<i>V<i>V<i>V<i>V<i>V

complete data: 13945/16480=0.8462


100%|██████████| 13945/13945 [00:09<00:00, 1518.57it/s]


Success: 11069, Fail: 2876, Total: 16480
Success rate/com: 11069/13945=0.7938
Success rate/all: 11069/16480=0.6717
/sfmdataeastus2/nlm/zekun/instruct/mixtral/7755/test.comp_to_material.response.pkl.0.75
('Design a material containing Tm, Os', ['<material> <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Os <i>Os <i>Os <i>Os <i>Os <i>C <i>C <i>C <i>C <i>C <i>C <i>C <i>C <i>C <i>C <i>C <i>C <i>C <i>C <i>C <sg6> </material>', '<material> <i>Tm <i>Tm <i>Os <i>Os <i>Os <i>Os <i>Os <i>Os <sg63> </material>', '<material> <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Os <i>Os <i>Os <i>Os <sg62> </material>', '<material> <i>Tm <i>Tm <i>Tm <i>Tm <i>Os <i>Os <i>Os <i>Os <i>Os <i>Os <i>Os <i>Os <sg63> </material>'], ['<material> <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Os <i>Os <i>Os <i>Os <sg62> </material>', '<material> <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Tm <i>Os <i>Os <i>Os

100%|██████████| 31425/31425 [00:33<00:00, 950.09it/s] 


Success: 25631, Fail: 5794, Total: 32960
Success rate/com: 25631/31425=0.8156
Success rate/all: 25631/32960=0.7776
/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.comp_to_material.tsv.response.pkl
('Create a substance that includes V, Pb, O', ['<material><i>V<i>V<i>V<i>V<i>Pb<i>Pb<i>Pb<i>Pb<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg88></material>', '<material><i>V<i>V<i>V<i>V<i>Pb<i>Pb<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg1></material>', '<material><i>V<i>V<i>V<i>V<i>V<i>V<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>Pb<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg1></material>', '<material><i>V<i>V<i>Pb<i>Pb<i>Pb<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg166></material>'], ['<material><i>V<i>V<i>V<i>V<i>Pb<i>Pb<i>Pb<i>Pb<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg88></material>', '<material><i>V<i>V<i>V<i>V<i>Pb<i>Pb<i>Pb<i>Pb<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg88></material>', '<material><i>V

100%|██████████| 15346/15346 [00:13<00:00, 1130.21it/s]


Success: 12792, Fail: 2554, Total: 16480
Success rate/com: 12792/15346=0.8336
Success rate/all: 12792/16480=0.7762
/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl
('Create a material having a band gap value of 8.0', ['<material><i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>L

100%|██████████| 422/422 [00:00<00:00, 1454.37it/s]


Success: 331, Fail: 91, Total: 744
Success rate/com: 331/422=0.7844
Success rate/all: 331/744=0.4449
/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl
('Create a material having a band gap value of 8.0', ['<material><i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<sg14></material>', '<material><i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>P<i>P<i>P<i>P<i>P<i>P<i>P<i>P<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg62></material>', '<material><i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>

100%|██████████| 612/612 [00:00<00:00, 2244.57it/s]


Success: 524, Fail: 88, Total: 744
Success rate/com: 524/612=0.8562
Success rate/all: 524/744=0.7043
/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bandgap_to_mat.response.pkl
('Design a material with zero band gap', ['<material> <i>K <i>K <i>Al <i>Au <i>Cl <i>Cl <i>Cl <i>Cl <i>Cl <i>Cl <sg225> </material>', '<material> <i>Fe <i>Fe <i>Fe <i>Fe <i>Fe <i>Fe <i>Fe <i>Fe <i>Fe <i>Fe <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>F <i>F <i>F <i>F <i>F <sg6> </material>', '<material> <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Sn <i>Sn <i>Sn <i>Sn <i>Sn <i>Sn <sg11> </material>', '<material> <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <sg2> </material>', '<material> <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg

100%|██████████| 7113/7113 [00:04<00:00, 1516.22it/s]


Success: 6189, Fail: 924, Total: 9300
Success rate/com: 6189/7113=0.8701
Success rate/all: 6189/9300=0.6655
/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl
('Determine the material composition that would result in a bulk modulus of 600 GPa.', ['<material><i>Ir<i>Ir<i>Ru<i>Ru<sg194></material>', '<material><i>Ir<i>Ir<i>Ir<i>Zn<sg187></material>', '<material><i>Ir<i>Ir<i>Ir<i>Ni<sg187></material>', '<material><i>Ir<i>Ir<i>Ir<i>Th<sg139></material>'], ['<material><i>Mn<i>Mn<i>Mn<i>Mn<i>Zn<i>Zn<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg227></material>', '<material><i>Os<i>Os<i>Au<i>Au<sg67></material>', '<material><i>Pt<i>Pt<i>Pt<i>Pt<i>Pt<i>Pt<i>Re<i>Re<sg123></material>', '<material><i>N<i>N<i>Be<i>Mo<sg187></material>'])
4
complete data: 830/880=0.9432


100%|██████████| 830/830 [00:00<00:00, 7719.23it/s]


Success: 679, Fail: 151, Total: 880
Success rate/com: 679/830=0.8181
Success rate/all: 679/880=0.7716
/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl
('Determine the material composition that would result in a bulk modulus of 600 GPa.', ['<material><i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<sg223></material>', '<material><i>Br<i>Cs<i>Cs<i>Cs<sg65></material>', '<material><i>I<i>Rb<i>Rb<sg65></material>', '<material><i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Mo<i>Mo<sg194></material>'], ['<material><i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<sg223></material>', '<material><i>Be<i>Be<i>Be<i>Be<i>B<i>B<i>B<i>B<sg194></material>', '<material><i>Np<i>Np<i>Np<i>Np<sg129></material>', '<material><i>I<i>Rb<i>Rb<sg191></material>'])
4
complete data: 847/880=0.9625


100%|██████████| 847/847 [00:00<00:00, 12818.98it/s]


Success: 508, Fail: 339, Total: 880
Success rate/com: 508/847=0.5998
Success rate/all: 508/880=0.5773
/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bulk.response.pkl
('Determine the material composition that would result in a optimal bulk modulus of 0 GPa.', ['<material> <i>Ba <i>Ba <i>Mg <i>In <sg71> </material>', '<material> <i>Ba <i>Ba <i>Mg <i>In <sg71> </material>', '<material> <i>Ba <i>Sr <i>Li <i>Li <sg71> </material>', '<material> <i>Sr <i>Li <i>Li <i>Tl <sg71> </material>', '<material> <i>Sr <i>Li <i>Li <i>Tl <sg71> </material>', '<material> <i>Sr <i>Li <i>Li <i>Tl <sg71> </material>', '<material> <i>Ba <i>Ba <i>Mg <i>In <sg71> </material>', '<material> <i>Ba <i>Ba <i>Mg <i>In <sg71> </material>', '<material> <i>Sr <i>Li <i>Li <i>Tl <sg71> </material>', '<material> <i>Ba <i>Ba <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>

100%|██████████| 10733/10733 [00:00<00:00, 12601.79it/s]

Success: 6045, Fail: 4688, Total: 11000
Success rate/com: 6045/10733=0.5632
Success rate/all: 6045/11000=0.5495





In [10]:
import pickle as pkl
import re

# band gap

files = [
    "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bandgap_to_mat.response.pkl",
]

# Prompts that request a band gap above this value are left out of the statistics.
max_bandgap = 3

for path in files:
    print(path)
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(path, "rb") as fh:
        data = pkl.load(fh)
    print(data[0])
    print(len(data[0][-1]))
    complete_structures = []
    total = 0
    for d in data:
        inst = d[0]
        # The first number in the prompt is taken as the requested band gap.
        # Prompts without any number ("zero", "no", "null", "without", ...) count as 0.
        number = re.search(r"(\d+\.?\d*)", inst)
        inst_bandgap = float(number.group(1)) if number else 0
        if inst_bandgap > max_bandgap:
            continue
        # Everything after the prompt is a list of responses; flatten them into one stream.
        for response in itertools.chain(*d[1:]):
            total += 1
            match = re.search(r"<material> ?(<i>[A-Z][a-z]? ?)+<sg\d+> ?</material>", response)
            if match:
                # Tags are all lower case, so this only picks up the element symbols.
                complete_structures.append(re.findall(r"[A-Z][a-z]?", match.group(0)))
    com_total = len(complete_structures)
    # Guard the ratios so a file with no usable responses does not abort the whole loop.
    complete_rate = com_total / total if total else 0.0
    print(f"complete data: {com_total}/{total}={complete_rate:.4f}")
    # NOTE(review): this call unpacks a (success, fail) pair, i.e. it needs a batch
    # version of check_validity. Cells further down define check_validity as a
    # single-structure predicate returning bool -- confirm which one is in scope.
    success, fail = check_validity(complete_structures)
    print(f"Success: {success}, Fail: {fail}, Total: {total}")
    rate_com = success / com_total if com_total else 0.0
    rate_all = success / total if total else 0.0
    print(f"Success rate/com: {success}/{com_total}={rate_com:.4f}")
    print(f"Success rate/all: {success}/{total}={rate_all:.4f}")

/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl
('Create a material having a band gap value of 8.0', ['<material><i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>L

100%|██████████| 144/144 [00:00<00:00, 714.52it/s]


Success: 118, Fail: 26, Total: 172
Success rate/com: 118/144=0.8194
Success rate/all: 118/172=0.6860
/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl
('Create a material having a band gap value of 8.0', ['<material><i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<sg14></material>', '<material><i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>P<i>P<i>P<i>P<i>P<i>P<i>P<i>P<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg62></material>', '<material><i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Li<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>Mn<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>F<i>

100%|██████████| 159/159 [00:00<00:00, 1543.65it/s]

Success: 143, Fail: 16, Total: 172
Success rate/com: 143/159=0.8994
Success rate/all: 143/172=0.8314
/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bandgap_to_mat.response.pkl





('Design a material with zero band gap', ['<material> <i>K <i>K <i>Al <i>Au <i>Cl <i>Cl <i>Cl <i>Cl <i>Cl <i>Cl <sg225> </material>', '<material> <i>Fe <i>Fe <i>Fe <i>Fe <i>Fe <i>Fe <i>Fe <i>Fe <i>Fe <i>Fe <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>F <i>F <i>F <i>F <i>F <sg6> </material>', '<material> <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Li <i>Sn <i>Sn <i>Sn <i>Sn <i>Sn <i>Sn <sg11> </material>', '<material> <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>Ti <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <i>O <sg2> </material>', '<material> <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Cr <i>Sb <sg6> </material>', '<material> <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Pt <i>Pt <i>Pt <i>Pt <i>Pt <i>Pt <sg148> </material>', '<material> <i>In <i>Sb <sg216> </material>', '<ma

100%|██████████| 3401/3401 [00:02<00:00, 1203.79it/s]

Success: 2799, Fail: 602, Total: 4300
Success rate/com: 2799/3401=0.8230
Success rate/all: 2799/4300=0.6509





In [18]:
import pickle as pkl
import re

# bulk

files = [
    "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl",
    "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bulk.response.pkl",
]

# Prompts that request a bulk modulus above this value (in GPa) are left out of the statistics.
max_bulk_gpa = 400

for path in files:
    print(path)
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(path, "rb") as fh:
        data = pkl.load(fh)
    print(data[0])
    print(len(data[0][-1]))
    complete_structures = []
    total = 0
    for d in data:
        inst = d[0]
        bulk_match = re.search(r"(\d+\.?\d*) GPa", inst)
        if bulk_match is None:
            # Fail with the offending prompt instead of an AttributeError on None.
            raise ValueError(f"no bulk modulus value found in instruction: {inst!r}")
        bulk = float(bulk_match.group(1))
        if bulk > max_bulk_gpa:
            continue
        # Everything after the prompt is a list of responses; flatten them into one stream.
        for response in itertools.chain(*d[1:]):
            total += 1
            match = re.search(r"<material> ?(<i>[A-Z][a-z]? ?)+<sg\d+> ?</material>", response)
            if match:
                # Tags are all lower case, so this only picks up the element symbols.
                complete_structures.append(re.findall(r"[A-Z][a-z]?", match.group(0)))
    com_total = len(complete_structures)
    # Guard the ratios so a file with no usable responses does not abort the whole loop.
    complete_rate = com_total / total if total else 0.0
    print(f"complete data: {com_total}/{total}={complete_rate:.4f}")
    # NOTE(review): this call unpacks a (success, fail) pair, i.e. it needs a batch
    # version of check_validity. Cells further down define check_validity as a
    # single-structure predicate returning bool -- confirm which one is in scope.
    success, fail = check_validity(complete_structures)
    print(f"Success: {success}, Fail: {fail}, Total: {total}")
    rate_com = success / com_total if com_total else 0.0
    rate_all = success / total if total else 0.0
    print(f"Success rate/com: {success}/{com_total}={rate_com:.4f}")
    print(f"Success rate/all: {success}/{total}={rate_all:.4f}")
    

/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl


('Determine the material composition that would result in a bulk modulus of 600 GPa.', ['<material><i>Ir<i>Ir<i>Ru<i>Ru<sg194></material>', '<material><i>Ir<i>Ir<i>Ir<i>Zn<sg187></material>', '<material><i>Ir<i>Ir<i>Ir<i>Ni<sg187></material>', '<material><i>Ir<i>Ir<i>Ir<i>Th<sg139></material>'], ['<material><i>Mn<i>Mn<i>Mn<i>Mn<i>Zn<i>Zn<i>O<i>O<i>O<i>O<i>O<i>O<i>O<i>O<sg227></material>', '<material><i>Os<i>Os<i>Au<i>Au<sg67></material>', '<material><i>Pt<i>Pt<i>Pt<i>Pt<i>Pt<i>Pt<i>Re<i>Re<sg123></material>', '<material><i>N<i>N<i>Be<i>Mo<sg187></material>'])
4
complete data: 370/400=0.9250


100%|██████████| 370/370 [00:00<00:00, 6284.36it/s]

Success: 321, Fail: 49, Total: 400
Success rate/com: 321/370=0.8676
Success rate/all: 321/400=0.8025
/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl





('Determine the material composition that would result in a bulk modulus of 600 GPa.', ['<material><i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<sg223></material>', '<material><i>Br<i>Cs<i>Cs<i>Cs<sg65></material>', '<material><i>I<i>Rb<i>Rb<sg65></material>', '<material><i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Mo<i>Mo<sg194></material>'], ['<material><i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<i>Cs<sg223></material>', '<material><i>Be<i>Be<i>Be<i>Be<i>B<i>B<i>B<i>B<sg194></material>', '<material><i>Np<i>Np<i>Np<i>Np<sg129></material>', '<material><i>I<i>Rb<i>Rb<sg191></material>'])
4
complete data: 391/400=0.9775


100%|██████████| 391/391 [00:00<00:00, 9655.70it/s]

Success: 341, Fail: 50, Total: 400
Success rate/com: 341/391=0.8721
Success rate/all: 341/400=0.8525
/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bulk.response.pkl





('Determine the material composition that would result in a optimal bulk modulus of 0 GPa.', ['<material> <i>Ba <i>Ba <i>Mg <i>In <sg71> </material>', '<material> <i>Ba <i>Ba <i>Mg <i>In <sg71> </material>', '<material> <i>Ba <i>Sr <i>Li <i>Li <sg71> </material>', '<material> <i>Sr <i>Li <i>Li <i>Tl <sg71> </material>', '<material> <i>Sr <i>Li <i>Li <i>Tl <sg71> </material>', '<material> <i>Sr <i>Li <i>Li <i>Tl <sg71> </material>', '<material> <i>Ba <i>Ba <i>Mg <i>In <sg71> </material>', '<material> <i>Ba <i>Ba <i>Mg <i>In <sg71> </material>', '<material> <i>Sr <i>Li <i>Li <i>Tl <sg71> </material>', '<material> <i>Ba <i>Ba <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>Mg <i>

100%|██████████| 4911/4911 [00:00<00:00, 9151.00it/s] 

Success: 4653, Fail: 258, Total: 5000
Success rate/com: 4653/4911=0.9475
Success rate/all: 4653/5000=0.9306





In [8]:
import pandas as pd
from collections import Counter

train_file = "/msralaphilly2/ml-la/yinxia/wu2/shared/SFM/SFM.overall.data/SFMMolInstruct.20240617/train.instruct_text2mat.tsv.shuf"

# Build one canonical "formula + space group" key per band-gap training example,
# e.g. "Cd2Se1Te1<sg115>": element counts in alphabetical order followed by the
# space-group token.
with open(train_file, "r") as f:
    data = pd.read_csv(f, sep="\t", header=None)

train_data = []
# Column 0 is the instruction text, column 1 the target material sequence.
for inst, seq in zip(data[0], data[1]):
    if "band gap" not in inst:
        continue
    tokens = seq.replace("<material>", "").replace("</material>", "").strip().split()
    # The last token is the space group; everything before it is one token per atom.
    sg = tokens[-1]
    counts = Counter(tokens[:-1])
    train_data.append("".join(f"{symbol}{counts[symbol]}" for symbol in sorted(counts)) + sg)

print(train_data[:10])
# Collapse to a set for membership tests.
train_data = set(train_data)
print(len(train_data))

['Pr6Zn44<sg15>', 'B4Ti1V1<sg191>', 'Er4K4O36P8<sg14>', 'Cr2Li4Mn3O10<sg1>', 'Cd2Se1Te1<sg115>', 'B3C16H60N12O12<sg215>', 'Ag14As8Cl2Cu2O28<sg9>', 'Co4K4O8<sg1>', 'Fe1Rh2Se4<sg12>', 'Bi14O24W1<sg1>']
122831


In [2]:
import pickle as pkl
import re
import os
from collections import Counter
import itertools

# Named groups of response pickles. Only one group is processed per run. These
# used to be consecutive reassignments of `files`, where every list except the
# last one was dead code; naming them makes the active choice explicit.
FILE_SETS = {
    "base_all_checkpoints": [
        "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_18382/all/test.comp_to_material.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_18382/all/test.bandgap_to_mat.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_18382/all/test.comp_bgap_to_material.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_18382/all/test.bulk.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.comp_to_material.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.comp_bgap_to_material.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_13130/all/test.comp_to_material.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_13130/all/test.bandgap_to_mat.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_13130/all/test.comp_bgap_to_material.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_13130/all/test.bulk.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.comp_to_material.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.comp_bgap_to_material.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl",
    ],
    # This group was commented out in the original cell; kept for reference.
    "mixtral_8x7b": [
        "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bandgap_to_mat.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bulk.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/mixtral_2585/test.bandgap_to_mat.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/mixtral_2585/test.bulk.response.pkl",
    ],
    "base_steps_20000": [
        "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.comp_to_material.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.comp_bgap_to_material.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.comp_to_material.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bandgap_to_mat.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.comp_bgap_to_material.tsv.response.pkl",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_steps_20000/all/test.bulk.tsv.response.pkl",
    ],
    "second_pass_pkl1": [
        "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bandgap_to_mat.response.pkl.1",
        "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bulk.response.pkl.1",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_rp100w_steps_20000_new/all/test.bandgap_to_mat.tsv.response.pkl.1",
        "/sfmdataeastus2/nlm/zekun/instruct/base8b/instruct_task_20240807/8b_dialogue_1v1_bs2048_rp100w_steps_20000_new/all/test.bulk.tsv.response.pkl.1",
    ],
    "base1b_add_bulk": [
        "/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000_add/all/test.bulk.tsv.response.pkl",
    ],
}

# Active selection: the same list the last assignment in the original cell produced.
# Copied so that editing `files` never alters the named group.
files = list(FILE_SETS["base1b_add_bulk"])

def check_validity(structure):
    """Tell whether a list of element symbols passes the SMACT validity check.

    Args:
        structure: Element symbols, one entry per atom (repeats allowed),
            e.g. ["Na", "Cl", "Na", "Cl"].

    Returns:
        True when `smact_validity` accepts the composition after the per-element
        counts have been divided by their greatest common divisor. False when it
        rejects the composition or when anything raises along the way.
    """
    tally = Counter(structure)
    symbols = tuple(tally.keys())
    amounts = np.array(list(tally.values()))
    try:
        # Reduce the counts to the smallest integer ratio.
        reduced = (amounts / np.gcd.reduce(amounts)).astype(int)
        return bool(smact_validity(symbols, tuple(reduced), use_element_symbol=True))
    except Exception:
        # Deliberately broad: any failure, including an exception raised inside
        # smact_validity, counts as "not valid".
        return False

# When True, a structure is kept only if its formula key is absent from
# `train_data` (which must then already be defined) in addition to passing
# check_validity. When False only validity is required, and the summary label
# says so instead of claiming novelty.
require_novel = False

for fname in files:
    print(fname)
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(fname, "rb") as fr:
        data = pkl.load(fr)
    print(data[0])
    structures = []
    instructions = []
    num_responses = 0
    for d in data:
        instruction = d[0]
        # Everything after the prompt is a list of responses; flatten them.
        responses = list(itertools.chain(*d[1:]))
        num_responses += len(responses)
        for response in responses:
            groups = re.search(r"<material> ?(<i>[A-Z][a-z]? ?)+<sg\d+> ?</material>", response)
            if not groups:
                continue
            line = groups.group(0)
            # Tags are all lower case, so this only picks up the element symbols.
            elements = re.findall(r"[A-Z][a-z]?", line)
            sg = re.search(r"<sg\d+>", line).group(0)
            if require_novel:
                # Key format: alphabetically ordered element counts followed by the space group.
                counts = Counter(elements)
                formula = "".join(f"{key}{counts[key]}" for key in sorted(counts)) + sg
                if formula in train_data:
                    continue
            if check_validity(elements):
                instructions.append(instruction)
                structures.append(elements + [sg])
    out_fname = fname.replace(
        "/sfmdataeastus2/nlm/zekun/instruct",
        "/msralaphilly2/ml-la/renqian/SFM/threedimargen/data/materials_data/instruct"
    ).replace(".pkl", ".valid.txt")
    out_dir = os.path.dirname(out_fname)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    label = "Novel and Valid" if require_novel else "Valid"
    # Guard the ratio so a pickle without responses does not abort the loop.
    kept_rate = len(structures) / num_responses if num_responses else 0.0
    print(f"{label}: {len(structures)}/{num_responses}={kept_rate:.4f}")
    print(f"Output to {out_fname}")

    # Two parallel files: line k of the .inst file is the prompt that produced
    # line k of the structure file.
    with open(out_fname.replace(".valid.txt", ".valid.inst.txt"), "w") as fw:
        for i in instructions:
            fw.write(i+"\n")
    with open(out_fname, "w") as fw:
        for s in structures:
            fw.write(" ".join(s)+"\n")

/sfmdataeastus2/nlm/zekun/instruct/base1b/instruct_task_20240807/1b_dialogue_1v1_bs2048_steps_20000_add/all/test.bulk.tsv.response.pkl
('Determine the material composition that would result in a bulk modulus of 600 GPa.', [], ['<material><i>Os<i>Os<i>Pd<sg164></material>', '<material><i>Ru<i>Ru<i>Ru<i>Ni<sg25></material>', '<material><i>Au<sg229></material>', '<material><i>Ru<i>Ru<i>Mg<i>Os<sg225></material>', '<material><i>Ir<i>Ni<i>Ga<i>Ga<sg225></material>', '<material><i>Ir<i>Ru<i>Ru<i>Sr<sg166></material>', '<material><i>Ir<i>Ir<i>Ir<i>Pr<sg139></material>', '<material><i>V<i>V<i>V<i>V<i>V<i>V<i>As<i>As<i>As<i>As<sg38></material>', '<material><i>Mo<i>Mo<i>Mo<i>Sn<sg71></material>', '<material><i>Si<i>Si<sg227></material>', '<material><i>Os<i>Os<i>Co<i>Co<sg194></material>', '<material><i>Ir<i>Ir<i>Ni<i>Ni<sg156></material>', '<material><i>B<i>Ta<i>Ta<i>Au<sg166></material>', '<material><i>Be<i>Re<i>B<sg216></material>', '<material><i>Fe<i>Fe<i>Si<i>Ge<sg225></material>', '<materia

In [21]:
# Plain-text input for this cell (read line by line further down), not a pickle.
files = ["/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/uncondition/1_material_new.txt"]

def check_validity(structure):
    """Tell whether a list of element symbols passes the SMACT validity check.

    Args:
        structure: Element symbols, one entry per atom (repeats allowed),
            e.g. ["Na", "Cl", "Na", "Cl"].

    Returns:
        True when `smact_validity` accepts the composition after the per-element
        counts have been divided by their greatest common divisor. False when it
        rejects the composition or when anything raises along the way.
    """
    tally = Counter(structure)
    symbols = tuple(tally.keys())
    amounts = np.array(list(tally.values()))
    try:
        # Reduce the counts to the smallest integer ratio.
        reduced = (amounts / np.gcd.reduce(amounts)).astype(int)
        return bool(smact_validity(symbols, tuple(reduced), use_element_symbol=True))
    except Exception:
        # Deliberately broad: any failure, including an exception raised inside
        # smact_validity, counts as "not valid".
        return False

for fname in files:
    print(fname)
    with open(fname, "r") as fr:
        data = fr.readlines()
    if not data:
        # Nothing to preview or divide by; skip instead of failing on data[0].
        print("Valid: 0/0 (empty input file)")
        continue
    print(data[0])
    structures = []
    for d in data:
        # No "<material>" prefix is required here, unlike in the pickle-based cells.
        groups = re.search(r"(<i>[A-Z][a-z]? ?)+<sg\d+> ?</material>", d)
        if not groups:
            continue
        line = groups.group(0)
        # Tags are all lower case, so this only picks up the element symbols.
        elements = re.findall(r"[A-Z][a-z]?", line)
        sg = re.search(r"<sg\d+>", line).group(0)
        if check_validity(elements):
            structures.append(elements + [sg])
    out_fname = fname.replace(
        "/sfmdataeastus2/nlm/zekun/instruct",
        "/msralaphilly2/ml-la/renqian/SFM/threedimargen/data/materials_data/instruct"
    ).replace(".txt", ".valid.txt")
    out_dir = os.path.dirname(out_fname)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    print(f"Valid: {len(structures)}/{len(data)}={len(structures)/len(data):.4f}")
    print(f"Output to {out_fname}")

    # One structure per line: space-separated element symbols, then the space-group token.
    with open(out_fname, "w") as fw:
        for s in structures:
            fw.write(" ".join(s)+"\n")

/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/uncondition/1_material_new.txt
<i>Al <i>Fe <i>Fe <i>Ho <sg225> </material> </s>

Valid: 5798/32960=0.1759
Output to /msralaphilly2/ml-la/renqian/SFM/threedimargen/data/materials_data/instruct/8x7b/vllm/uncondition/1_material_new.valid.txt


In [7]:
# Share of responses whose "formula + space group" key does not occur in
# `train_data`. The denominator is every response, including malformed ones.
for fname in files:
    print(fname)
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(fname, "rb") as fr:
        data = pkl.load(fr)
    print(data[0])
    novel = 0
    num_responses = 0
    for d in data:
        # Everything after the prompt is a list of responses; flatten them.
        responses = list(itertools.chain(*d[1:]))
        num_responses += len(responses)
        for response in responses:
            groups = re.search(r"<material> ?(<i>[A-Z][a-z]? ?)+<sg\d+> ?</material>", response)
            if not groups:
                continue
            line = groups.group(0)
            # Tags are all lower case, so this only picks up the element symbols.
            counts = Counter(re.findall(r"[A-Z][a-z]?", line))
            sg = re.search(r"<sg\d+>", line).group(0)
            # Key format: alphabetically ordered element counts followed by the space group.
            formula = "".join(f"{key}{counts[key]}" for key in sorted(counts)) + sg
            if formula not in train_data:
                novel += 1
    # Guard the ratio so a pickle without responses does not abort the loop.
    novel_rate = novel / num_responses if num_responses else 0.0
    print(f"Novel: {novel}/{num_responses}={novel_rate:.4f}")

/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bandgap_to_mat.response.pkl.1


('Design a material with zero band gap', ['<material> <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>Sr <i>S

In [5]:
import pickle as pkl
# Load the responses from the ".pkl.1" bulk-modulus pickle for inspection.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(
    "/sfmdataeastus2/nlm/zekun/instruct/8x7b/vllm/global_step7755_restore_new/test.bulk.response.pkl.1",
    "rb",
) as f:
    data = pkl.load(f)


['<material> <i>Sr <i>Li <i>Li <i>Tl <sg71> </material>', '<material> <i>Sr <i>Sr <i>Li <i>Pb <sg71> </material>', '<material> <i>Ba <i>Ba <i>Mg <i>In <sg71> </material>', '<material> <i>Sr <i>Sr <i>Li <i>Zn <sg71> </material>']
