In [15]:
#%%
import pandas as pd
import sys
import json
from pymatgen.core import Composition
from pymatgen.core.periodic_table import get_el_sp
import time
from datetime import datetime
from pymongo import MongoClient
from zoneinfo import ZoneInfo
import DataTools.compositionVector as cd2cv
# import fire
from pprint import pprint
from typing import List


In [16]:
#%% Process name unifier

def processNameUnifier(s: str):
    exceptions = []

    if s in exceptions:
        return s
    elif s.isupper():
        return s
    else:
        return s.lower()

In [17]:
#%% Processes processing string into a unified-form process list

def processStr2list(s):
    ls = []
    s = s.replace(' ','')
    tempLs = list(s.split('+'))
    for process in tempLs:
        if process[0].isdigit():
            for i in range(int(process[0])):
                ls.append(processNameUnifier(process[1:]))
        else:
            ls.append(processNameUnifier(process))
    if ls.__len__()>0:
        return [ls, ls.__len__()]
    else:
        return []

In [18]:
#%% Unifies phase names in the database
# If composition -> keep as is
# if all uppercase (e.g. BCC, FCC) -> keep as is
# otherwise -> make all lowercase

def phaseNameUnifier(s):
    exceptionToUpper = ['b0', 'b1', 'b2', 'a0', 'a1', 'a2']
    replaceDict = {'bulkmetallic\nglass' : 'amorphous', 'bcc' : 'BCC', 'fcc' : 'FCC', 'LAVES' : 'laves'}

    try:
        isComp = Composition(s).valid
    except Exception as e:
        isComp = False

    if s in exceptionToUpper:
        return s.upper()
    elif s in replaceDict:
        return replaceDict[s]
    elif isComp:
        return s
    elif s.isupper():
        return s
    else:
        return s.lower()

In [19]:
#%% Transforms the structure string into a list of
# individual phases, interpreting (1) multiple phases
# of the same type, (2) composition-defined phases, and
# (3) named phases. Processes them in a unified way.

def structStr2list(s: str):
    ls = []

    s = s.replace(' ','')
    tempLs = list(s.split('+'))
    for phase in tempLs:
        if phase[0].isdigit():
            for i in range(int(phase[0])):
                ls.append(phaseNameUnifier(phase[1:]))
        else:
            ls.append(phaseNameUnifier(phase))
    ls.sort()
    if ls.__len__()>0:
        return [ls, ls.__len__()]
    else:
        return []

In [20]:
#%% Modify composition string from the template into a unified
# representation of (1) IUPAC standardized formula, (2) pymatgen dictionary
# composition object, (3) anonymized formula, (4) reduced formula, (5) chemical system,
# and (6) number of components

def percentileFormula(cd: dict):
    order = sorted(cd.keys(), key=lambda s: get_el_sp(s).iupac_ordering)
    return ' '.join([f'{el}{round(100*cd[el], 1):g}' for el in order])

def relationalFormula(cd: dict):
    order = sorted(cd.keys(), key=lambda s: get_el_sp(s).iupac_ordering)
    lowest = min([v for v in cd.values() if v>0.005])
    return ' '.join([f'{el}{round(cd[el]/lowest, 2):g}' for el in order])

def compStr2compList(s: str):
    try:
        compObj = Composition(s).reduced_composition
        if not compObj.valid:
            print("Composition invalid")
        return [compObj.iupac_formula,
                dict(compObj.fractional_composition.as_dict()),
                percentileFormula(dict(compObj.fractional_composition.as_dict())),
                relationalFormula(dict(compObj.fractional_composition.as_dict())),
                compObj.anonymized_formula,
                compObj.reduced_formula,
                compObj.chemical_system,
                compObj.chemical_system.split('-'),
                compObj.__len__()]
    except Exception as e:
        print(e)
        
        raise ValueError("Warning! Can't parse composition!: "+s)

In [21]:
#%% Convert a pair of metadata and data into ULTERA Database datapoint
def datapoint2entry(dataP, printOuts=True):
    entry = {'marker': {}, 'material' : {}, 'property' : {}, 'reference' : {}}

    # composition
    try:
        compList = compStr2compList(dataP['Composition'])
    except Exception as e:
        print(str(e))
        compList = 
        raise ValueError("Could not parse the composition! Required for upload. Aborting upload!")
    
    # material
    entry['material'].update({
            'rawFormula': dataP['Composition'],
            'formula': compList[0],
            'compositionDictionary' : compList[1],
            'percentileFormula': compList[2],
            'relationalFormula': compList[3],
            'compositionVector': cd2cv.compDict2Vec(compList[1]),
            'anonymizedFormula' : compList[4],
            'reducedFormula' : compList[5],
            'system' : compList[6],
            'elements' : compList[7],
            'nComponents' : compList[8]})

    # structure
    if 'Structure' in dataP:
        if dataP['Structure'] is not None:
            structList = structStr2list(dataP['Structure'])
            entry['material'].update({
                'structure': structList[0],
                'nPhases': structList[1]})
        else:
            if printOuts:
                print('No structure data!')

    # processing
    if 'Processing' in dataP:
        if dataP['Processing'] is not None:
            processingList = processStr2list(dataP['Processing'])
            entry['material'].update({
                    'processes' : processingList[0],
                    'nProcessSteps' : processingList[1]})
        else:
            if printOuts:
                print('No process data!')

    # comment
    if 'Material Comment' in dataP:
        if dataP['Material Comment'] is not None:
            entry['material'].update({
                    'comment' : dataP['Material Comment']})

    if 'Temperature [K]' in dataP:
        # If there is temperature reported, regardless of property report, note the temperature of material
        if dataP['Temperature [K]'] is not None:
            entry['material'].update({
                'observationTemperature': float(dataP['Temperature [K]'])})

    if 'Name' in dataP:
        # Requires: Name and Value
        if dataP['Name'] is not None:
            entry['property'].update({
                'name' : dataP['Name'],
                'value': float(dataP['Value [SI]'])})

            # If property has a name, go through all of property parameters and value
            if 'Source' in dataP:
                if dataP['Source'] is not None:
                    entry['property'].update({
                        'source': dataP['Source']})
            if 'Property Parameters' in dataP:
                if dataP['Property Parameters'] is not None:
                    entry['property'].update({
                        'parameters': dataP['Property Parameters']})
            if 'Temperature [K]' in dataP:
                if dataP['Temperature [K]'] is not None:
                    entry['property'].update({
                        'temperature': float(dataP['Temperature [K]'])})
            if 'Unit [SI]' in dataP:
                if dataP['Unit [SI]'] is not None:
                    entry['property'].update({
                        'unitName': dataP['Unit [SI]']})
        else:
            del entry['property']
            if printOuts:
                print('No property data or error occurred!')

    if 'DOI' in dataP:
        # Requires DOI
        if 'DOI' in dataP:
            if dataP['DOI'] is not None:
                entry['reference'].update({
                        'doi' : dataP['DOI']})
                if 'Pointer' in dataP:
                    if dataP['Pointer'] is not None:
                        entry['reference'].update({
                            'pointer': dataP['Pointer']})
            else:
                del entry['reference']
                if printOuts:
                    print('No reference data!')

    return entry
    
# if __name__ == "__main__":
#     fire.Fire(upload)

SyntaxError: invalid syntax (1412350164.py, line 11)

In [None]:
#%% function to generate 2d array of [[raw formula],[percentile formula],[property name],[property value]]
# for the materials in dataset

def arrayGenerator(parsed):
    dataset = []
    prop_values = []
    prop_names = []
    raw_formulas = []
    percentile_formulas = []
    validations = []
    markers = []

    for datapoint in parsed:
        if (datapoint2entry(datapoint)):
            dataset.append[datapoint2entry(datapoint)]
        else:
            print(str(e))
            failed_dataset.append(datapoint)
            markers.append('🔴')
            pass
            # raise ValueError("Could not parse the composition! Required for upload. Aborting upload!")

    # try:
    #     dataset = [datapoint2entry(datapoint) for datapoint in parsed]
    # except Exception as e:
    #     print(str(e))
    #     markers.append('🔴')
    #     # raise ValueError("Could not parse the composition! Required for upload. Aborting upload!")

    # for datapoint in parsed:
    #     try:
    #         dataset = [datapoint2entry(datapoint)]
    #     except Exception as e:
    #         print(str(e))
    #         failed_data.append(datapoint)
    #         markers.append('🔴')
    #         # pass
    #         # raise ValueError("Could not parse the composition! Required for upload. Aborting upload!")

    def adjustLen(strArray: List[str]) -> List[str]:
        max_str_length = max([len(f) for f in strArray])
        return [f'{formula:<{max_str_length}}' for formula in strArray]

    for data in dataset:
        # bool_markers.append(int(data['marker']['boolMarker']))
        if 'material' in data:
            raw_formulas.append(data['material']['rawFormula'])
            percentile_formulas.append(data['material']['percentileFormula'])
        else:
            raw_formulas.append('')
            percentile_formulas.append('')
        if 'property' in data:
            prop_names.append(data['property']['name'])
            prop_values.append(data['property']['value'])
        else:
            prop_names.append('')
            prop_values.append(0)
    
    raw_formulas = adjustLen(raw_formulas)
    percentile_formulas = adjustLen(percentile_formulas)

    property_condition = 'UTS'
    value_condition = 6e8

    for prop, value in zip(prop_names, prop_values):
        # if bool_marker == False:
        #     validations.append('')
        #     markers.append('🔴')
        if value >= value_condition and prop == property_condition:
            validations.append(f"High {property_condition} value is {value}")
            markers.append('🟠')
        else:
            validations.append('')
            markers.append('🟢')

    return [
        f'- {marker}  {raw_formula} | {percentile_formula} | {prop_name} | {validation} \n'
        for marker, raw_formula, percentile_formula, prop_name, validation 
        in zip(markers, raw_formulas, percentile_formulas, prop_names, validations)
    ]



In [None]:
datasheet = 'template_v4_DatasetExample.xlsx'
isDatabase = False

# get timestamp
dateString = datetime.now().strftime('%Y-%d-%b-%H-%M')

### Logging progress into Markdown file
MdLogger = open('PyQAlloyReport'+dateString+'.md', "w")
MdLogger.write('\n# PyQAlloyReport '+dateString+'\n\n')
MdLogger.write('**Legend:** \n\n🟢 Successful Upload / 🟠 Abnormal Upload / 🔴 Failed Upload\n\n')
MdLogger.write('## Composition --> Result \n')

# Import data
print('\nImporting data.')
df2 = pd.read_excel(datasheet, usecols="A:N", nrows=5000, skiprows=8)
result = df2.to_json(orient="records")
parsed = json.loads(result, strict=False)
print('Imported '+str(parsed.__len__())+' datapoints.\n')

## Convert data into database datapoints and upload
results = arrayGenerator(parsed)
for result in results:
    MdLogger.write(result)


Importing data.
Imported 22 datapoints.

No reference data!
No property data or error occurred!
No property data or error occurred!


In [None]:
# raw_formulas = ['1', '1000', '100']

# max_str_length = max([len(f) for f in raw_formulas])

# prforms = [f'{formula:<{max_str_length}}' for formula in raw_formulas]
# print(prforms)

In [None]:
# test for finding max str length from list

# test_list = ['1', '100', '1000', '10000']

# str_lengths = [len(s) for s in test_list]
# longest_str_length = (max(str_lengths))

# print(longest_str_length)

In [None]:
# function to check if property values of materials in dataset meet a certain condition
# ex: checking if the material property given is UTS, and if yes, then checking if UTS value > 6e8

# property = 'UTS'
# value_condition = 6e8

# def propertyChecker(uploadEntry):
#     mydata = []
#     prop_values = []
#     mat_formulas = []
#     mydata.append(uploadEntry)
#     for data in mydata:
#         for category, subcategory in data.items():
#             if category == 'property':
#                 if subcategory['name'] == property and subcategory['value'] >= value_condition:
#                     prop_values.append(subcategory['value'])
#                     mat = data['material']
#                     formula = mat['formula']
#                     mat_formulas.append(formula)
#                     return True
#                 else:
#                     return False
                

In [None]:
# mydata = []
# prop_values = []
# mat_formulas = []
# for data in mydata:
#     for category, subcategory in data.items():
#         if category == 'property':
#             if subcategory['name'] == property and subcategory['value'] >= value_condition:
#                 prop_values.append(subcategory['value'])
#                 mat = data['material']
#                 formula = mat['formula']
#                 mat_formulas.append(formula)

In [None]:
# datasheet = 'template_v4_DatasetExample.xlsx'
# isDatabase = False

# # Import metadata
# print('Reading the metadata.')
# metaDF = pd.read_excel(datasheet, usecols="A:F", nrows=4)
# meta = metaDF.to_json(orient="split")
# metaParsed = json.loads(meta, strict=False)['data']

# # get timestamp
# dateString = datetime.now().strftime('%Y-%d-%b-%H-%M')

# # Format metadata into a dictionary
# metaData = {
#     'source': 'LIT',
#     'name': metaParsed[0][1],
#     'email': metaParsed[1][1],
#     'directFetch': metaParsed[2][1],
#     'handFetch': metaParsed[3][1],
#     'comment': metaParsed[0][5],
#     'timeStamp': datetime.now(ZoneInfo('America/New_York')),
#     'dataSheetName': datasheet
# }

# if isDatabase:
#     metaData.update({'parentDatabase': target})
#     metaData.update({'handFetch': False})

# print('Data credited to: '+metaParsed[0][1])
# print('Contact email: '+metaParsed[1][1])
# pprint(metaData)

# ### Logging progress into Markdown file
# MdLogger = open('PyQAlloyReport'+dateString+'.md', "w")
# MdLogger.write('\n#PyQAlloyReport '+dateString+'\n\n')
# MdLogger.write('**Legend:** \n\n🟢 Successful Upload / 🟠 Abnormal Upload / 🔴 Failed Upload\n\n')
# MdLogger.write('## Composition --> Result \n')

# # Import data
# print('\nImporting data.')
# df2 = pd.read_excel(datasheet, usecols="A:N", nrows=5000, skiprows=8)
# result = df2.to_json(orient="records")
# parsed = json.loads(result, strict=False)
# print('Imported '+str(parsed.__len__())+' datapoints.\n')

# # Convert metadata and data into database datapoints and upload
# for datapoint in parsed:
#     comp = datapoint['Composition'].replace(' ','')
#     print('Processing: '+comp)
#     try:
#         uploadEntry = datapoint2entry(metaData, datapoint)
#         MdLogger.write('- 🟢 '+comp+' --> Success!')
#         propertyCheck = propertyChecker(uploadEntry)
#         if propertyCheck == True:
#             MdLogger.write(' Hey! High '+property+ '!!!')
#         MdLogger.write('\n')

#         matArray = arrayGenerator(uploadEntry)

#     except ValueError as e:
#         exceptionMessage = str(e)
#         print(exceptionMessage)
#         MdLogger.write('- 🔴 '+comp+' --> Fail! <------- '+exceptionMessage+'\n')
#         print('Upload failed!\n')
#         pass
# MdLogger.close()

# # matArray = arrayGenerator(uploadEntry)
# pprint(matArray)

In [None]:
# mydata = []
# mydata.append(uploadEntry)
# pprint(mydata)

In [None]:
# mydata = []
# prop_values = []
# prop_names = []
# raw_formulas = []
# prcnt_formulas = []
# uploadEntry = datapoint2entry(metaData, datapoint)

# for data in uploadEntry:
# mydata.append(uploadEntry)

# pprint(mydata)

# for data in mydata:
#     for category, subcategory in data.items():
#         for subcategory, supsubcategory in category.items():
#         # if category == 'property':
#             # if subcategory['name'] == property and subcategory['value'] >= value_condition:
#         # prop_values.append(subcategory['value'])
#             mat = category['material']
#             raw_formula = mat['rawFormula']
#             prcnt_formula = mat['percentileFormula']
#             raw_formulas.append(raw_formula)
#             prcnt_formulas.append(prcnt_formula)
#             prop = category['property']
#             prop_name = prop['name']
#             prop_val = prop['value']
#             prop_values.append(prop_val)
#             prop_names.append(prop_name)

# pprint(raw_formulas)
# pprint(prcnt_formulas)
# pprint(prop_names)
# pprint(prop_values)

In [None]:
# mydata = []
# prop_values = []
# prop_names = []
# raw_formulas = []
# percent_formulas = []
# mydata.append(uploadEntry)
# for data in mydata:
#     for section, parameter in data.items():
#         if section == 'material':
#             raw_formulas.append(parameter['rawFormula'])
#             percent_formulas.append(parameter['percentileFormula'])
#         if section == 'property':
#             prop_names.append(parameter['name'])
#             prop_values.append(parameter['value'])


In [None]:
# pprint(mydata[0])

In [None]:
# # iterate thru uploadEntry (appended to mydata list), access entries for property, 
# # make into list, tell if UTS greater than 600 Mpa = 6e8 Pa

# props = []
# mats = []
# i = 0
# for data in mydata:
#     for category, subcategory in data.items():
#         i+=1
#         if category == 'property':
#             if subcategory['name'] == 'UTS' and subcategory['value'] >= 6e8:
#                 props.append(subcategory['value'])
#                 mat = data.get('material')
#                 formula = mat.get('formula')
#                 mats.append(formula)           

# valid_mat_props = {mats[i]: props[i] for i in range(len(mats))}

# print(props)
# print(mats)
# print(valid_mat_props)


In [None]:
### duplicate code block 9/21 ###

# datasheet = 'template_v4_DatasetExample.xlsx'
# isDatabase = False

# #Import metadata
# print('Reading the metadata.')
# metaDF = pd.read_excel(datasheet, usecols="A:F", nrows=4)
# meta = metaDF.to_json(orient="split")
# metaParsed = json.loads(meta, strict=False)['data']

# # get timestamp
# dateString = datetime.now().strftime('%Y-%d-%b-%H-%M')

# # Format metadata into a dictionary
# metaData = {
#     'source': 'LIT',
#     'name': metaParsed[0][1],
#     'email': metaParsed[1][1],
#     'directFetch': metaParsed[2][1],
#     'handFetch': metaParsed[3][1],
#     'comment': metaParsed[0][5],
#     'timeStamp': datetime.now(ZoneInfo('America/New_York')),
#     'dataSheetName': datasheet
# }

# if isDatabase:
#     metaData.update({'parentDatabase': target})
#     metaData.update({'handFetch': False})

# print('Data credited to: '+metaParsed[0][1])
# print('Contact email: '+metaParsed[1][1])
# pprint(metaData)

# # Logging progress into a CSV table
# # dataFileName = datasheet.replace('.xls', '').replace('.xlsx', '')
# # logger = open(dataFileName+'_REPORT_'+dateString+'.csv', "w")
# # logger.write('Composition, Result\n')

# ### Logging progress into Markdown file
# MdLogger = open('PyQAlloyReport_'+dateString+'.md', "w")
# MdLogger.write('Composition, Result\n')

# # Import data
# print('\nImporting data.')
# df2 = pd.read_excel(datasheet, usecols="A:N", nrows=5000, skiprows=8)
# result = df2.to_json(orient="records")
# parsed = json.loads(result, strict=False)
# print('Imported '+str(parsed.__len__())+' datapoints.\n')

# # Convert metadata and data into database datapoints and upload
# for datapoint in parsed:
#     comp = datapoint['Composition'].replace(' ','')
#     print('Processing: '+comp)
#     try:
#         uploadEntry = datapoint2entry(metaData, datapoint)
#         MdLogger.write(comp+' ,Success! ')
#         propertyCheck = propertyChecker(uploadEntry)
#         if propertyCheck == True:
#             MdLogger.write('Hey! High '+property+ '!!!')
#         MdLogger.write('\n')

#     except ValueError as e:
#         exceptionMessage = str(e)
#         print(exceptionMessage)
#         MdLogger.write(comp + ',Fail!,<-------,'+exceptionMessage+'\n')
#         # print('Upload failed!\n')
#         pass
# MdLogger.close()


In [None]:
# f-string example: generate 100 comps, format left, right, center

# comps = []

# for i in range(0,100):
#     comp = Composition({'Cr':i,'Ni':(100-i)})
#     comps.append(comp.formula)

# print(comps)

# lengths = [len(comp) for comp in comps]
# max_str_len = max(lengths)
# formatting_len = max_str_len + 6

# print(max_str_len)

# # left aligned
# for comp in comps:
#     pprint(f'{comp:<{formatting_len}}')

# # right aligned
# for comp in comps:
#     pprint(f'{comp:>{formatting_len}}')

# # center aligned
# for comp in comps:
#     pprint(f'{comp:^{formatting_len}}')
