In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from io import StringIO
from ase.io import read

from pymatgen.io.ase import AseAtomsAdaptor
from pymatgen.analysis.diffraction import xrd
from pymatgen.core.spectrum import Spectrum

In [2]:
# Load the de-duplicated candidate table; the 'sub_structure' column holds
# the VASP/POSCAR text that the mXRD cell below parses.
df = pd.read_csv(filepath_or_buffer='./data/06_2_drop_duplicate.csv')
df

Unnamed: 0,sub_comp,mpid,reduced_formula,index,pretty_formula,composition,lower_bound,upper_bound,total_range,roost_ensemble_mean,e_hull_from_roost,roost_band_gap,sub_structure,spacegroup,wyckoffs,species
0,"['Li', 'S', 'Ge', 1.0, 1.0, 2.0]",mp-1001784,LiGe2S,16,LiGe2S,Li1Ge2S1,1.620245,1.88217,0.261925,-0.573515,0.149234,0.851359,Li1 Ge2 S1\n1.0\n3.357416 0.000000 5.490541\n1...,166,1_2_3,16_3_32
1,"['Li', 'S', 'Ge', 1.0, 1.0, 2.0]",mp-1025496,LiGe2S,240,LiGe2S,Li1Ge2S1,1.620245,1.88217,0.261925,-0.573515,0.149234,0.851359,Li2 Ge4 S2\n1.0\n3.529631 0.000000 0.000000\n-...,194,1_2_6,3_16_32
2,"['Li', 'S', 'Ge', 1.0, 1.0, 2.0]",mp-1029395,LiGe2S,352,LiGe2S,Li1Ge2S1,1.620245,1.88217,0.261925,-0.573515,0.149234,0.851359,Li2 Ge4 S2\n1.0\n4.160491 0.000000 -3.340280\n...,122,1_2_4,16_3_32
3,"['Li', 'S', 'As', 1.0, 1.0, 2.0]",mp-1001784,LiAs2S,17,LiAs2S,Li1As2S1,1.84265,1.89759,0.05494,-0.588746,0.078403,0.887471,Li1 As2 S1\n1.0\n3.357416 0.000000 5.490541\n1...,166,1_2_3,16_3_33
4,"['Li', 'S', 'As', 1.0, 1.0, 2.0]",mp-1025496,LiAs2S,241,LiAs2S,Li1As2S1,1.84265,1.89759,0.05494,-0.588746,0.078403,0.887471,Li2 As4 S2\n1.0\n3.529631 0.000000 0.000000\n-...,194,1_2_6,3_16_33
5,"['Li', 'S', 'As', 1.0, 1.0, 2.0]",mp-1029395,LiAs2S,353,LiAs2S,Li1As2S1,1.84265,1.89759,0.05494,-0.588746,0.078403,0.887471,Li2 As4 S2\n1.0\n4.160491 0.000000 -3.340280\n...,122,1_2_4,16_3_33
6,"['Li', 'B', 'S', 1.0, 1.0, 2.0]",mp-1001784,LiBS2,29,LiBS2,Li1B1S2,1.606668,2.023678,0.41701,-1.048681,0.012006,4.340585,Li1 B1 S2\n1.0\n3.357416 0.000000 5.490541\n1....,166,1_2_3,5_3_16
7,"['Li', 'B', 'S', 1.0, 1.0, 2.0]",mp-1025496,LiBS2,253,LiBS2,Li1B1S2,1.606668,2.023678,0.41701,-1.048681,0.012006,4.340585,Li2 B2 S4\n1.0\n3.529631 0.000000 0.000000\n-1...,194,1_2_6,3_5_16
8,"['Li', 'B', 'S', 1.0, 1.0, 2.0]",mp-1029395,LiBS2,365,LiBS2,Li1B1S2,1.606668,2.023678,0.41701,-1.048681,0.012006,4.340585,Li2 B2 S4\n1.0\n4.160491 0.000000 -3.340280\n-...,122,1_2_4,5_3_16
9,"['Li', 'Al', 'S', 1.0, 1.0, 2.0]",mp-1001784,LiAlS2,36,LiAlS2,Li1Al1S2,0.96556,2.671238,1.705677,-1.547465,-0.016013,4.11844,Li1 Al1 S2\n1.0\n3.357416 0.000000 5.490541\n1...,166,1_2_3,13_3_16


# Compute and post-process modified-XRD (mXRD) patterns

In [3]:
# --- mXRD settings (hoisted out of the per-structure loop) -----------------
# Group 16/17 species kept as the anion framework; every other species is
# stripped from the structure before the pattern is computed.
ANIONS = ('O', 'F', 'S', 'Cl', 'Se', 'Br', 'I', 'Te')
# Every structure is rescaled so that its cell volume per remaining atom
# equals this value (in the cell's own volume units).
TARGET_VOLUME_PER_ATOM = 40
# Peaks whose scaled intensity is not above this value are zeroed out.
INTENSITY_THRESHOLD = 0
# 2-theta grid in 0.1 steps (0.0 ... 90.9); only the first N_BINS_KEPT
# points (0.0 ... 89.9) end up in the final spectrum.
TWO_THETA_GRID = [round(x, 1) for x in np.arange(0, 91, 0.1)]
N_BINS_KEPT = 900
# Smearing width passed to Spectrum.smear and target sum passed to
# Spectrum.normalize('sum', ...).
SMEAR_SIGMA = 0.2
NORMALIZED_SUM = 10

# Grid value -> position lookup, used to drop each peak into its bin.
_GRID_INDEX = {angle: k for k, angle in enumerate(TWO_THETA_GRID)}
_XRD_CALCULATOR = xrd.XRDCalculator(wavelength='CrKb1')


def _compute_mxrd(poscar_text):
    """Return the modified-XRD fingerprint of one structure as a list of floats.

    Steps:
      1. Parse ``poscar_text`` (VASP/POSCAR format) with ASE.
      2. Strip atoms: if the structure has exactly two species, remove Li;
         otherwise remove every atom whose species is not in ``ANIONS``.
      3. Relabel all remaining atoms as 'S' and rescale the cell (and the
         atomic positions with it) isotropically so that the volume per
         remaining atom is ``TARGET_VOLUME_PER_ATOM``.
      4. Compute the scaled XRD pattern ('CrKb1' wavelength, 2-theta 0-89.98).
      5. Bin the peaks on ``TWO_THETA_GRID`` (angles rounded to 0.1), keeping
         the strongest peak per bin.
      6. Smear the first ``N_BINS_KEPT`` bins with a gaussian and normalize
         the result so that it sums to ``NORMALIZED_SUM``.

    Raises:
        ValueError: if no atoms are left after step 2.
    """
    atoms = read(StringIO(poscar_text), format='vasp')

    if len(set(atoms.get_chemical_symbols())) == 2:
        del atoms[[atom.index for atom in atoms if atom.symbol == 'Li']]
    else:
        del atoms[[atom.index for atom in atoms if atom.symbol not in ANIONS]]

    if len(atoms) == 0:
        # Without this guard the cell would be scaled to zero volume and the
        # failure would surface later as an unrelated-looking error.
        raise ValueError(
            'no atoms left after stripping species from structure: '
            + poscar_text.splitlines()[0]
        )

    # Make every structure chemically identical so only geometry matters.
    for atom in atoms:
        atom.symbol = 'S'

    # Isotropic rescale: new volume = n_atoms * TARGET_VOLUME_PER_ATOM.
    volume_ratio = (atoms.get_global_number_of_atoms() * TARGET_VOLUME_PER_ATOM) / atoms.cell.volume
    atoms.set_cell(atoms.cell * volume_ratio ** (1 / 3), scale_atoms=True)

    structure = AseAtomsAdaptor.get_structure(atoms)
    pattern = _XRD_CALCULATOR.get_pattern(
        structure=structure, scaled=True, two_theta_range=(0, 89.98)
    )

    # With INTENSITY_THRESHOLD == 0 this only zeroes non-positive intensities.
    intensities = pattern.y * (pattern.y > INTENSITY_THRESHOLD)

    # Keep the strongest peak that falls into each 0.1-wide bin.
    # Note: peaks at 89.95-89.98 round to the 90.0 bin, which lies beyond
    # N_BINS_KEPT and is therefore not part of the final spectrum.
    binned = [0] * len(TWO_THETA_GRID)
    for angle, intensity in zip(pattern.x, intensities):
        k = _GRID_INDEX[round(angle, 1)]
        binned[k] = max(binned[k], intensity)

    spectrum = Spectrum(TWO_THETA_GRID[:N_BINS_KEPT], binned[:N_BINS_KEPT])
    spectrum.smear(SMEAR_SIGMA, 'gaussian')
    spectrum.normalize('sum', NORMALIZED_SUM)
    return list(spectrum.y)


mxrd = []
for poscar_text in tqdm(df['sub_structure'], total=len(df)):
    mxrd.append(_compute_mxrd(poscar_text))

# Each 'mxrd' entry is a Python list; to_csv writes it out as its text form.
df['mxrd'] = mxrd
df.to_csv('./data/08_1_bulk_mxrd.csv', index=False)

  0%|          | 0/21 [00:00<?, ?it/s]

In [4]:
# Columns kept from the freshly computed table, and the reference-table
# columns mapped onto the same names so both frames can be stacked.
_new_columns = ['composition', 'mxrd', 'mpid']
_ref_column_map = {'formula_id': 'composition', 'r_xrd': 'mxrd', 'cond': 'conductivity'}

df_new_only_mxrd = pd.read_csv('./data/08_1_bulk_mxrd.csv')[_new_columns]

df_ref = pd.read_csv('./data/07_3_mxrd_from_ref.csv')[list(_ref_column_map)]
df_ref.columns = list(_ref_column_map.values())

In [5]:
# Preview the newly computed mXRD rows as re-loaded from CSV.
# NOTE(review): 'mxrd' was read back without a converter, so each entry is
# presumably the text form of the list rather than a list of floats — confirm
# before using it numerically.
df_new_only_mxrd

Unnamed: 0,composition,mxrd,mpid
0,Li1Ge2S1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1001784
1,Li1Ge2S1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1025496
2,Li1Ge2S1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1029395
3,Li1As2S1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1001784
4,Li1As2S1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1025496
5,Li1As2S1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1029395
6,Li1B1S2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1001784
7,Li1B1S2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1025496
8,Li1B1S2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1029395
9,Li1Al1S2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1001784


In [6]:
# Stack the new candidates on top of the reference set with a fresh 0..n-1
# index; columns present in only one frame ('mpid', 'conductivity') are
# filled with NaN for the other frame's rows.
df_new_and_old = pd.concat([df_new_only_mxrd, df_ref], ignore_index=True)
df_new_and_old

Unnamed: 0,composition,mxrd,mpid,conductivity
0,Li1Ge2S1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1001784,
1,Li1Ge2S1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1025496,
2,Li1Ge2S1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1029395,
3,Li1As2S1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1001784,
4,Li1As2S1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",mp-1025496,
...,...,...,...,...
544,B1Li1O14S4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
545,C3F9Li1O9Rb2S3,"[0.0, 0.0, 1.92979e-318, 2.4723075750492e-310,...",,
546,B7Li3O12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,
547,B11Li3O18,"[6.051556561682696e-169, 1.3000318180921978e-1...",,


In [7]:
# Persist the merged (new + reference) mXRD table without the row index.
df_new_and_old.to_csv(path_or_buf='./data/08_2_bulk_mxrd_merged.csv', index=False)