In [750]:
import json
import re
import pandas as pd
import numpy as np

from copy import deepcopy
from nltk.corpus import wordnet

* formula_y should be used -> formula_x -> chemical formula
* merge density_y -> measured density (any)
* specific_gravity_new -> specific gravity (any)
* fracture_x -> fracture_y
* ima_status_y (expand) -> ima_status_x -> ima status
* check localities (locality, type locality, co-type localities, first recorded locality)
* merge axial ratio and ratio
* merge streak_x and streak_y
* merge transparency and diaphaneity
* z is number of molecules

* if we have a space group, then use crystal system_y

In [751]:
def get_synonyms(word):
    """Return the set of lemma names found across every WordNet synset of ``word``."""
    names = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            names.add(lemma.name())
    return names

In [752]:
def remove_synonyms(df):
    """Merge columns whose labels are synonyms of one another.

    For every column label, each synonym reported by ``get_synonyms`` that is
    also a column is folded into it with ``Series.combine_first`` (values of
    the kept column win; gaps are filled from the synonym column), and the
    synonym column is then dropped. One line is printed per merge.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are checked against each other.

    Returns
    -------
    pandas.DataFrame
        A new frame with synonym columns merged; the input is left unmodified.
    """
    # Work on a copy so the caller's frame is never half-updated.
    df = df.copy()
    done_with = set()
    # Snapshot the labels: `df` is rebound inside the loop as columns are dropped.
    for col in list(df.columns):
        # A column already merged away as someone else's synonym no longer
        # exists; indexing it would raise KeyError.
        if col not in df.columns:
            continue
        for syn in get_synonyms(col):
            if syn != col and syn in df.columns and syn not in done_with:
                print(f"{col} has synonym {syn}")
                df[col] = df[col].combine_first(df[syn])
                df = df.drop(columns=[syn])
                done_with.update([syn, col])
    return df

In [753]:
def handle_2v(x):
    """Turn a raw 2V string into ``'<measured>|<calculated>'``.

    The lower-cased text is split on commas and only the first two pieces are
    kept. The piece containing ``'measured'`` goes first; if the first piece
    does not contain it, the two pieces are swapped. For each piece present,
    the text after the first ``':'`` is kept, parenthesised text is removed
    and the result is stripped. A missing piece is rendered as ``'nan'``.
    """
    pieces = x.lower().split(',')[:2]
    if len(pieces) < 2:
        pieces.append(np.nan)
    measured, calculated = pieces
    if 'measured' not in measured:
        measured, calculated = calculated, measured

    rendered = []
    for piece in (measured, calculated):
        if pd.notna(piece):
            after_colon = piece.split(':')[1]
            piece = re.sub(r"\(.*\)", '', after_colon).strip()
        rendered.append(str(piece))
    return '|'.join(rendered)

In [754]:
def handle_density_x(x):
    """Turn a raw density string into ``'<measured>|<calculated>'``.

    The lower-cased text is split on runs of two or more whitespace
    characters and only the first two fields are kept. The field containing
    ``'measured'`` goes first; if the first field does not contain it, the
    two are swapped. Parenthesised text is removed from each field present
    and the result is stripped. A missing field is rendered as ``'nan'``.
    """
    fields = re.split(r'\s{2,}', x.lower())[:2]
    if len(fields) < 2:
        fields.append(np.nan)
    measured, calculated = fields
    if 'measured' not in measured:
        measured, calculated = calculated, measured

    rendered = []
    for field in (measured, calculated):
        if pd.notna(field):
            field = re.sub(r"\(.*\)", '', field).strip()
        rendered.append(str(field))
    return '|'.join(rendered)


In [755]:
def handle_hardness(x):
    """Turn a raw hardness string into ``'<mohs>|<vickers>'``.

    The input is lower-cased and single quotes are removed. If it starts with
    ``'['`` it is treated as a bracketed, comma-separated list and the first
    entry mentioning ``'mohs'`` / ``'vickers'`` fills the respective slot.
    Otherwise the whole string is a Mohs value when it mentions ``'mohs'`` and
    a Vickers value when it does not. Marker words are deleted from the kept
    text (no whitespace trimming); an unfilled slot is rendered as ``'nan'``.
    """
    def _as_mohs(text):
        return text.replace('on', '').replace('scale', '').replace('mohs', '')

    def _as_vickers(text):
        return text.replace('vickers', '').replace('-', '')

    text = x.lower().replace('\'', '')
    if text[0] != '[':
        # Plain string: exactly one of the two slots is filled.
        if 'mohs' in text:
            mohs, vickers = _as_mohs(text), np.nan
        else:
            mohs, vickers = np.nan, _as_vickers(text)
    else:
        # Bracketed list: take the first matching entry for each scale.
        entries = text[1:-1].split(',')
        mohs = next((_as_mohs(e) for e in entries if 'mohs' in e), np.nan)
        vickers = next((_as_vickers(e) for e in entries if 'vickers' in e), np.nan)
    return '|'.join([str(mohs), str(vickers)])
    

In [756]:
def lowerUpperSplit(x):
    """Split run-together entries at lower/upper-case boundaries.

    ``'Buy from '`` prefixes and ``' - search for <name> specimens'`` phrases
    are removed first. A new segment then starts wherever an upper-case
    character directly follows a lower-case character or a ``'.'``, provided
    the current segment is already longer than two characters (so short
    prefixes such as ``'Mc'`` are not split off).

    Parameters
    ----------
    x : str
        Concatenated text.

    Returns
    -------
    str
        The segments joined with ``'|'``, including the trailing one; an
        empty string when nothing is left after the removals.
    """
    x = x.replace('Buy from ', '')
    x = re.sub(r' \- search for (\w|\(|\)|\-)+ specimens', '', x)
    if not x:
        # Nothing to split; also avoids indexing an empty string below.
        return ''
    z = []
    t = x[0]
    for (c_prev, c) in zip(x, x[1:]):
        if c.isupper() and (c_prev.islower() or c_prev == '.') and len(t) > 2:
            z.append(t)
            t = ''
        t += c
    # Flush the final segment, which is otherwise silently lost.
    z.append(t)
    return '|'.join(z)

In [757]:
# Load the combined CSV; its first column becomes the index.
df = pd.read_csv(
    filepath_or_buffer='../data/mindat/all-combined.csv',
    index_col=0,
)

In [758]:
# Fold columns whose labels are synonyms of each other into one column;
# remove_synonyms prints one line per merged pair.
df = remove_synonyms(df)

color has synonym colour
comments has synonym comment
luster has synonym lustre
references has synonym reference


In [759]:
# df.info(max_cols=200)

In [760]:
# Column labels that are removed from df via df.drop(columns=columns_to_drop).
columns_to_drop = [
    'comments',
    'crystal atlas',
    'crystal structure',
    'download',
    'external links',
    'forms',
    'hardness data',
    'health risks',
    'idealised formula',
    'ima status notes',
    'images',
    'industrial uses',
    'name pronunciation',
    'notes',
    'optical extinction',
    'oxide wt%',
    'pronounciation',
    'reference list',
    'references',
    'sample references',
    'search engines',
    'see also',
    'setting',
    'thermal behaviour',
]

In [761]:
# Remove every column listed in columns_to_drop.
df = df.drop(columns_to_drop, axis='columns')

In [762]:
# Expansion of the short IMA status codes to their full names.
ima_map = dict(
    A='Approved',
    G='Grandfathered',
    Rd='Redefined',
    Rn='Renamed',
    Q='Questionable',
)

In [763]:
# df['2v'].dropna()[59].split(',')


In [764]:
# df[['gladstone-dale', 'mineral_name']].dropna().head(10)

In [765]:
# df[['hardness mohs', 'hardness vickers']].iloc[0:30, :]

In [766]:
# Parse the raw '2v' text into 'measured|calculated', split that into two
# columns, then discard the raw column. Missing values pass through untouched.
df[['measured 2v', 'calculated 2v']] = (
    df['2v']
    .apply(lambda raw: raw if pd.isna(raw) else handle_2v(raw))
    .str.split('|', expand=True)
)
df = df.drop('2v', axis='columns')

In [767]:
# Store approval years as integers. A plain int/NaN mix is re-inferred by
# pandas as float64 (undoing the int() conversion), so missing entries become
# pd.NA and the column is cast to the nullable 'Int64' dtype.
df['approval year'] = (
    df['approval year']
    .apply(lambda x: int(x) if pd.notna(x) else pd.NA)
    .astype('Int64')
)

In [768]:
# Strip the 'Molecular Weight = ' label from the composition text.
df['composition'] = df['composition'].apply(
    lambda value: value if pd.isna(value) else value.replace('Molecular Weight = ', '')
)

In [769]:
# Keep only the text before the first ':' (whitespace-trimmed).
df['dana 8th ed.'] = df['dana 8th ed.'].apply(
    lambda value: value if pd.isna(value) else value.split(':')[0].strip()
)

In [770]:
# Keep the second ')'-separated field, i.e. the text following the first ')'.
df['dana class'] = df['dana class'].apply(
    lambda value: value if pd.isna(value) else value.split(')')[1].strip()
)

In [771]:
# Parse 'density_x' into 'measured|calculated' and split into two columns.
df[['measured density', 'calculated density']] = (
    df['density_x']
    .apply(lambda raw: raw if pd.isna(raw) else handle_density_x(raw))
    .str.split('|', expand=True)
)
# Cut the measured value at the first 'g'.
df['measured density'] = df['measured density'].apply(
    lambda value: value if pd.isna(value) else value.split('g')[0]
)
df = df.drop('density_x', axis='columns')

In [772]:
# Keep only the text before the first ','.
df['density_y'] = df['density_y'].apply(
    lambda value: value if pd.isna(value) else value.split(',')[0]
)

In [773]:
# Pull every '<number> gm/cc' occurrence out of the text, join them with '|'
# and split them into the two target columns.
df[['electron density', 'specific_gravity_new']] = (
    df['electron density']
    .apply(
        lambda raw: raw if pd.isna(raw)
        else '|'.join(re.findall(r'\d+\.\d+ gm\/cc', raw))
    )
    .str.split('|', expand=True)
)
# Cut the specific-gravity value at the first 'g'.
df['specific_gravity_new'] = df['specific_gravity_new'].apply(
    lambda value: value if pd.isna(value) else value.split('g')[0]
)

In [774]:
# Keep only the text before the first '-'.
df['elements listed'] = df['elements listed'].apply(
    lambda value: value if pd.isna(value) else value.split('-')[0]
)

In [775]:
# Extract the decimal numbers found in the text, join them with '|' and split
# them into the fermion / boson columns.
df[['fermion index', 'boson index']] = (
    df['fermion index']
    .apply(
        lambda raw: raw if pd.isna(raw)
        else '|'.join(re.findall(r'\d+\.\d+', raw))
    )
    .str.split('|', expand=True)
)

In [776]:
# Keep only the text before the first '-'.
df['fracture_y'] = df['fracture_y'].apply(
    lambda value: value if pd.isna(value) else value.split('-')[0]
)

In [777]:
# Keep only the text before the first '-'.
df['habit'] = df['habit'].apply(
    lambda value: value if pd.isna(value) else value.split('-')[0]
)

In [778]:
# Parse 'hardness_x' into 'mohs|vickers', split into two columns, then discard
# the raw column.
df[['hardness mohs', 'hardness vickers']] = (
    df['hardness_x']
    .apply(lambda raw: raw if pd.isna(raw) else handle_hardness(raw))
    .str.split('|', expand=True)
)
df = df.drop('hardness_x', axis='columns')

In [779]:
# Keep only the text before the first ' - ' separator (any whitespace on
# either side of the dash). The pattern is a raw string: in a plain literal
# '\s' and '\-' are invalid escape sequences, which Python warns about.
df['hardness_y'] = df['hardness_y'].apply(lambda x: re.split(r'\s\-\s', x)[0] if pd.notna(x) else x)

In [780]:
# Keep only the text before the first ':'.
df["hey's cim ref."] = df["hey's cim ref."].apply(
    lambda value: value if pd.isna(value) else value.split(':')[0]
)

In [781]:
# Look up the (trimmed) first two characters of the status in ima_map.
df['ima status_y'] = df['ima status_y'].apply(
    lambda value: value if pd.isna(value) else ima_map[value.strip()[:2].strip()]
)

In [782]:
# Remove the boilerplate sentence appended to the URL text.
df['mindat.org url'] = df['mindat.org url'].apply(
    lambda value: value if pd.isna(value)
    else value.replace('Please feel free to link to this page.', '')
)

In [783]:
# Break the run-together dealer text into '|'-separated entries.
df['mineral dealers'] = df['mineral dealers'].apply(
    lambda value: value if pd.isna(value) else lowerUpperSplit(value)
)

In [784]:
# Keep only the text before the first ':'.
df['nickel-strunz 10th (pending) ed.'] = df['nickel-strunz 10th (pending) ed.'].apply(
    lambda value: value if pd.isna(value) else value.split(':')[0]
)

In [785]:
# Keep the first '<number> barns/electron' occurrence found in the text.
df['photoelectric'] = df['photoelectric'].apply(
    lambda value: value if pd.isna(value)
    else re.findall(r'\d+\.\d+ barns\/electron', value)[0]
)

In [786]:
# Keep only the text before the first '(' (whitespace-trimmed).
df['radioactivity'] = df['radioactivity'].apply(
    lambda value: value if pd.isna(value) else value.split('(')[0].strip()
)

In [787]:
# Delete parenthesised text (first '(' through last ')') from the value.
df['specific gravity'] = df['specific gravity'].apply(
    lambda value: value if pd.isna(value) else re.sub(r"\(.*\)", '', value)
)

In [788]:
# Keep the second '-'-separated field, lower-cased.
df['strunz class'] = df['strunz class'].apply(
    lambda value: value if pd.isna(value) else value.split('-')[1].lower()
)

In [789]:
# Persist the cleaned frame (index included) as CSV.
df.to_csv(path_or_buf='../data/processed-MWI.csv')

density_x
8
1.33 - 1.48
g / cm3(Measured)    1.45
g / cm3(Calculated)
mindat.org
url
1
https: // www.mindat.org / min - 1877.
htmlPlease
feel
free
to
link
to
this
page.
[a - z] + [A - Z][a - z] + jkbdjhdjHGkjbkbd
mineral
dealers
mineral
dealers
24
Blue
Gems
Australian & InternationalQuality
Mineral
Specimens
For
Sale
~ Cal
Neva
Mineral
CompanyFabre
Minerals - search
for Abuite specimensFine Minerals from Weinrich Minerals, Inc.Buy from David K Joyce mineralsWilensky Exquisite MineralsQuality Minerals at Fair PricesTop quality minerals from Kristalle of CaliforniaBuy rare minerals from Excalibur MineralsHigh-end worldwide specimens & outstanding customer serviceWendel Minerals - Auction & Shop
7
Buy
from McDougall MineralsFine

Minerals
from Weinrich Minerals, Inc.Buy
from David K

Joyce
mineralsQuality
Minerals
at
Fair
PricesBlue
Gems
Australian & InternationalQuality
Mineral
Specimens
For
Sale
~ Cal
Neva
Mineral
CompanyHigh - end
worldwide
specimens & outstanding
customer
serviceWilensky
Exquisite
MineralsFabre
Minerals - search
for Abellaite specimensTop quality minerals from Kristalle of CaliforniaWendel Minerals - Auction & Shop