In [1]:
import re
import pandas as pd
import numpy as np

from utils import *

* formula_y should be used -> formula_x -> chemical formula
* merge density_y -> measured density (any)
* specific_gravity_new -> specific gravity (any)
* fracture_x -> fracture_y
* ima_status_y (expand) -> ima_status_x -> ima status
* check localities (locality, type locality, co-type localities, first recorded locality )
* merge axial ratio and ratio
* merge streak_x and streak_y
* merge transparency and diaphaneity
* z is number of molecules

* if we have a space group, then crystal system_y

In [2]:
def handle_2v(x):
    """Turn a raw 2V string into a 'measured|calculated' pair.

    The lower-cased text is cut at commas and only the first two pieces are
    kept; a missing second piece is padded with NaN. When the first piece does
    not contain 'measured', the two pieces trade places. For every non-missing
    piece the segment following its first ':' is taken, any parenthesised span
    is removed and surrounding whitespace is stripped.

    Returns:
        The two values joined by '|'; a missing value is rendered as 'nan'.

    Raises:
        IndexError: if a non-missing piece contains no ':'.
    """
    def _extract(piece):
        # A padded (missing) slot is passed through untouched.
        if pd.isna(piece):
            return piece
        after_label = piece.split(':')[1]
        return re.sub(r"\(.*\)", '', after_label).strip()

    pieces = x.lower().split(',')[:2]
    if len(pieces) == 1:
        pieces.append(np.nan)
    first, second = pieces
    if 'measured' in first:
        measured, calculated = first, second
    else:
        measured, calculated = second, first
    return '|'.join(map(str, (_extract(measured), _extract(calculated))))

In [3]:
def handle_density_x(x):
    """Turn a raw density string into a 'measured|calculated' pair.

    The lower-cased text is split on runs of two or more whitespace
    characters and only the first two pieces are kept; a missing second piece
    is padded with NaN. When the first piece does not contain 'measured', the
    two pieces trade places. Any parenthesised span is removed from each
    non-missing piece, which is then stripped.

    Returns:
        The two values joined by '|'; a missing value is rendered as 'nan'.
    """
    def _strip_parens(piece):
        # A padded (missing) slot is passed through untouched.
        if pd.isna(piece):
            return piece
        return re.sub(r"\(.*\)", '', piece).strip()

    pieces = re.split(r'\s{2,}', x.lower())[:2]
    if len(pieces) == 1:
        pieces.append(np.nan)
    first, second = pieces
    if 'measured' in first:
        ordered = (first, second)
    else:
        ordered = (second, first)
    return '|'.join(str(_strip_parens(piece)) for piece in ordered)

In [4]:
def handle_hardness(x):
    """Turn a raw hardness entry into a 'mohs|vickers' pair.

    The text is lower-cased and single quotes are removed. If it then starts
    with '[', it is treated as a bracketed, comma-separated list: the first
    item mentioning 'mohs' and the first item mentioning 'vickers' are used.
    Otherwise the whole text is used as the Mohs value when it mentions
    'mohs', and as the Vickers value in every other case.

    Clean-up is purely textual: 'on', 'scale' and 'mohs' are deleted from the
    Mohs text; 'vickers' and every '-' are deleted from the Vickers text.
    Whitespace is not trimmed.

    Returns:
        The two values joined by '|'; a missing value is rendered as 'nan'.

    Raises:
        IndexError: if the cleaned text is empty.
    """
    def _clean_mohs(text):
        return text.replace('on', '').replace('scale', '').replace('mohs', '')

    def _clean_vickers(text):
        return text.replace('vickers', '').replace('-', '')

    cleaned = x.lower().replace('\'', '')
    if cleaned[0] == '[':
        # List form: drop the surrounding brackets, then pick per scale.
        items = cleaned[1:-1].split(',')
        mohs_items = [item for item in items if 'mohs' in item]
        vickers_items = [item for item in items if 'vickers' in item]
        mohs = _clean_mohs(mohs_items[0]) if mohs_items else np.nan
        vickers = _clean_vickers(vickers_items[0]) if vickers_items else np.nan
    elif 'mohs' in cleaned:
        mohs, vickers = _clean_mohs(cleaned), np.nan
    else:
        mohs, vickers = np.nan, _clean_vickers(cleaned)
    return '|'.join(map(str, (mohs, vickers)))

In [5]:
def lowerUpperSplit(x):
    """Split run-together names at lower-case/upper-case boundaries.

    Every 'Buy from ' prefix and every ' - search for <name> specimens'
    suffix is removed first. The remaining text is then cut wherever an
    upper-case character directly follows a lower-case character or a '.',
    provided the piece collected so far is longer than two characters (so
    short prefixes such as 'Mc' are not split off).

    Args:
        x: Raw text containing one or more concatenated names.

    Returns:
        The pieces joined by '|'. Empty text (before or after the clean-up)
        yields ''.
    """
    x = x.replace('Buy from ', '')
    x = re.sub(r' \- search for (\w|\(|\)|\-)+ specimens', '', x)
    if not x:
        # Nothing left to split; previously x[0] raised IndexError here.
        return ''
    z = []
    t = x[0]
    for c_prev, c in zip(x, x[1:]):
        if c.isupper() and (c_prev.islower() or c_prev == '.') and len(t) > 2:
            z.append(t)
            t = ''
        t += c
    # Flush the piece still being accumulated; without this the final name
    # (or the only name) was silently dropped.
    z.append(t)
    return '|'.join(z)

In [6]:
# Load the combined table from the mindat data directory; the first CSV
# column becomes the index.
df = pd.read_csv('../data/mindat/all-combined.csv', index_col=0)

In [7]:
# remove_synonyms comes from the star-import of `utils`; its implementation is
# not visible in this notebook. NOTE(review): judging by the printed output
# (e.g. "color has synonym colour") it presumably collapses columns whose
# names are synonyms — confirm against utils.
df = remove_synonyms(df)

color has synonym colour
comments has synonym comment
luster has synonym lustre
references has synonym reference


In [8]:
# Names of the columns that are dropped from df before the per-column clean-up.
columns_to_drop = [
    'comments',
    'crystal atlas',
    'crystal structure',
    'download',
    'external links',
    'forms',
    'hardness data',
    'health risks',
    'idealised formula',
    'ima status notes',
    'images',
    'industrial uses',
    'name pronunciation',
    'notes',
    'optical extinction',
    'oxide wt%',
    'pronounciation',  # (sic) presumably matches the header spelling in the data — verify
    'reference list',
    'references',
    'sample references',
    'search engines',
    'see also',
    'setting',
    'thermal behaviour',
]

In [9]:
# Discard every column named in columns_to_drop (pandas raises KeyError if
# one of them is absent from df).
df = df.drop(columns=columns_to_drop)

In [10]:
# Lookup from abbreviated IMA status code to its spelled-out label.
ima_map = dict(
    A='Approved',
    G='Grandfathered',
    Rd='Redefined',
    Rn='Renamed',
    Q='Questionable',
)

In [11]:
# Parse the raw '2v' text with handle_2v, spread its 'a|b' result over two
# new columns, then discard the raw column. Missing entries are left as-is.
df[['measured 2v', 'calculated 2v']] = (
    df['2v']
    .apply(lambda raw: raw if pd.isna(raw) else handle_2v(raw))
    .str.split('|', expand=True)
)
df = df.drop(columns=['2v'])

In [12]:
# Store approval years as integers while keeping missing values missing.
# An element-wise int() inside Series.apply does not survive: with NaN present
# pandas re-infers float64 for the result, so years came back as e.g. 1990.0.
# The nullable 'Int64' dtype holds integers and <NA> side by side.
# NOTE: a non-integral year now raises instead of being truncated.
df['approval year'] = pd.to_numeric(df['approval year']).astype('Int64')

In [13]:
# Remove every 'Molecular Weight = ' label from the composition text;
# missing entries pass through unchanged.
df['composition'] = df['composition'].apply(
    lambda text: text if pd.isna(text) else text.replace('Molecular Weight = ', ''))

In [14]:
# Keep only what precedes the first ':' (trimmed); missing entries pass
# through unchanged.
df['dana 8th ed.'] = df['dana 8th ed.'].apply(
    lambda entry: entry if pd.isna(entry) else entry.partition(':')[0].strip())

In [15]:
# Keep the text between the first and second ')' (trimmed). An entry without
# any ')' raises IndexError; missing entries pass through unchanged.
df['dana class'] = df['dana class'].apply(
    lambda entry: entry if pd.isna(entry) else entry.split(')', 2)[1].strip())

In [16]:
# Parse the raw 'density_x' text with handle_density_x and spread its 'a|b'
# result over two new columns.
df[['measured density', 'calculated density']] = (
    df['density_x']
    .apply(lambda raw: raw if pd.isna(raw) else handle_density_x(raw))
    .str.split('|', expand=True)
)
# Cut the measured value at its first 'g'.
# NOTE(review): 'calculated density' is not cut the same way — confirm the
# asymmetry is intended.
df['measured density'] = df['measured density'].apply(
    lambda value: value if pd.isna(value) else value.partition('g')[0])
df = df.drop(columns=['density_x'])

In [17]:
# Keep only what precedes the first ','; missing entries pass through unchanged.
df['density_y'] = df['density_y'].apply(
    lambda entry: entry if pd.isna(entry) else entry.partition(',')[0])

In [18]:
# Collect every "<number> gm/cc" figure in the 'electron density' text and
# spread them over two columns; the assignment requires the split to yield
# exactly two columns.
df[['electron density', 'specific_gravity_new']] = (
    df['electron density']
    .apply(lambda raw: raw if pd.isna(raw)
           else '|'.join(re.findall(r'\d+\.\d+ gm\/cc', raw)))
    .str.split('|', expand=True)
)
# Cut the second figure at its first 'g'.
df['specific_gravity_new'] = df['specific_gravity_new'].apply(
    lambda value: value if pd.isna(value) else value.partition('g')[0])

In [19]:
# Keep only what precedes the first '-'; missing entries pass through unchanged.
df['elements listed'] = df['elements listed'].apply(
    lambda entry: entry if pd.isna(entry) else entry.partition('-')[0])

In [20]:
# Collect every decimal number in the 'fermion index' text and spread them
# over two columns; the assignment requires the split to yield exactly two
# columns.
df[['fermion index', 'boson index']] = (
    df['fermion index']
    .apply(lambda raw: raw if pd.isna(raw) else '|'.join(re.findall(r'\d+\.\d+', raw)))
    .str.split('|', expand=True)
)

In [21]:
# Keep only what precedes the first '-'; missing entries pass through unchanged.
df['fracture_y'] = df['fracture_y'].apply(
    lambda entry: entry if pd.isna(entry) else entry.partition('-')[0])

In [22]:
# Keep only what precedes the first '-'; missing entries pass through unchanged.
df['habit'] = df['habit'].apply(
    lambda entry: entry if pd.isna(entry) else entry.partition('-')[0])

In [23]:
# Parse the raw 'hardness_x' text with handle_hardness, spread its
# 'mohs|vickers' result over two new columns, then discard the raw column.
df[['hardness mohs', 'hardness vickers']] = (
    df['hardness_x']
    .apply(lambda raw: raw if pd.isna(raw) else handle_hardness(raw))
    .str.split('|', expand=True)
)
df = df.drop(columns=['hardness_x'])

In [611]:
# Inspection only: list the current column names.
# NOTE(review): this cell's execution count (611) is out of sequence with its
# neighbours, so the saved output below may not reflect the frame at this
# point of a fresh top-to-bottom run.
df.columns.tolist()

['mineral_name',
 '2v calculated',
 '2v measured',
 'anisotrophism',
 'approval history',
 'approval year',
 'axial ratios',
 'bireflectance',
 'birefringence',
 'boson index',
 'cell dimensions',
 'cell parameters',
 'class (h-m)',
 'cleavage',
 'color',
 'common impurities',
 'country',
 'dana 7th ed.',
 'dana class',
 'density calculated (in gm/cc)',
 'density measured (in gm/cc)',
 'dichroism (e)',
 'dichroism (w)',
 'dispersion',
 'electron density',
 'elements listed',
 'empirical formula',
 'environment',
 'epitaxial minerals',
 'fermion index',
 'first published',
 'first ref',
 'formula',
 'fracture',
 'geological setting',
 'gladstone-dale',
 'hardness vickers',
 "hey's cim ref.",
 'ima status',
 'ima year',
 'in uv light',
 'internal reflections',
 'ir spectrum',
 'isostructural with',
 'luminescence',
 'luster',
 'magnetism',
 'member of',
 'mindat.org url',
 'mineral dealers',
 'molecular weight',
 'morphology',
 'number of molecules in unit cell',
 'parting',
 'photoelect

In [24]:
# Keep only the text before the first whitespace-hyphen-whitespace separator;
# missing entries pass through unchanged.
# The pattern is a raw string: in a normal literal '\s' and '\-' are invalid
# escape sequences (DeprecationWarning, SyntaxWarning from Python 3.12).
df['hardness_y'] = df['hardness_y'].apply(
    lambda x: re.split(r'\s\-\s', x)[0] if pd.notna(x) else x)

In [25]:
# Keep only what precedes the first ':'; missing entries pass through unchanged.
df["hey's cim ref."] = df["hey's cim ref."].apply(
    lambda entry: entry if pd.isna(entry) else entry.partition(':')[0])

In [26]:
# Expand the status code (first two characters of the trimmed text, trimmed
# again) to its full label via ima_map; missing entries pass through.
# NOTE(review): a code that is not a key of ima_map raises KeyError.
df['ima status_y'] = df['ima status_y'].apply(
    lambda status: status if pd.isna(status) else ima_map[status.strip()[:2].strip()])

In [27]:
# Remove the boilerplate sentence that accompanies the URL; missing entries
# pass through unchanged.
df['mindat.org url'] = df['mindat.org url'].apply(
    lambda url: url if pd.isna(url)
    else url.replace('Please feel free to link to this page.', ''))

In [28]:
# Re-format the run-together dealer text with lowerUpperSplit ('|'-separated);
# missing entries pass through unchanged.
df['mineral dealers'] = df['mineral dealers'].apply(
    lambda dealers: dealers if pd.isna(dealers) else lowerUpperSplit(dealers))

In [29]:
# Keep only what precedes the first ':'; missing entries pass through unchanged.
df['nickel-strunz 10th (pending) ed.'] = df['nickel-strunz 10th (pending) ed.'].apply(
    lambda entry: entry if pd.isna(entry) else entry.partition(':')[0])

In [30]:
# Keep the first "<number> barns/electron" figure found in the text; missing
# entries pass through unchanged.
# next(iter(...), np.nan) yields NaN when the text holds no such figure,
# where indexing the empty findall() result with [0] raised IndexError.
df['photoelectric'] = df['photoelectric'].apply(
    lambda x: next(iter(re.findall(r'\d+\.\d+ barns\/electron', x)), np.nan)
    if pd.notna(x) else x)

In [31]:
# Keep only what precedes the first '(' (trimmed); missing entries pass
# through unchanged.
df['radioactivity'] = df['radioactivity'].apply(
    lambda entry: entry if pd.isna(entry) else entry.partition('(')[0].strip())

In [32]:
# Delete the span from the first '(' to the last ')' on a line; missing
# entries pass through unchanged.
df['specific gravity'] = df['specific gravity'].apply(
    lambda entry: entry if pd.isna(entry) else re.sub(r"\(.*\)", '', entry))

In [33]:
# Keep the text between the first and second '-', lower-cased. An entry
# without any '-' raises IndexError; missing entries pass through unchanged.
df['strunz class'] = df['strunz class'].apply(
    lambda entry: entry if pd.isna(entry) else entry.split('-', 2)[1].lower())

In [34]:
# Write the cleaned frame (index included) to the processed-data CSV.
df.to_csv('../data/processed-MWI.csv')