In [69]:
import json
import pandas as pd
import numpy as np
from copy import deepcopy

In [62]:
def to_lower(df, col):
    df[col] = df[col].apply(lambda x: x.lower())
    return df

In [63]:
def process_df(df):
    df.columns = [x.lower() for x in df.columns]
    df.sort_index(axis=1, inplace=True)
    df.sort_values('mineral_name', inplace=True)
    df = to_lower(df, 'mineral_name')
    mid = df['mineral_name']
    df.drop(labels=['mineral_name'], axis=1,inplace = True)
    df.insert(0, 'mineral_name', mid)
    return df

In [64]:
def json_to_df(file_path):
    with open(file_path, 'r') as f:
        obj= json.load(f)
    records = []
    for k, v in obj.items():
        temp = deepcopy(v)
        temp['mineral_name'] = k
        records.append(temp)
    df = pd.DataFrame.from_records(records)
    df = process_df(df)
    return df

In [65]:
"""
use axis = 0 to remove colums
use axis = 1 to remove rows
"""
def filter_df(df, axis, threshold=30):
    min_count = int((threshold / 100) * df.shape[axis] + 1)
    filtered_df = df.dropna(axis= int(not axis), how='any', thresh=min_count).reset_index(drop=True)
    return filtered_df

In [66]:
def get_all_cols(dfs):
    cols = set()
    for df in dfs:
        cols.update(df.columns)
    return cols

In [42]:
def get_column_diff(df1, df2, col1, col2=None):
    if col2 is None:
        col2 = col1
    set_1 = set(df1[col1].tolist()).difference(set(df2[col2].tolist()))
    set_2 = set(df2[col2].tolist()).difference(set(df1[col1].tolist()))
    return set_1, set_2

In [33]:
mindat_df = json_to_df('../data/mindat/mindat-all.json')
mindat_df = filter_df(mindat_df, axis=0, threshold=2)
mindat_df = filter_df(mindat_df, axis=1, threshold=10)
mindat_df.shape

(6363, 73)

In [34]:
ima_df = pd.read_csv('../data/ima_mineral_list/ima-list.csv', index_col=0)
ima_df = ima_df.rename(columns={'name': 'mineral_name'})
ima_df = process_df(ima_df)
ima_df.shape

(5739, 7)

In [35]:
web_df = json_to_df('../data/webmineral/combined_minerals.json')
web_df = web_df.drop(web_df.columns[range(1, 134)], axis=1).reset_index(drop=True)
web_df.shape

(4660, 47)

In [60]:
cols = get_all_cols([mindat_df, web_df, ima_df])
len(cols)

116

### Common

In [43]:
common_df = pd.merge(mindat_df, ima_df, how='outer', on=['mineral_name'])
common_df.shape

(6414, 79)

In [52]:
mindat_minus_ima, ima_minus_mindat = get_column_diff(mindat_df, ima_df, 'mineral_name')
len(mindat_minus_ima), len(ima_minus_mindat)

(675, 51)

In [53]:
print(sorted(ima_minus_mindat))

['baumhauerite ii', 'buserite', 'calcioveatchite', 'chlorkyuygenite', 'cuatrocapaite-(nh4)', 'eleonorite', 'ferri-pedrizite', 'ferro-fluoro-pedrizite', 'ferrohögbomite-2n 2s', 'ferronigerite-2n 1s', 'ferronigerite-6n 6s', 'ferroqingheiite', "ferrotaaffeite-2n' 2s", "ferrotaaffeite-6n' 3s", 'fluorapophyllite-(nh4)', 'fluoro-pedrizite', 'fowlerite', 'georgiadesite', 'gersdorffite-p 213', 'gersdorffite-pa 3', 'gersdorffite-pca 21', 'gismondine', 'hydroxycalcioroméite', 'jagüéite', 'kësterite', 'magnesiobeltrandoite-2n 3s', 'magnesiohögbomite-2n 2s', 'magnesiohögbomite-2n 3s', 'magnesiohögbomite-2n 4s', 'magnesiohögbomite-6n 12s', 'magnesiohögbomite-6n 6s', 'magnesionigerite-2n 1s', 'magnesionigerite-6n 6s', "magnesiotaaffeite-2n '2s", "magnesiotaaffeite-6n' 3s", 'oskarssonite', 'oxycalcioroméite', 'phosphovanadylite-ba', 'pseudowollastonite', 'písekite-(y)', 'rathite-iv', 'redondite', 'roselite-β', 'staněkite', 'straβmannite', 'zincohögbomite-2n 2s', 'zincohögbomite-2n 6s', 'zinconigerite

In [50]:
common_df2 = pd.merge(ima_df, web_df, how='outer', on=['mineral_name'])
common_df2.shape

(6433, 53)

In [57]:
web_minus_ima, ima_minus_web = get_column_diff(web_df, ima_df, 'mineral_name')
len(web_minus_ima), len(ima_minus_web)

(694, 1773)

In [58]:
web_minus_mindat, mindat_minus_web = get_column_diff(web_df, mindat_df, 'mineral_name')
len(web_minus_ima), len(mindat_minus_web)

(694, 2253)

In [67]:
common_df[['ima status_y', 'ima status_x']].head(20)

Unnamed: 0,ima status_y,ima status_x
0,,A
1,Approved IMA 1975,A
2,Approved IMA 1994,A
3,Valid Species (Pre-IMA) 1956,G
4,Approved IMA 1983,A
5,Approved IMA 2006 (Dana # Added),A
6,Approved IMA 1991,A
7,,A
8,Valid Species (Pre-IMA) 1855,G
9,Approved IMA 1975,A


In [101]:
fil_df = filter_df(mindat_df, axis=0, threshold=3)
fil_df = filter_df(fil_df, axis=1, threshold=10)

In [102]:
fil_df.shape

(4351, 67)

### Hardness filter


In [73]:
hardness = mindat_df['hardness'].tolist()

In [74]:
vickers = [any('VH' in y for y in x) for x in hardness if x != np.nan and isinstance(x, list) ]

In [75]:
len(vickers)

4273