In [117]:
import json
import pandas as pd
import numpy as np
from copy import deepcopy

In [118]:
def to_lower(df, col):
    df[col] = df[col].apply(lambda x: x.lower())
    return df

In [119]:
def process_df(df):
    df.columns = [x.lower() for x in df.columns]
    df.sort_index(axis=1, inplace=True)
    df.sort_values('mineral_name', inplace=True)
    df = to_lower(df, 'mineral_name')
    mid = df['mineral_name']
    df.drop(labels=['mineral_name'], axis=1,inplace = True)
    df.insert(0, 'mineral_name', mid)
    return df

In [120]:
def json_to_df(file_path):
    with open(file_path, 'r') as f:
        obj= json.load(f)
    records = []
    for k, v in obj.items():
        temp = deepcopy(v)
        temp['mineral_name'] = ''.join(k.split())
        records.append(temp)
    df = pd.DataFrame.from_records(records)
    df = process_df(df)
    return df

In [121]:
"""
use axis = 0 to remove colums
use axis = 1 to remove rows
"""
def filter_df(df, axis, threshold=30):
    min_count = int((threshold / 100) * df.shape[axis] + 1)
    filtered_df = df.dropna(axis= int(not axis), how='any', thresh=min_count).reset_index(drop=True)
    return filtered_df

In [122]:
def get_all_cols(dfs):
    cols = set()
    for df in dfs:
        cols.update(df.columns)
    return cols

In [123]:
def get_column_diff(df1, df2, col1, col2=None):
    if col2 is None:
        col2 = col1
    set_1 = set(df1[col1].tolist()).difference(set(df2[col2].tolist()))
    set_2 = set(df2[col2].tolist()).difference(set(df1[col1].tolist()))
    return set_1, set_2

In [124]:
mindat_df = json_to_df('../data/mindat/mindat-all.json')
mindat_df = filter_df(mindat_df, axis=0, threshold=1)
# mindat_df = filter_df(mindat_df, axis=1, threshold=10)
mindat_df.shape

(9112, 81)

In [125]:
ima_df = pd.read_csv('../data/ima_mineral_list/ima-list.csv', index_col=0)
ima_df = ima_df.rename(columns={'name': 'mineral_name'})
ima_df['mineral_name'] = ima_df['mineral_name'].apply(lambda x: ''.join(x.split()))
ima_df = process_df(ima_df)
ima_df.shape

(5739, 7)

In [126]:
web_df = json_to_df('../data/webmineral/combined_minerals.json')
web_df = web_df.drop(web_df.columns[range(1, 134)], axis=1).reset_index(drop=True)
web_df.shape

(4660, 47)

In [93]:
cols = get_all_cols([mindat_df, web_df, ima_df])
len(cols)

124

### Common

In [127]:
common_df = pd.merge(mindat_df, ima_df, how='outer', on=['mineral_name'])
common_df.shape

(9142, 87)

In [128]:
mindat_minus_ima, ima_minus_mindat = get_column_diff(mindat_df, ima_df, 'mineral_name')
len(mindat_minus_ima), len(ima_minus_mindat)

(3403, 30)

In [96]:
print(sorted(ima_minus_mindat))

['baumhaueriteii', 'buserite', 'calcioveatchite', 'chlorkyuygenite', 'cuatrocapaite-(nh4)', 'eleonorite', 'ferri-pedrizite', 'ferro-fluoro-pedrizite', 'ferroqingheiite', "ferrotaaffeite-2n'2s", 'fluorapophyllite-(nh4)', 'fluoro-pedrizite', 'fowlerite', 'georgiadesite', 'gismondine', 'hydroxycalcioroméite', 'jagüéite', "magnesiotaaffeite-2n'2s", "magnesiotaaffeite-6n'3s", 'oskarssonite', 'oxycalcioroméite', 'phosphovanadylite-ba', 'pseudowollastonite', 'písekite-(y)', 'rathite-iv', 'redondite', 'roselite-β', 'staněkite', 'straβmannite', 'zvĕstovite-(zn)']


In [116]:
# common_df2[['density_x', 'density_y']].dropna()

Unnamed: 0,density_x,density_y
7,1.33 - 1.48 g/cm3 (Measured) 1.45 g/cm3 (Ca...,1.45
8,3.21 g/cm3 (Measured) 3.27 g/cm3 (Calculated),3.21
9,3.32 g/cm3 (Measured) 3.572 g/cm3 (Calculated),"3.31 - 3.57, Average = 3.44"
10,4.42 g/cm3 (Measured) 4.417 g/cm3 (Calculated),"4.29 - 4.34, Average = 4.31"
22,4.96 g/cm3 (Calculated),4.96
...,...,...
9058,5.15 g/cm3 (Calculated),5.15
9063,2.88 g/cm3 (Measured),2.88
9064,3.146 g/cm3 (Measured) 3.14 g/cm3 (Calculated),3.146
9066,13.32 g/cm3 (Measured) 13.42 g/cm3 (Calcula...,13.32


In [113]:
# same = [( 'cleavage_y', 'cleavage_x'), ('crystal system_x', 'crystal system_y')]

In [115]:
# for (x, y) in same:
#     common_df2[x] = common_df2[x].combine_first(common_df2[y])
#     print(common_df2[x].notna().sum())
#     common_df2.drop(columns=[y], inplace=True)

4757
6529


In [129]:
common_df2 = pd.merge(common_df, web_df, how='outer', on=['mineral_name'])
common_df2.shape
common_df2.sort_index(axis=1, inplace=True)
for i, c in enumerate(sorted(common_df2.columns)):
    print(i , ": ", c)


0 :  2v
1 :  anisotropism
2 :  approval history
3 :  approval year
4 :  axial ratios
5 :  bireflectance
6 :  birefringence
7 :  cell dimensions
8 :  cell parameters
9 :  chemical formula
10 :  class (h-m)
11 :  cleavage_x
12 :  cleavage_y
13 :  co-type localities
14 :  color
15 :  colour
16 :  colour in reflected light
17 :  comment
18 :  comments
19 :  common impurities
20 :  composition
21 :  country
22 :  crystal atlas
23 :  crystal structure
24 :  crystal system_x
25 :  crystal system_y
26 :  dana 7th ed.
27 :  dana 8th ed.
28 :  dana class
29 :  density_x
30 :  density_y
31 :  diaphaneity
32 :  dichroism (e)
33 :  dichroism (w)
34 :  dimorph of
35 :  dispersion
36 :  download
37 :  electron density
38 :  elements listed
39 :  empirical formula
40 :  environment
41 :  external links
42 :  fermion index
43 :  first published
44 :  first recorded locality
45 :  first ref
46 :  forms
47 :  formula_x
48 :  formula_y
49 :  fracture_x
50 :  fracture_y
51 :  geological setting
52 :  glads

In [130]:
common_df2.to_csv('../data/mindat/all-combined.csv')

In [69]:
web_minus_ima, ima_minus_web = get_column_diff(web_df, ima_df, 'mineral_name')
len(web_minus_ima), len(ima_minus_web)

(690, 1769)

In [58]:
web_minus_mindat, mindat_minus_web = get_column_diff(web_df, mindat_df, 'mineral_name')
len(web_minus_ima), len(mindat_minus_web)

(694, 2253)

In [67]:
common_df[['ima status_y', 'ima status_x']].head(20)

Unnamed: 0,ima status_y,ima status_x
0,,A
1,Approved IMA 1975,A
2,Approved IMA 1994,A
3,Valid Species (Pre-IMA) 1956,G
4,Approved IMA 1983,A
5,Approved IMA 2006 (Dana # Added),A
6,Approved IMA 1991,A
7,,A
8,Valid Species (Pre-IMA) 1855,G
9,Approved IMA 1975,A


In [101]:
fil_df = filter_df(mindat_df, axis=0, threshold=3)
fil_df = filter_df(fil_df, axis=1, threshold=10)

In [102]:
fil_df.shape

(4351, 67)

### Hardness filter


In [73]:
hardness = mindat_df['hardness'].tolist()

In [74]:
vickers = [any('VH' in y for y in x) for x in hardness if x != np.nan and isinstance(x, list) ]

In [75]:
len(vickers)

4273

## SOme other stuff

In [8]:
old_df = pd.r('../data/mindat/mindat-7k-all.json')
df = json_to_df('../data/mindat/mindat-all.json')
old_df.columns == df.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])