In [1]:
import re
from collections import defaultdict

from utils import *

In [2]:
p_reg = r'\(p.*\)'
q_reg = r'\(Q.*\)'

In [3]:
def extra_in_wiki(full_df, wiki_df, col):
    count = 0
    for value in wiki_df['mineral_name'].tolist():
        if not pd.notna(full_df[full_df['mineral_name'] == value][col].tolist()[0]):
            count += 1
    return count

In [4]:
wiki_df = pd.read_csv('../data/wikipedia/data.csv', index_col=0)
mwi_df = pd.read_csv('../data/merged-MWI.csv')

In [5]:
wiki_df = wiki_df.reset_index(drop=True)
wiki_df = wiki_df.rename(columns={'Mineral Name': 'mineral_name'})
wiki_df = process_df(wiki_df)
wiki_df.shape

(1556, 63)

In [6]:
wiki_df = wiki_df[~(wiki_df['mineral_name'] == 'nitre')]

In [7]:
wiki_df = remove_synonyms(wiki_df)

In [8]:
wiki_df.drop(columns=set(wiki_df.columns).intersection(mwi_df.columns).difference(
    {'mineral_name', 'habit', 'tenacity'}), inplace=True)

In [9]:
for col in wiki_df.columns:
    if len(re.findall(p_reg, col)) != 0:
        wiki_df[re.sub(p_reg, '', col)] = wiki_df[col].apply(lambda x: re.sub(q_reg, '', str(x)))
        wiki_df.drop(columns=[col], inplace=True)

In [10]:
wiki_df['nickel-strunz \'10th ed\', review of (9th ed/ 2009 update) '] = wiki_df[
    'nickel-strunz \'10th ed\', review of (9th ed/ 2009 update) '].combine_first(
    wiki_df['nickel-strunz 9th edition (updated 2009) '])

In [11]:
wiki_df.columns

Index(['mineral_name', '2v', 'caption', 'class', 'colour', 'dana', 'density',
       'description', 'diaphaneity', 'fluorescence', 'gravity', 'habit',
       'mohs', 'molweight', 'name', 'opticalprop', 'other', 'refractive',
       'solubility', 'strunz', 'symmetry', 'system', 'tenacity', 'unit cell',
       'art & architecture thesaurus id ', 'babelnet id ', 'commons category ',
       'crystal system ', 'dana 8th edition ', 'described by source ',
       'elhuyar zth id ', 'encyclopædia britannica online id ',
       'encyclopædia universalis id ', 'freebase id ',
       'google knowledge graph id ', 'gran enciclopèdia catalana id ',
       'great russian encyclopedia online id ', 'ima number, broad sense ',
       'ima status and/or rank ', 'kbpedia id ', 'kivid.info id ',
       'microsoft academic id ', 'mindat mineral id ', 'mohs' hardness ',
       'named after ',
       'nickel-strunz '10th ed', review of (9th ed/ 2009 update) ',
       'nickel-strunz 9th edition (updated 2009)

In [12]:
columns_to_drop = [
    '2v', 'caption', 'colour', 'dana', 'density',
    'description', 'fluorescence', 'molweight', 'name', 'other',
    'solubility', 'unit cell',
    'art & architecture thesaurus id ', 'babelnet id ', 'commons category ',
    'crystal system ', 'described by source ',
    'elhuyar zth id ',
    'encyclopædia universalis id ',
    'gran enciclopèdia catalana id ',
    'great russian encyclopedia online id ',
    'ima status and/or rank ', 'kbpedia id ',
    'mindat mineral id ', 'mohs\' hardness ',
    'nickel-strunz 9th edition (updated 2009) ',
    'solid solution series with ',
    'store norske leksikon id ',
    'wolfram language entity code '

]

In [13]:
wiki_df = wiki_df.drop(columns=columns_to_drop)

In [14]:
columns_to_rename = {
    'diaphaneity': 'transparency',
    'gravity': 'specific gravity (in gm/cc)',
    'mohs': 'hardness mohs',
    'opticalprop': 'optical data',
    'strunz': 'strunz 9th ed.',
    'system': 'crystal system',
    'unit cell': 'cell dimensions',
    'dana 8th edition ': 'dana 8th ed.',
    'named after ': 'name origin',
    'nickel-strunz \'10th ed\', review of (9th ed/ 2009 update) ': 'nickel-strunz 10th (pending) ed.',
    'space group ': 'space group',
    'streak color ': 'streak',
    'strunz 8th edition (series id, updated) ': 'strunz 8th ed.',
    'type locality (geology) ': 'locality',

}

In [15]:
wiki_df = wiki_df.rename(columns=columns_to_rename)

In [16]:
def handle_crystal_sys(x):
    x = re.sub('[\[\]]', '', x)
    x = re.sub('crystal system', '', x)
    x = x.split('|')[0]
    x = x.split('<')[0]
    return x

In [17]:
def handle_space(x):
    x = re.sub('[space group]', '', x)
    x = re.split('\/', x)[0]
    return x

In [18]:
def romanToInt(s):
    roman = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000, 'IV': 4, 'IX': 9,
             'XL': 40, 'XC': 90, 'CD': 400, 'CM': 900}
    i = 0
    num = 0
    while i < len(s):
        if i + 1 < len(s) and s[i:i + 2] in roman:
            num += roman[s[i:i + 2]]
            i += 2
        else:
            num += roman[s[i]]
            i += 1
    return str(num)

In [19]:
def handle_strunz_8(x):
    rom = x.split('/')[0]
    try:
        int_str = romanToInt(rom)
    except:
        return np.nan
    x = int_str + '/' + x.split('/')[1]
    return x

In [20]:
def handle_refractive(x):
    x = re.sub('<br.*?>', ',', x)
    x = re.sub('<.*?sub.*?>', '', x)
    x = re.sub('\(.*\)', '', x)
    return x

In [21]:
def handle_class(x):
    x = re.sub('[\[\]\']', '', x)
    x = x.split('(')[0]
    return x

In [22]:
wiki_df['specific gravity (in gm/cc)'] = wiki_df['specific gravity (in gm/cc)'].apply(
    lambda x: re.split('[a-z\(\,\{]', x)[0] if pd.notna(x) else x)

In [23]:
wiki_df['hardness mohs'] = wiki_df['hardness mohs'].apply(
    lambda x: re.split('[a-z\(\,\{\-\&\~]', x)[0] if pd.notna(x) else x)

In [24]:
wiki_df['strunz 9th ed.'] = wiki_df['strunz 9th ed.'].apply(
    lambda x: re.split('[\(]', x)[0] if pd.notna(x) else x)

In [25]:
wiki_df['tenacity'] = wiki_df['tenacity'].apply(
    lambda x: x.replace('[[\[\]]', '') if pd.notna(x) else x)

In [26]:
wiki_df['crystal system'] = wiki_df['crystal system'].apply(
    lambda x: handle_crystal_sys(x) if pd.notna(x) else x)

In [27]:
wiki_df['name origin'] = wiki_df['name origin'].apply(
    lambda x: re.sub('[\[\]\']', '', x) if pd.notna(x) else x)

In [28]:
wiki_df['locality'] = wiki_df['locality'].apply(
    lambda x: re.sub('[\[\]\']', '', x) if pd.notna(x) else x)

In [29]:
wiki_df['space group'] = wiki_df['space group'].apply(
    lambda x: handle_space(x) if pd.notna(x) else x)

In [30]:
wiki_df['strunz 8th ed.'] = wiki_df['strunz 8th ed.'].apply(
    lambda x: handle_strunz_8(x) if pd.notna(x) else x)

In [31]:
wiki_df['refractive'] = wiki_df['refractive'].apply(
    lambda x: handle_refractive(x) if pd.notna(x) else x)

In [32]:
wiki_df['class'] = wiki_df['class'].apply(lambda x: handle_class(x) if pd.notna(x) else x)

In [33]:
wiki_df['symmetry'] = wiki_df['symmetry'].apply(
    lambda x: re.sub('\[\[.*\]\]|{{.*}}|<[^>]*>|\([^\)]*\)|\'|\:', '', x) if pd.notna(x) else x)

In [34]:
wiki_df.columns = [x.strip() for x in wiki_df.columns]

In [35]:
df = pd.merge(mwi_df, wiki_df, how='outer', on='mineral_name')

In [36]:
attr_dic = defaultdict(lambda: defaultdict(list))

In [37]:
attr_dic['Crystal System']=                         ['Crystal System_x', 'Crystal System_y']
attr_dic['dana 8th ed.']=                           ['dana 8th ed._x', 'dana 8th ed._y']
attr_dic['Habit']=                                  ['Habit_x', 'Habit_y']
attr_dic['hardness mohs']=                          ['hardness mohs_x', 'hardness mohs_y']
attr_dic['Locality']=                               ['locality_x', 'locality_y']
attr_dic['Name Origin']=                            ['Name Origin_x', 'Name Origin_y']
attr_dic['Nickel-Strunz 10th (pending) ed.']=       ['Nickel-Strunz 10th (pending) ed._x','Nickel-Strunz 10th (pending) ed._y']
attr_dic['Optical Data']=                           ['Optical Data_x', 'Optical Data_y']
attr_dic['Space Group']=                            ['Space Group_x', 'Space Group_x']
attr_dic['specific gravity (in gm/cc)']=            ['specific gravity (in gm/cc)_x','specific gravity (in gm/cc)_y']
attr_dic['Streak']=                                 ['Streak_x', 'Streak_y']
attr_dic['Strunz 8th ed.']=                         ['Strunz 8th ed._x', 'Strunz 8th ed._y']
attr_dic['strunz 9th ed.']=                         ['strunz 9th ed._x', 'strunz 9th ed._y']
attr_dic['Tenacity']=                               ['Tenacity_x', 'Tenacity_y']
attr_dic['Transparency']=                           ['Transparency_x', 'Transparency_y']

In [38]:
for k, v in attr_dic.items():
    k = k.lower()
    v = [x.lower() for x in v]
    df[k] = df[v[0]].combine_first(df[v[1]])
    df.drop(columns=v, inplace=True)

In [39]:
df['space group'] = df['space group'].combine_first(df['space group_y'])
df.drop(columns=['space group_y'], inplace=True)

In [40]:
df['encyclopedia britannica online id'] = df['encyclopædia britannica online id'].copy()
df.drop(columns=['encyclopædia britannica online id'], inplace=True)

In [41]:
df = df.replace('', np.nan, regex=True)

In [42]:
df = process_df(df)

In [43]:
for col in df.columns:
    df[col] = df[col].astype(str).str.strip()

In [44]:
df.to_csv('../data/MWIW.csv', index=None)

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7079 entries, 0 to 7078
Data columns (total 100 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   mineral_name                       7079 non-null   object
 1   2v calculated                      7079 non-null   object
 2   2v measured                        7079 non-null   object
 3   anisotrophism                      7079 non-null   object
 4   approval history                   7079 non-null   object
 5   approval year                      7079 non-null   object
 6   axial ratios                       7079 non-null   object
 7   bireflectance                      7079 non-null   object
 8   birefringence                      7079 non-null   object
 9   boson index                        7079 non-null   object
 10  cell dimensions                    7079 non-null   object
 11  cell parameters                    7079 non-null   object
 12  class