In [123]:
import os
import sys
import pandas as pd
import re
from nltk.corpus import wordnet
import unidecode

In [124]:
p_reg = r'\(p.*\)'
q_reg = r'\(Q.*\)'

In [125]:
def get_synonyms(word):
    syns = [lm.name() for syn in wordnet.synsets(word) for lm in syn.lemmas()]
    return set(syns)

In [126]:
def to_lower(df, col):
    df[col] = df[col].apply(lambda x: x.lower())
    return df

In [127]:
def remove_synonyms(df):
    done_with = set()
    for col in df.columns:
        for syn in get_synonyms(col):
            if syn != col and syn in df.columns and syn not in done_with:
                print(f"{col} has synonym {syn}")
                df[col] = df[col].combine_first(df[syn])
                df = df.drop(columns=[syn])
                done_with.update([syn, col])
    return df

In [128]:
def process_df(df):
    df.columns = [x.lower() for x in df.columns]
    df.sort_index(axis=1, inplace=True)
    df.sort_values('mineral_name', inplace=True)
    df = to_lower(df, 'mineral_name')
    df['mineral_name'] = df['mineral_name'].apply(lambda x: unidecode.unidecode(x))
    mid = df['mineral_name']
    df.drop(labels=['mineral_name'], axis=1,inplace = True)
    df.insert(0, 'mineral_name', mid)
    return df

In [129]:
def get_column_diff(df1, df2, col1, col2=None):
    if col2 is None:
        col2 = col1
    set_1 = set(df1[col1].tolist()).difference(set(df2[col2].tolist()))
    set_2 = set(df2[col2].tolist()).difference(set(df1[col1].tolist()))
    return set_1, set_2

In [130]:
"""
use axis = 0 to remove colums
use axis = 1 to remove rows
"""
def filter_df(df, axis, threshold=30):
    min_count = int((threshold / 100) * df.shape[axis] + 1)
    filtered_df = df.dropna(axis= int(not axis), how='any', thresh=min_count).reset_index(drop=True)
    return filtered_df

In [131]:
def extra_in_wiki(full_df, wiki_df, col):
    count = 0
    for value in wiki_df['mineral_name'].tolist():
        # print(full_df[full_df['mineral_name'] == value][col].tolist())
        if len(full_df[full_df['mineral_name'] == value][col].tolist()) == 0:
            count+=1
    return count

In [132]:
wiki_df = pd.read_csv('../data/wikipedia/Wikipedia Data.csv', index_col=0)
mwi_df = pd.read_csv('../data/merged-MWI.csv')

In [133]:
wiki_df = wiki_df.reset_index(drop=True)

In [145]:
wiki_df.shape

(1556, 39)

In [135]:
wiki_df = wiki_df.rename(columns={'Mineral Name': 'mineral_name'})

In [136]:
wiki_df = process_df(wiki_df)

In [137]:
wiki_df = wiki_df[~(wiki_df['mineral_name'] == 'nitre')]

In [138]:
wiki_df = remove_synonyms(wiki_df)

In [139]:
wiki_df = filter_df(wiki_df, axis = 0, threshold=20)
# filter_df(wiki_df, axis = 1, threshold=20).shape

In [140]:
wiki_df.drop(columns=set(wiki_df.columns).intersection(mwi_df.columns).difference({'mineral_name'}), inplace=True)

In [141]:
for col in wiki_df.columns:
    if len(re.findall(p_reg, col)) != 0:
        wiki_df[re.sub(p_reg, '', col)]=wiki_df[col].apply(lambda x: re.sub(q_reg, '', str(x)))
        wiki_df.drop(columns=[col], inplace=True)

In [None]:
columns_to_drop = [
    'caption',
    'name',
    'other',
    'download',
    'external links',
    'forms',
    'hardness data',
    'health risks',
    'idealised formula',
    'ima status notes',
    'images',
    'industrial uses',
    'name pronunciation',
    'notes',
    'optical extinction',
    'oxide wt%',
    'pronounciation',
    'reference list',
    'references',
    'sample references',
    'search engines',
    'see also',
    'setting',
    'thermal behaviour',
]

In [None]:
df = df.drop(columns=columns_to_drop)

In [142]:
wiki_df.columns

Index(['mineral_name', '2v', 'caption', 'class', 'dana', 'diaphaneity',
       'gravity', 'habit', 'mohs', 'name', 'opticalprop', 'other',
       'refractive', 'strunz', 'symmetry', 'system', 'unit cell',
       'commons category ', 'crystal system ', 'dana 8th edition ',
       'described by source ', 'encyclopædia britannica online id ',
       'freebase id ', 'google knowledge graph id ',
       'ima number, broad sense ', 'ima status and/or rank ', 'kivid.info id ',
       'microsoft academic id ', 'mindat mineral id ', 'named after ',
       'nickel-strunz '10th ed', review of ', 'nickel-strunz 9th edition ',
       'space group ', 'store norske leksikon id ', 'streak color ',
       'strunz 8th edition ', 'subclass of ', 'type locality ',
       'wolfram language entity code '],
      dtype='object')

wiki_df.info()

In [143]:
wiki_minus_mwi, mwi_minus_wiki = get_column_diff(wiki_df, mwi_df, 'mineral_name')

In [144]:
len(wiki_minus_mwi), len(mwi_minus_wiki)

(0, 5523)

wiki_df.to_csv('../data/wikipedia/Wikipedia Data.csv')