In [1]:
import pandas as pd
import re
from utils import *

In [2]:
# Load the source CSV into the working frame that the following cells reshape.
final_df = pd.read_csv(filepath_or_buffer='../data/MWIW.csv')

In [3]:
# Compiled character class used to delete emoji/symbol runs from text.
#
# The ranges overlap heavily: U+24C2-U+1F251 together with U+10000-U+10FFFF
# already cover every code point from U+24C2 upward, so the only entries that
# add anything on top of them are U+200D, U+231A, U+23CF and U+23E9.
# NOTE(review): because of those two wide ranges this also removes CJK
# ideographs, kana, Hangul and every other character at or above U+24C2 -
# confirm that is intended before reusing it on non-Latin text.
emoji_reg = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # miscellaneous symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    u"\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows (not CJK)
    u"\U00002702-\U000027B0"  # dingbats (was listed twice; once is enough)
    u"\U000024C2-\U0001F251"  # circled M .. circled ideograph: most of the BMP, incl. CJK
    u"\U0001f926-\U0001f937"  # face-palm .. shrug
    u"\U00010000-\U0010ffff"  # every supplementary-plane code point
    u"\u2640-\u2642"  # female .. male signs
    u"\u2600-\u2B55"  # miscellaneous symbols .. heavy large circle
    u"\u200d"  # zero-width joiner
    u"\u23cf"  # eject symbol
    u"\u23e9"  # fast-forward symbol
    u"\u231a"  # watch
    u"\ufe0f"  # variation selector-16
    u"\u3030"  # wavy dash
    "]+",
    flags=re.UNICODE,
)

In [4]:
def handle_wiki_ima(x):
    """Return the part of `x` before its first hyphen with 'IMA' removed.

    Everything from the first '-' onward is discarded, then every literal
    'IMA' occurrence is deleted from what is left. Surrounding whitespace is
    not trimmed. `x` must be a string.
    """
    head, _, _ = x.partition('-')
    return head.replace('IMA', '')

In [5]:
# Render each approval year as an integer-looking string ('1998', not 1998.0).
# int(x) on its own does not survive Series.apply: while the column still
# contains NaN, pandas infers float64 for the result, so the years would be
# merged into the string-valued 'ima year' column as floats further down.
# Missing values pass through unchanged.
final_df['approval year'] = final_df['approval year'].apply(
    lambda x: str(int(x)) if pd.notna(x) else x
)

In [6]:
# Keep only the text before the first hyphen of each 'ima year' entry;
# missing values are passed through as they are.
final_df['ima year'] = final_df['ima year'].apply(
    lambda year: year if pd.isna(year) else re.split('-', year, maxsplit=1)[0]
)

In [7]:
# Run handle_wiki_ima over every non-missing 'ima number, broad sense' entry.
final_df['ima number, broad sense'] = final_df['ima number, broad sense'].apply(
    lambda ima: ima if pd.isna(ima) else handle_wiki_ima(ima)
)

In [8]:
# Fill gaps in 'ima year' first from 'approval year', then from the cleaned
# 'ima number, broad sense' column; existing values always win.
final_df['ima year'] = (
    final_df['ima year']
    .combine_first(final_df['approval year'])
    .combine_first(final_df['ima number, broad sense'])
)

In [9]:
# Expose the 'nickel-strunz 10th (pending) ed.' values under the shorter
# 'strunz 10th ed.' label (the long-named column is dropped in a later cell).
final_df['strunz 10th ed.'] = final_df.loc[:, 'nickel-strunz 10th (pending) ed.']

In [10]:
def _value_after_equals(text):
    """Return the segment of `text` that follows its first '='.

    When `text` contains no '=' there is nothing to extract, so NaN is
    returned instead of letting the [1] lookup raise IndexError.
    """
    parts = re.split('=', text)
    return parts[1] if len(parts) > 1 else float('nan')


# Extract the part of each 'radioactivity' entry that follows '='.
# Missing values pass through; entries without '=' become NaN.
# NOTE(review): NaN is assumed to be the right result for entries that carry
# no '=' - confirm against the raw data.
final_df['radioactivity (in GRapi)'] = final_df['radioactivity'].apply(
    lambda x: _value_after_equals(x) if pd.notna(x) else x
)

In [11]:
# Prefer 'refractive'; wherever it is missing, fall back to 'ri values'.
final_df['refractive index values'] = (
    final_df['refractive']
    .combine_first(final_df['ri values'])
)

In [12]:
# Keep the text that precedes the first 'g' of each 'molecular weight' entry;
# missing values are passed through unchanged.
final_df['molecular weight (in gm)'] = final_df['molecular weight'].apply(
    lambda weight: weight if pd.isna(weight) else re.split('g', weight, maxsplit=1)[0]
)

In [13]:
# Keep the text that precedes the first 'b' of each 'photoelectric' entry;
# missing values are passed through unchanged.
final_df['photoelectric (in barns/electron)'] = final_df['photoelectric'].apply(
    lambda pe: pe if pd.isna(pe) else re.split('b', pe, maxsplit=1)[0]
)

In [14]:
# Delete every run of characters matched by emoji_reg from the locality text;
# missing values are left untouched.
final_df['locality'] = final_df['locality'].apply(
    lambda place: place if pd.isna(place) else emoji_reg.sub('', place)
)

In [15]:
# Overwrite the column with the text that precedes the first 'g' of each
# entry; missing values are passed through unchanged.
final_df['density calculated (in gm/cc)'] = final_df['density calculated (in gm/cc)'].apply(
    lambda density: density if pd.isna(density) else re.split('g', density, maxsplit=1)[0]
)

In [16]:
# Fill missing 'class' entries from 'class (h-m)'; existing values are kept.
final_df['class'] = (
    final_df['class']
    .combine_first(final_df['class (h-m)'])
)

In [17]:
# Keep the text that precedes the first 'g' of each 'electron density' entry;
# missing values are passed through unchanged.
final_df['electron density (in gm/cc)'] = final_df['electron density'].apply(
    lambda e_density: e_density if pd.isna(e_density) else re.split('g', e_density, maxsplit=1)[0]
)

In [18]:
# Expose the 'type' values under the 'optical data' label
# (the 'type' column itself is dropped in a later cell).
final_df['optical data'] = final_df.loc[:, 'type']

In [19]:
# Columns removed from the working frame: some were copied or folded into
# other columns by the cells above, the rest are simply not kept.
obsolete_columns = [
    'boson index',
    'gladstone-dale',
    'class (h-m)',
    'ir spectrum',
    'approval year',
    'ima number, broad sense',
    'electron density',
    'molecular weight',
    'photoelectric',
    'powder diffraction',
    'nickel-strunz 10th (pending) ed.',
    'radioactivity',
    'reflectivity',
    'refractive',
    'ri values',
    'x ray diffraction',
    'cell dimensions',
    'cell parameters',
    'type',
]
# In-place drop; raises KeyError if any listed column is absent
# (for example when this cell is run a second time).
final_df.drop(columns=obsolete_columns, inplace=True)

In [20]:
# Apply the two project helpers in order: process_df first, then strip_data
# on its result. Both come from the `utils` star import; what they do to the
# frame is not visible in this notebook.
final_df = strip_data(
    process_df(final_df)
)

In [21]:
# Write the cleaned frame to disk without the row index. `index` is documented
# as a bool, so pass False explicitly instead of relying on None being falsy.
final_df.to_csv('../data/clean_MWIW.csv', index=False)

In [22]:
# Print a summary of the cleaned frame: column names, non-null counts and dtypes.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7079 entries, 0 to 7078
Data columns (total 87 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   mineral_name                       7079 non-null   object
 1   2v calculated                      1645 non-null   object
 2   2v measured                        1892 non-null   object
 3   anisotrophism                      806 non-null    object
 4   approval history                   322 non-null    object
 5   axial ratios                       5452 non-null   object
 6   bireflectance                      256 non-null    object
 7   birefringence                      632 non-null    object
 8   class                              4570 non-null   object
 9   cleavage                           4685 non-null   object
 10  color                              5907 non-null   object
 11  common impurities                  1593 non-null   object
 12  countr

In [23]:
# Two filter_df passes with the same threshold: axis=1 first, then axis=0 on
# that result. filter_df comes from the `utils` star import.
# NOTE(review): the info() outputs show fewer columns and rows afterwards, so
# this presumably drops sparse columns then sparse rows - confirm in utils.
df = filter_df(
    filter_df(final_df, axis=1, threshold=25),
    axis=0,
    threshold=25,
)

In [24]:
# Print a summary of the filtered frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5800 entries, 0 to 5799
Data columns (total 56 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   mineral_name                       5800 non-null   object
 1   2v calculated                      1641 non-null   object
 2   2v measured                        1889 non-null   object
 3   axial ratios                       5116 non-null   object
 4   class                              4213 non-null   object
 5   cleavage                           4642 non-null   object
 6   color                              5674 non-null   object
 7   common impurities                  1582 non-null   object
 8   country                            5352 non-null   object
 9   crystal system                     5800 non-null   object
 10  dana 8th ed.                       3716 non-null   object
 11  dana class                         4571 non-null   object
 12  densit

In [25]:
# Write the filtered frame to disk without the row index. `index` is documented
# as a bool, so pass False explicitly instead of relying on None being falsy.
df.to_csv('../data/processed_MWIW.csv', index=False)