In [1]:
import json
import re
import string

import requests
from bs4 import BeautifulSoup

In [2]:
"""
Global variables to store mineral data.
"""
# --- Configuration ---------------------------------------------------------
# Suffix inserted into the names of the files written by dump_data().
FILE_NUM = 55
# Matches element ids such as "collapse12"; used to select the page sections.
collapse_finder = re.compile(r"collapse[0-9]+")

# --- Mutable scraper state -------------------------------------------------
# Attributes of the page currently being parsed (reset for every page).
min_dict = {}
# Final result: mineral name -> {attribute: value or list of values}.
all_mineral_dict = {}
# Page ids for which extraction failed.
skipped = []

In [3]:
def add_key_to_dict(key, value):
    """
    Record one scraped attribute in the global ``min_dict``.

    Scraped labels end with a colon, so the label is cut at its last colon;
    a label that contains no colon at all is ignored. The first value seen
    for a label is stored as-is. Further values for the same label are
    gathered into a list together with the value(s) stored before.

    :param key: The scraped label, expected to contain a colon
    :param value: The value associated with the label
    :return: None
    """
    label, colon, _ = key.rpartition(':')
    if not colon:
        return

    if label in min_dict:
        previous = min_dict[label]
        if isinstance(previous, list):
            previous.append(value)
        else:
            min_dict[label] = [previous, value]
    else:
        min_dict[label] = value

In [32]:
def get_intro_data(div):
    """
    Collect the attributes listed in a mineral page's introduction section.

    Looks inside ``div`` for the element whose id is ``introdata`` and, for
    each of its direct child divs, hands the text of the child's ``span``
    (the label) and of its ``div`` (the value) to ``add_key_to_dict``.
    Errors are printed and otherwise ignored, so one malformed row does not
    prevent the remaining rows from being read.

    :param div: The element in which the ``introdata`` div is searched for.
    :return: None
    """
    try:
        intro_rows = div.find("div", {"id": "introdata"}).find_all("div", recursive=False)
        for row in intro_rows:
            try:
                add_key_to_dict(row.span.text, row.div.text)
            except Exception as row_error:
                print(row_error)
    except Exception as lookup_error:
        print(lookup_error)

In [33]:
def get_other_attributes(divs):
    """
    Collect the attributes stored in ``mindatarow`` rows of the given divs.

    A row is used only when it has exactly two direct child divs: the first
    supplies the label and the second the value. When the second div has
    direct ``li`` children their texts are joined with newlines; otherwise the
    div's own text is the value. Each pair is passed to ``add_key_to_dict``.
    Errors are printed and otherwise ignored.

    :param divs: A list of divs to search for attributes
    :return:  None
    """
    for section in divs:
        try:
            for row in section.find_all('div', {'class': 'mindatarow'}):
                cells = row.find_all('div', recursive=False)
                if len(cells) == 2:
                    try:
                        label = cells[0].text
                        value_cell = cells[1]
                        items = value_cell.find_all('li', recursive=False)
                        value = '\n'.join(li.text for li in items) if items else value_cell.text
                        add_key_to_dict(label, value)
                    except Exception as row_error:
                        print(row_error)
        except Exception as section_error:
            print(section_error)


In [34]:
def dump_data():
    """
    Write the collected results to disk.

    Serialises ``all_mineral_dict`` as JSON into one file and writes the
    entries of ``skipped``, one per line, into a second file. Both file names
    carry the ``FILE_NUM`` suffix and both files are overwritten on each call.

    :return: None
    """
    collected_path = f'../data/mindat/collected_data-{FILE_NUM}.json'
    skipped_path = f'../data/mindat/skipped_data-{FILE_NUM}.txt'

    with open(collected_path, 'w') as collected_file:
        json.dump(all_mineral_dict, collected_file)

    with open(skipped_path, 'w') as skipped_file:
        skipped_file.write('\n'.join(map(str, skipped)))

In [35]:
def start_parsing(index, html_text):
    """
    Parse one mineral page and store its attributes in ``all_mineral_dict``.

    Walks down the html tree to the main container, reads the mineral name,
    then gathers attributes from the "collapse" sections through
    ``get_intro_data`` and ``get_other_attributes``. Attributes that were
    seen several times are de-duplicated (keeping first-seen order) and
    reduced to a single value when only one distinct value remains. If any
    step fails, the error is printed and ``index`` is appended to
    ``skipped``. In every case the data collected so far is written out with
    ``dump_data``.

    :param index: Mindat mineral ID to process; recorded in ``skipped`` when
        extraction fails
    :param html_text: Raw html of the mineral page
    :return: None
    """
    global min_dict
    min_dict = {}  # start from a clean slate for this page

    soup = BeautifulSoup(html_text, "html.parser")
    try:
        container = (
            soup.body
            .find("div", {"id": "mainwrap"})
            .find("div", {"class": "centerer"})
            .find("div", {"class": "mindatadiv"})
            .find("div", {"class": "fpbox990nl"})
        )
        mineral_name = container.find("h1", {"class": "mineralheading"}).text
        print(mineral_name)
        divs = container.find_all("div", {"id": collapse_finder})
        get_intro_data(divs[0])
        get_other_attributes(divs)

        attributes = {}
        for k, val in min_dict.items():
            if isinstance(val, list):
                # dict.fromkeys drops duplicates while keeping first-seen
                # order, so the saved lists are the same on every run.
                val = list(dict.fromkeys(val))
                if len(val) == 1:
                    val = val[0]
            attributes[k] = val
        all_mineral_dict[mineral_name] = attributes

    except Exception as e:
        print(e)
        print("Extraction failed")
        # Record the id this call was given rather than whatever a global
        # loop variable happens to hold.
        skipped.append(index)
    dump_data()

In [6]:
mins = []


def scrape(html_text):
    """
    Collect the mineral IDs linked from one page of the mindat A-Z index.

    Every ``<a>`` tag carrying an ``href`` inside the search-results
    container is read, and the href with its first four and last five
    characters removed is appended to the global ``mins`` list. If the
    expected page structure is missing, the error is printed and ``mins`` is
    left unchanged.

    :param html_text: html text for index
    :return: None
    """
    soup = BeautifulSoup(html_text, "html.parser")
    try:
        container = (
            soup.body
            .find("div", {"id": "mainwrap"})
            .find("div", {"class": "centerer"})
            .find("div", {"class": "fpbox720p"})
            .find("div", {"class": "newminsearchresults"})
        )
        links = container.find_all("a", href=True)
        # NOTE(review): the slice assumes hrefs of the form "min-<id>.html";
        # any other link inside the container would yield a bogus id.
        found_ids = [a['href'][4:-5] for a in links]
        mins.extend(found_ids)

    except Exception as e:
        # Nothing is added to `skipped` here: that list holds numeric mineral
        # page ids, and an index page has no such id to record.
        print(e)
        print("Extraction failed")

In [7]:
"""
Loop to get all mineral IDs
"""
# Fetch each A-Z index page and collect the mineral ids it links to.
for c in string.ascii_uppercase:
    base_url = f"https://www.mindat.org/index-{c}.html"
    # requests waits indefinitely when no timeout is given; the timeout keeps
    # a stalled connection from hanging the notebook.
    html_text = requests.get(base_url, timeout=30).text
    scrape(html_text)
    print(f"Page number {c} parsed.\n")

Page number A parsed.

Page number B parsed.

Page number C parsed.

Page number D parsed.

Page number E parsed.

Page number F parsed.

Page number G parsed.

Page number H parsed.

Page number I parsed.

Page number J parsed.

Page number K parsed.

Page number L parsed.

Page number M parsed.

Page number N parsed.

Page number O parsed.

Page number P parsed.

Page number Q parsed.

Page number R parsed.

Page number S parsed.

Page number T parsed.

Page number U parsed.

Page number V parsed.

Page number W parsed.

Page number X parsed.

Page number Y parsed.

Page number Z parsed.



In [8]:
# Shallow copy of the collected ids; the cell displays how many there are.
ms = list(mins)
len(ms)

5747

In [9]:
# Persist the collected ids, one per line (no trailing newline).
all_mins_text = '\n'.join(ms)
with open('../data/mindat/all-mins.txt', 'w') as f:
    f.write(all_mins_text)

In [None]:
"""
Loop to query data of all minerals.
"""
base_url = "https://www.mindat.org"

# Read the page ids to query from a file named like the skipped-pages files
# that dump_data() writes (suffix 50 here).
with open('../data/mindat/skipped_data-50.txt') as f:
    # Blank lines are ignored so that a stray empty line cannot break int().
    nums = sorted({int(line) for line in f if line.strip()})

for num in nums:
    url = f"{base_url}/min-{num}.html"
    # requests waits indefinitely when no timeout is given; the timeout keeps
    # a stalled connection from hanging the notebook.
    html_text = requests.get(url, timeout=30).text
    start_parsing(num, html_text)
    print(f"Page number {num} parsed.\n")


In [10]:
# base_url = "https://www.mindat.org"

# for num in range(7000, 10000):
#     url = f"{base_url}/min-{num}.html"
#     html_text = requests.get(url).text
#     start_parsing(num, html_text)
#     print(f"Page number {num} parsed.\n")
