# DeepShrooms

Our goal is to classify pictures of (common) mushrooms using some kind of web-app.

The challenge is to get good-quality training data and then mitigate some common problems with images, such as lighting, angle, blurriness and background noise.

The current plan is to classify only the poisonous mushrooms of Finland along with some common edible and inedible ones, probably using a convolutional neural network.


Preliminary Model:
- name-fin - Finnish name
- name-eng - English name
- name-latin - Latin name
- url-mw - Mushroom-world url
- url-wiki? - Wikipedia url
- url-lajit? - Lajit.fi url
- type - edible/poisonous/un-edible(or neutral?)

## Sources of data

Mushroom World
http://www.mushroom.world

Lajit
http://tun.fi/HBF.25786?locale=fi


In [39]:
import pandas as pd
import numpy as np

# Load the mushroom label table. Later cells read its 'url-mw' column and
# iterate over its rows.
test_labels = pd.read_csv('test_labels.csv')

# BIG WARNING

Here we scrape the data from the Mushroom World website. The basic data is small, but fetching the pictures will probably be in the range of hundreds of MBs. For that reason this part of the code will either be commented out or moved elsewhere, and the images will be saved in Google Drive and downloaded from there.

In [27]:
from bs4 import BeautifulSoup
import requests
import sys
import re
import json

def scrape_mushroom(url, timeout=30):
    """Scrape a single mushroom page on mushroom.world into a dictionary.

    Args:
        url: URL of a page on mushroom.world describing one mushroom, e.g.
            'http://www.mushroom.world/show?n=Galerina-marginata'.
        timeout: Seconds to wait for the server before giving up; passed to
            ``requests.get``.

    Returns:
        A dictionary with the keys:

        - ``name1`` (str): name of the mushroom.
        - ``name2`` (str): name given in parentheses; ``''`` if the page
          gives no such name.
        - ``images`` (list of str): absolute image URLs.
        - ``info`` (dict): maps every label found on the page to its text.
          Expected labels are Family, Location, Dimensions, Edibility and
          Description; the last label is paired with a dictionary of
          description sections (expected keys: General, Cap, Gills, Stem).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the number of labels on the page does not match the
            number of text fields (the page layout is not the expected one).

    Example:
        mushroom = scrape_mushroom(
            'http://www.mushroom.world/show?n=Galerina-marginata')
        print(mushroom)
    """
    # Fail fast on HTTP errors instead of trying to parse an error page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # The caption has the form "Name (Other name)". Everything except
    # alphanumerics, spaces and "(" is dropped, so splitting on "(" yields
    # the main name and, when present, the parenthesised one.
    name_content = soup.find(class_="caption").find("b").contents
    names = re.sub('[^A-Za-z0-9( ]+', '', name_content[0]).split("(")
    names = [n.strip() for n in names]
    name1 = names[0]
    name2 = names[1] if len(names) > 1 else ''

    labels = [label.contents[0] for label in soup.find_all(class_="labelus")]
    texts = [text.contents[0] for text in soup.find_all(class_="textus")]

    # Build the description dictionary. The character filter removes "/", so
    # a closing "</b>" turns into "<b>" and "<br/>" into "<br>"; that lets a
    # single "<b>" substitution strip both tags of a bold section heading.
    description = soup.find(class_="longtextus").contents
    description = [
        re.sub('[^A-Za-z0-9,.<> ]+', '', str(d)).strip() for d in description
    ]
    description = [
        re.sub('<b>', '', d) for d in description if d != "" and d != "<br>"
    ]
    # The first paragraph has no heading of its own; file it under 'General'.
    description.insert(0, 'General')
    # Entries now alternate heading, text, heading, text, ...
    description = dict(zip(description[0::2], description[1::2]))

    # The description belongs to the last label on the page.
    texts.append(description)
    if len(labels) != len(texts):
        raise ValueError(
            "unexpected page layout at {}: {} labels but {} text fields".format(
                url, len(labels), len(texts)))

    # Image links are site-relative; prefix them with the host.
    images = soup.find(id="mushroom-list").find_all(class_="image")
    image_urls = ['http://www.mushroom.world' + image.a["href"] for image in images]

    return dict(
        name1=name1,
        name2=name2,
        images=image_urls,
        info=dict(zip(labels, texts)),
    )

def scrape_mushrooms(df):
    """Scrape every mushroom page listed in the 'url-mw' column of ``df``.

    Returns a list holding one ``scrape_mushroom`` result per URL, in the
    order the URLs appear in the column.
    """
    scraped = []
    for page_url in df['url-mw']:
        scraped.append(scrape_mushroom(page_url))
    return scraped

# Scrape the page of every mushroom in test_labels; this issues one HTTP
# request per row.
mw_scraped = scrape_mushrooms(test_labels)

In [111]:
# Directory that the downloaded mushroom images are written into.
IMG_PATH = 'mushroom_img'

import time

def download_mushroom_imgs(df, dict_scraped, path, delay=4, timeout=30):
    """Download the scraped images of every mushroom in ``df`` into ``path``.

    Files are named ``<name><n><ext>``, where ``<name>`` is the row's second
    column lower-cased with spaces replaced by underscores (e.g.
    'Cantharellus cibarius' -> 'cantharellus_cibarius'), ``<n>`` is the
    position of the image in the scraped list and ``<ext>`` is the
    lower-cased file extension taken from the image URL.

    Args:
        df: DataFrame of mushrooms whose second column holds the name used
            for the image files (name-latin in test_labels).
        dict_scraped: Scraped mushroom dictionaries, each with an 'images'
            list of URLs. It is indexed with the index label of each row of
            ``df``, so it must line up with that index.
        path: Directory to write the images into; created if missing.
        delay: Seconds to sleep after each download, to go easy on the
            server.
        timeout: Seconds to wait for the server per request.

    Returns:
        A dictionary mapping each row's index label to the list of file
        paths written for that row.

    Raises:
        requests.HTTPError: if an image request returns an error status.
    """
    # Imported here because the cells of this notebook never import them.
    import os
    from urllib.parse import urlparse

    if path:
        os.makedirs(path, exist_ok=True)

    imgs = {}
    for row in df.itertuples():
        # row[0] is the index label, so row[2] is the second column.
        img_name = row[2].lower().replace(' ', '_')
        img_paths = []
        for index, img_url in enumerate(dict_scraped[row.Index]['images']):
            response = requests.get(img_url, timeout=timeout)
            # Do not save an HTML error page under an image file name.
            response.raise_for_status()
            # Take the extension from the URL path so that '.jpeg' or a
            # trailing query string does not produce a mangled suffix.
            img_ext = os.path.splitext(urlparse(img_url).path)[1].lower()
            full_path = os.path.join(
                path, "{}{}{}".format(img_name, index, img_ext))
            with open(full_path, 'wb') as handler:
                handler.write(response.content)
            img_paths.append(full_path)
            time.sleep(delay)
        imgs[row.Index] = img_paths
    return imgs

# Download images only for the rows selected by the slice 1:2 of test_labels
# (a single row), writing them into IMG_PATH.
mw_imgs = download_mushroom_imgs(test_labels[1:2], mw_scraped, IMG_PATH)

In [110]:
#[row for row in test_labels.itertuples()]
# Show the one-row slice of test_labels that is passed to download_mushroom_imgs.
print(test_labels[1:2])
# Set of identifiers built from the first column of every row (row[0] is the
# index, row[1] the first column): lower-cased, spaces replaced by underscores.
dum = {row[1].lower().replace(' ', '_') for row in test_labels.itertuples()}
print(dum)
#'http://www.mushroom.world/data/fungi/Cantharelluscibarius1.JPG'[-3:].lower()



         name-fin                name-latin  \
1  Suppilovahvero  Cantharellus tubaeformis   

                                              url-mw    type  
1  http://www.mushroom.world/show?n=Cantharellus-...  edible  
{'suppilovahvero', 'kangaskärpässieni', 'korvasieni', 'kavalakärpässieni', 'kitkerälahokka', 'keltavahvero', 'lampaankääpä', 'ruskokärpässieni', 'isohapero', 'peltoherkkusieni', 'suippumyrkkyseitikki', 'herkkutatti', 'männynleppärousku', 'punakärpässieni', 'valkokärpässieni', 'kuusenleppärousku', 'myrkkynääpikkä', 'haaparousku'}
