In [1]:
!pip install webdriver_manager
!pip install Image

### Extracts information about mushrooms from the website http://www.mushroom.world.
-1.Uses the BeautifulSoup library (from bs4 import BeautifulSoup) to perform HTML analysis of web pages, and the modules requests, re, json, and urljoin to make HTTP requests, handle regular expressions, process JSON data, and manage URLs, respectively. 

-2.scrape_mushroom_list(url) takes a URL as a parameter, retrieves the HTML content of the page, finds all links pointing to individual mushroom pages, and then uses the function scrape_mushroom(url) to extract specific information about each mushroom. 

-3.The function scrape_mushroom(url) retrieves information such as the name, labels, texts, and images for a specific mushroom from its individual page. 

-4.The script uses a dictionary (edibility_dict) to store information about the edibility of mushrooms, where 'c' stands for edible, and 'p' stands for poisonous. 

-5.The edibility information scraped from each mushroom's web page is removed because its value is empty, and the edibility taken from edibility_dict is added to the final output for each extracted mushroom. 

### The script displays information about all mushrooms as an indented JSON string.


In [2]:
from bs4 import BeautifulSoup 
import requests 
import re 
import json 
from urllib.parse import urljoin 

def scrape_mushroom_list(url):
    """Scrape every mushroom linked from a listing page.

    Downloads ``url``, collects each ``<a>`` whose ``href`` contains
    ``/show?n=`` and calls ``scrape_mushroom`` on the absolute form of that
    link.

    Args:
        url: Address of the page that lists the mushrooms.

    Returns:
        list: One entry per matching link, as returned by
        ``scrape_mushroom``, in the order the links were found.
    """
    # A timeout keeps an unresponsive server from blocking the cell forever.
    data = requests.get(url, timeout=30).text
    soup = BeautifulSoup(data, 'html.parser')

    # Find all links to individual mushroom pages.
    # The `?` has to be escaped: left bare it is a regex quantifier that makes
    # the preceding `w` optional, so the pattern would search for "/shon=" or
    # "/shown=" and never match the literal "/show?n=".
    mushroom_links = soup.find_all('a', href=re.compile(r'/show\?n='))
    mushroom_urls = [
        urljoin('http://www.mushroom.world', link['href'])
        for link in mushroom_links
    ]

    # Retrieve information for each mushroom
    return [scrape_mushroom(link) for link in mushroom_urls]

In [2]:
def scrape_mushroom(url):
    """Scrape one mushroom page into a dictionary.

    Args:
        url: Address of the individual mushroom page.

    Returns:
        dict: ``name1`` and ``name2`` (the caption text before and after the
        first ``(``; ``name2`` is ``''`` when there is no ``(``), ``images``
        (absolute URLs built from the ``class="image"`` elements inside
        ``id="mushroom-list"``) and ``info`` (a mapping from each
        ``class="labelus"`` value to the matching ``class="textus"`` value,
        with the ``class="longtextus"`` description as the last value).

    Raises:
        AssertionError: If the number of labels differs from the number of
            texts once the description has been appended.
    """
    # A timeout keeps an unresponsive server from blocking the cell forever.
    data = requests.get(url, timeout=30).text
    soup = BeautifulSoup(data, 'html.parser')

    # Extract the names: drop every character that is not a letter, a digit,
    # '(' or a space, then split on '(' so the parenthesised part becomes the
    # second name.
    name_content = soup.find(class_='caption').find('b').contents
    names = re.sub('[^A-Za-z0-9( ]+', '', name_content[0]).split('(')
    names = [n.strip() for n in names]
    name1 = names[0]
    name2 = names[1] if len(names) > 1 else ''

    labels = [label.contents[0] for label in soup.find_all(class_='labelus')]
    texts = [text.contents[0] for text in soup.find_all(class_='textus')]

    # The long description is cleaned of its embedded links before its text
    # is extracted.
    description = soup.find(class_='longtextus')
    if description:
        for link in description.find_all('a', href=True):
            link.extract()  # Remove unwanted links
        description = description.get_text(separator=' ', strip=True)
    else:
        description = 'Description not available'

    # The description is stored as the value of the last label, so the two
    # lists must line up one-to-one.
    texts.append(description)
    assert len(labels) == len(texts), (
        f'{len(labels)} labels but {len(texts)} texts scraped from {url}'
    )

    images = soup.find(id='mushroom-list').find_all(class_='image')
    image_urls = [
        urljoin('http://www.mushroom.world', image.a['href'])
        for image in images
    ]

    return dict(
        name1=name1,
        name2=name2,
        images=image_urls,
        info=dict(zip(labels, texts)),
    )

In [3]:
if __name__ == '__main__':
    # Link to the list of all mushrooms on mushroom.world
    all_mushrooms_url = 'http://www.mushroom.world/mushrooms/namelist'

    # Retrieve information on all mushrooms
    all_mushrooms = scrape_mushroom_list(all_mushrooms_url)

    # Edibility dictionary, keyed by the scraped 'name1':
    # 'c' = edible, 'p' = poisonous.
    # Each key appears once; repeated keys in a dict literal are silently
    # overridden by the last occurrence.
    # NOTE(review): scrape_mushroom strips every character other than
    # letters, digits, '(' and spaces from the name, so the key
    # 'Psathyrella sp.' (with a dot) presumably never matches - confirm.
    edibility_dict = {
        'Agaricus arvensis': 'c',
        'Agaricus augustus': 'c',
        'Agaricus campestris': 'c',
        'Agaricus sylvicola': 'c',
        'Agrocybe pediades': 'c',
        'Agrocybe praecox': 'c',
        'Albatrellus confluens': 'p',
        'Albatrellus ovinus': 'c',
        'Aleuria aurantia': 'p',
        'Amanita battarrae': 'p',
        'Amanita bisporigera': 'p',
        'Amanita cokeri': 'p',
        'Amanita fulva': 'p',
        'Amanita jacksonii': 'p',
        'Amanita muscaria': 'p',
        'Amanita pantherina': 'p',
        'Amanita phalloides': 'p',
        'Amanita porphyria': 'p',
        'Amanita regalis': 'p',
        'Amanita rubescens': 'c',
        'Amanita vaginata': 'p',
        'Amanita virosa': 'p',
        'Ampulloclitocybe clavipes': 'p',
        'Armillaria mellea': 'c',
        'Auriscalpium vulgare': 'p',
        'Bankera fuligineoalba': 'p',
        'Boletus edulis': 'c',
        'Boletus pinophilus': 'c',
        'Bondarzewia berkeleyi': 'p',
        'Bovista nigrescens': 'p',
        'Bovista plumbea': 'p',
        'Calocera viscosa': 'p',
        'Calocybe gambosa': 'p',
        'Calocybe persicolor': 'p',
        'Calvatia gigantea': 'p',
        'Cantharellula umbonata': 'p',
        'Cantharellus cibarius': 'c',
        'Chalciporus piperatus': 'p',
        'Chlorophyllum molybdites': 'p',
        'Chlorophyllum rhacodes': 'p',
        'Chroogomphus britannicus': 'p',
        'Clathrus ruber': 'p',
        'Conocybe apala': 'p',
        'Coprinellus disseminatus': 'p',
        'Coprinellus xanthothrix': 'p',
        'Coprinopsis atramentaria': 'p',
        'Coprinopsis variegata': 'p',
        'Coprinus comatus': 'p',
        'Coprinus plicatilis': 'p',
        'Cortinarius alboviolaceus': 'p',
        'Cortinarius armillatus': 'p',
        'Cortinarius camphoratus': 'p',
        'Cortinarius caperatus': 'p',
        'Cortinarius collinitus': 'p',
        'Cortinarius croceus': 'p',
        'Cortinarius laniger': 'p',
        'Cortinarius malicorius': 'p',
        'Cortinarius mucosus': 'p',
        'Cortinarius orellanus': 'p',
        'Cortinarius rubellus': 'p',
        'Cortinarius semisanguineus': 'p',
        'Cortinarius traganus': 'p',
        'Cortinarius violaceus': 'p',
        'Craterellus tubaeformis': 'c',
        'Cystoderma amianthinum': 'p',
        'Cystodermella cinnabarina': 'p',
        'Entoloma sericeum': 'p',
        'Entoloma vernum': 'p',
        'Galerina marginata': 'p',
        'Geastrum rufescens': 'p',
        'Gomphidius glutinosus': 'p',
        'Gymnopilus picreus': 'p',
        'Gymnopus peronatus': 'p',
        'Gyromitra esculenta': 'p',
        'Gyromitra infula': 'p',
        'Hebeloma crustuliniforme': 'p',
        'Hebeloma mesophaeum': 'p',
        'Helvella elastica': 'p',
        'Hericium americanum': 'p',
        'Hericium cirrhatum': 'p',
        'Hericium erinaceus': 'p',
        'Hortiboletus rubellus': 'p',
        'Hydnum repandum': 'p',
        'Hydnum rufescens': 'p',
        'Hygrophoropsis aurantiaca': 'p',
        'Hygrophorus camarophyllus': 'p',
        'Hygrophorus hypothejus': 'p',
        'Hygrophorus pustulatus': 'c',
        'Hypholoma capnoides': 'p',
        'Hypholoma fasciculare': 'p',
        'Hypholoma lateritium': 'p',
        'Hypholoma marginatum': 'p',
        'Imleria badia': 'p',
        'Inocybe lacera': 'p',
        'Kuehneromyces lignicola': 'p',
        'Kuehneromyces mutabilis': 'p',
        'Laccaria laccata': 'p',
        'Lacrymaria lacrymabunda': 'p',
        'Lactarius camphoratus': 'p',
        'Lactarius deliciosus': 'c',
        'Lactarius deterrimus': 'p',
        'Lactarius helvus': 'p',
        'Lactarius indigo': 'p',
        'Lactarius lignyotus': 'p',
        'Lactarius mammosus': 'p',
        'Lactarius rufus': 'p',
        'Lactarius tabidus': 'p',
        'Lactarius torminosus': 'p',
        'Lactarius trivialis': 'p',
        'Lactarius turpis': 'p',
        'Lactarius volemus': 'p',
        'Lactifluus piperatus': 'p',
        'Laetiporus sulphureus': 'p',
        'Leccinum aurantiacum': 'p',
        'Leccinum scabrum': 'p',
        'Leccinum versipelle': 'p',
        'Lepiota clypeolaria': 'p',
        'Leucocoprinus birnbaumii': 'p',
        'Leucocybe connata': 'p',
        'Lycoperdon excipuliforme': 'p',
        'Lycoperdon nigrescens': 'p',
        'Lycoperdon perlatum': 'p',
        'Lycoperdon pratense': 'p',
        'Lycoperdon pyriforme': 'p',
        'Macrolepiota procera': 'p',
        'Marasmiellus perforans': 'p',
        'Marasmius oreades': 'p',
        'Melanoleuca cognata': 'p',
        'Morchella elata': 'p',
        'Morchella esculenta': 'p',
        'Mycena epipterygia': 'p',
        'Mycena galericulata': 'p',
        'Mycena laevigata': 'p',
        'Mycena pura': 'p',
        'Omphalotus illudens': 'p',
        'Omphalotus olearius': 'p',
        'Otidea onotica': 'p',
        'Panaeolus foenisecii': 'p',
        'Paxillus involutus': 'p',
        'Peziza badia': 'p',
        'Phallus impudicus': 'p',
        'Phallus rubicundus': 'p',
        'Pholiota alnicola': 'p',
        'Pholiota aurivella': 'p',
        'Pholiota limonella': 'p',
        'Pholiota squarrosa': 'p',
        'Pleurotus citrinopileatus': 'p',
        'Pleurotus ostreatus': 'c',
        'Pleurotus pulmonarius': 'c',
        'Polyporus ciliatus': 'p',
        'Polyporus squamosus': 'p',
        'Psathyrella candolleana': 'c',
        'Psathyrella microrrhiza': 'p',
        'Psathyrella sp.': 'p',
        'Psilocybe semilanceata': 'p',
        'Rickenella swartzii': 'c',
        'Rubroboletus satanas': 'p',
        'Russula acrifolia': 'p',
        'Russula adusta': 'p',
        'Russula aeruginea': 'p',
        'Russula claroflava': 'p',
        'Russula paludosa': 'c',
        'Russula velenovskyi': 'p',
        'Russula vesca': 'c',
        'Russula vinosa': 'c',
        'Russula xerampelina': 'c',
        'Sarcodon squamosus': 'p',
        'Strobilomyces strobilaceus': 'p',
        'Strobilurus esculentus': 'c',
        'Strobilurus stephanocystis': 'p',
        'Stropharia hornemannii': 'p',
        'Suillus americanus': 'c',
        'Suillus bovinus': 'c',
        'Suillus grevillei': 'p',
        'Suillus luteus': 'c',
        'Suillus variegatus': 'p',
        'Tapinella atrotomentosa': 'c',
        'Tapinella panuoides': 'p',
        'Tricholoma aestuans': 'p',
        'Tricholoma equestre': 'c',
        'Tricholoma focale': 'p',
        'Tricholoma saponaceum': 'p',
        'Tricholoma sejunctum': 'p',
        'Tricholoma stiparophyllum': 'p',
        'Tricholomopsis decora': 'p',
        'Tricholomopsis rutilans': 'p',
        'Turbinellus floccosus': 'p',
        'Tylopilus felleus': 'p',
        'Xerocomellus chrysenteron': 'c',
        'Xerocomus subtomentosus': 'c',
        'Xeromphalina campanella': 'p',
        'Clitocybe gibba': 'c',
        'Clitocybe nuda': 'c',
        'Clitopilus prunulus': 'c',
        'Collybia dryophila': 'c',
        'Coltricia perennis': 'p',
        'Conocybe apalas': 'p',
        'Marasmius rotula': 'p',
        'Tricholomataceae': 'c',
        'Ramaria lutea': 'p',
        'Rhizina undulata': 'p',
        'Russula decolorans': 'c',
        'Russula emetica': 'p',
        'Russula mustelina': 'c',
        'Clitocybe nebularis': 'p',
    }

    for mushroom in all_mushrooms:
        # Add edibility to final output; names missing from the dictionary
        # get an explicit placeholder.
        mushroom['comestibility'] = edibility_dict.get(
            mushroom['name1'], 'Information not available'
        )
        # Delete edibility information from web page
        mushroom['info'].pop('Edibility', None)

    # Display information
    print(json.dumps(all_mushrooms, indent=2))

    # Save the result under the file name that the pandas cell further down
    # loads with pd.read_json, so the notebook runs from a fresh kernel.
    # An empty scrape is not written, so it cannot overwrite an existing file.
    if all_mushrooms:
        with open('champiURL.json', 'w', encoding='utf-8') as json_file:
            json.dump(all_mushrooms, json_file, indent=2)

 ## Uses the Python libraries pandas and Pillow (PIL) to process data from a JSON file containing information about mushrooms. 

 ### 1.Import Libraries : 

-import pandas as pd : Imports the pandas library and aliases it as pd. 

-from PIL import Image: Imports the Image module from the Pillow library for working with images. 

 ### 2.File Path and Data Loading: 

-file_path = 'champiURL.json': Specifies the file path for the JSON file containing mushroom information. 

-df = pd.read_json(file_path): Reads the JSON file into a pandas DataFrame. 

 ### 3.Data Processing: 

-df = df.explode('images', ignore_index=True): Explodes the 'images' column, which likely contains lists of image URLs, into separate rows while ignoring the index. This is often done to handle nested lists in DataFrames. 

-df = df.drop(columns=['info']): Drops the 'info' column from the DataFrame. 

 ### 4.Mapping Comestibility: 

-Dico = {'c': 1, 'p': 0}: Creates a dictionary mapping comestibility codes ('c' for edible, 'p' for poisonous) to numerical values. 

(1 : c, 0 : p). 

-df['comestibility'] = df['comestibility'].map(Dico): Maps the values in the 'comestibility' column using the defined dictionary, converting comestibility codes to numerical values. 

 ### 5.Display DataFrame: 

-df: Displays the modified DataFrame, which now includes numerical values for comestibility.

In [4]:
import pandas as pd 
from PIL import Image 

file_path = 'champiURL.json'

# Numeric encoding of the edibility codes: 'c' becomes 1 and 'p' becomes 0.
Dico = dict(c=1, p=0)

# Load the records, emit one row per entry of 'images' (with a fresh index)
# and discard the 'info' column.
df = (
    pd.read_json(file_path)
    .explode('images', ignore_index=True)
    .drop(columns=['info'])
)

# Values that are not keys of Dico become NaN.
df = df.assign(comestibility=df['comestibility'].map(Dico))
df

###This code essentially fetches images from URLs in the DataFrame, converts them to base64 format, and stores the encoded images in the encoded_images list. 

 ### 1.Imports: 

-from io import BytesIO: Imports the BytesIO class from the io module, which is used to create in-memory binary streams. 

-import base64: Imports the base64 module for encoding and decoding base64 data. 

-import requests: Imports the requests module for making HTTP requests. 

 ### 2.Empty List Initialization: 

-encoded_images = []: Initializes an empty list named encoded_images to store base64-encoded images. 

 ### 3.Loop Through DataFrame and Fetch Images: 

-code iterates through each row of the DataFrame (df) using a loop. 

-For each iteration: 

-It retrieves the image URL from the 'images' column of the DataFrame. 

-Makes a GET request to the URL using requests.get(url). 

-If successful (status code 200), it checks that the payload opens as an image with PIL (Pillow), base64-encodes the raw response bytes using base64.b64encode(), and appends the encoded string to the encoded_images list. 

 ### 4.Printing Progress: 

-if i % 50 == 0:: Prints the progress every 50 iterations, indicating the number of images processed out of the total.

In [5]:
from io import BytesIO 
import base64 
import requests 

# One entry per row of df, in row order: the base64 text of the downloaded
# image, or None when the response was not a 200.
encoded_images = []

# Iterate positionally so the loop does not depend on df having a 0..n-1
# index.
for i, url in enumerate(df['images']):
    # Make a GET request to the URL; the timeout keeps one unresponsive
    # server from hanging the whole run.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Abort on 4xx/5xx responses

    if response.status_code == 200:
        # Sanity check only: Image.open raises when Pillow cannot identify
        # the payload as an image. What gets stored is the base64 of the raw
        # response bytes, not a re-encoded image.
        Image.open(BytesIO(response.content))
        img_base64 = base64.b64encode(response.content).decode('utf-8')
        encoded_images.append(img_base64)
    else:
        # If the request was not successful, print an error message
        print(f'Error: Unable to fetch image from {url}. Status code: {response.status_code}')
        # Keep the list aligned with df: without a placeholder the later
        # df['encoded_images'] = encoded_images assignment would fail on a
        # length mismatch.
        encoded_images.append(None)

    if i % 50 == 0:
        print(f'Image {i}/{len(df)}')


 ### Creates a new column in the DataFrame df called 'encoded_images' and populates it with the contents of the list encoded_images. 

Contains base64-encoded images. 


In [6]:
# Attach the base64 strings as a new column; encoded_images must hold exactly
# one entry per row of df.
df = df.assign(encoded_images=encoded_images)
df

 ### Creation of a csv 


In [7]:
# Write the final table to disk. The row index is written as the first,
# unnamed column (index=True is the pandas default, spelled out here).
df.to_csv(path_or_buf='champipiFinal.csv', index=True)