In [4]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import re
import lxml
from urllib.request import urlopen
from IPython.display import display
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Steps to web-scrape a food recalls website for its recall data.

-Inspect all of the interested web elements of the webpage and note their HTML tags.
-Initiate an empty list for each web element.
-While loop over web pages in site based on number rule
    -Establish beautifulsoup object
    -Iterate over the nested webparts that are contained within the larger ones and append to initiated lists
-Turn initiated lists into Pandas Series objects to enable Pandas value counts
-Test out the value counts
-Create data dictionary of "Column_Names" : series_objects established in previous step
-Turn dictionary into pandas dataframe
-Query data frame to ensure table is adequate


In [205]:
status_ = []
title_ = []
reason_ = []
date_ = []
summary_ = []
impacted_products_ = []
link_ = []
page = 0
while page!= 25:
    url = f"https://www.fsis.usda.gov/recalls?page={page}"
    response = requests.get(url)
    html = response.content
    soup = bs(html, "lxml")
    view_content = soup.find('div', class_='view__content')
    view_rows = view_content.find_all('div', class_='view__row')
    for row in view_rows:
        status = row.find('div', class_="recall-teaser__status").text.replace('\n',' ')
        if "Active" in status:
            view_status = row.find('span', class_="tag tag--active").text
            status_.append(status)
            teaser_title = row.find('h3', class_='recall-teaser__title').text
            title_.append(teaser_title)
            reason = row.find('a', class_="tag tag--reason").text
            reason_.append(reason)
            date = row.find('div', class_="recall-teaser__date").text.replace('\n','')
            date_.append(date)
            summary = row.find('div', class_="recall-teaser__summary").text.replace('\n','')
            summary_.append(summary_)
            impacted_products = row.find('div', class_="recall-teaser__products").text
            impacted_products_.append(impacted_products)
            link = row.find("a", href=re.compile("^(/recalls-alerts/)((?!:).)*$"))
            link_.append(link)

        # print(f'''
        # Status:{status}
        # Date: {date}
        # Reason: {reason}
        # Title: {teaser_title}
        # Summary: {summary}
        # {ip}
        # Link: https://www.fsis.usda.gov{link.attrs['href']}
        # ''')

    page = page + 1

links=[]
for link in link_:
    item = f"https://www.fsis.usda.gov{link.attrs['href']}"
    links.append(item)



# sub_strings = ['Impacted Products', '\\n', '\\t']

# impacted_products_final = []

# for item in impacted_products_:
#     item = re.sub('|'.join(sub_strings), '', item)
#     impacted_products_final.append(item)




In [213]:
status_series = pd.Series(status_)
status_series.value_counts()
title_series = pd.Series(title_)
reason_series = pd.Series(reason_)
reason_series.value_counts()
date_series = pd.Series(date_)
date_series.value_counts()
impacted_products_series = pd.Series(impacted_products_final)
link_series = pd.Series(links)

data_dictionary = {"Status" : status_series, "Title" : title_series, "Reason" : reason_series, "Date" : date_series, "Impacted_Products" : impacted_products_series, "Link" : link_}
df = pd.DataFrame(data=data_dictionary)

ds = df.loc[(df['Reason'] == 'Product Contamination') | (df['Reason'] == 'Unfit for Human Consumption') | (df['Reason'] == 'Unfit for Human Consumption')]

def remove_substrings(str_list, substr_list):
    new_list = []
    for item in str_list:
        item = re.sub('|'.join(substr_list), '', item)
        new_list.append(item)
    return new_list

sub_strings = ['Impacted Products', '\\n', '\\t']

df["Impacted_Products"] = remove_substrings(df["Impacted_Products"],sub_strings) 

ds[["Reason", "Title", 'Impacted_Products','Link']]

 Active     93
Name: count, dtype: int64

Product Contamination                     33
Misbranding                               28
Produced Without Benefit of Inspection    11
Import Violation                           8
Unfit for Human Consumption                3
Processing Defect                          3
Unreported Allergens                       3
Insanitary Conditions                      2
Mislabeling                                2
Name: count, dtype: int64

        Sat, 12/18/2021 - Current          2
        Fri, 02/03/2023 - Current          2
        Fri, 06/30/2023 - Current          2
        Fri, 10/29/2021 - Current          1
        Fri, 03/12/2021 - Current          1
                                          ..
        Tue, 01/31/2023 - Current          1
        Wed, 02/01/2023 - Current          1
        Wed, 02/08/2023 - Current          1
        Tue, 02/14/2023 - Current          1
        Fri, 11/22/2019 - Current          1
Name: count, Length: 90, dtype: int64

Unnamed: 0,Reason,Title,Impacted_Products,Link
2,Product Contamination,FSIS Issues Public Health Alert for Fresh Sala...,•8.65-oz. plastic film packages containing “Ch...,[FSIS Issues Public Health Alert for Fresh Sal...
4,Product Contamination,FSIS Issues Public Health Alert for Raw Beef P...,•Approximately 1.5-lb. plastic tray packages c...,[FSIS Issues Public Health Alert for Raw Beef ...
11,Unfit for Human Consumption,House of Raeford Recalls Foster Farms Brand Mi...,•1.83-lb. resealable bag containing “FOSTER FA...,[House of Raeford Recalls Foster Farms Brand M...
13,Product Contamination,"Conagra Brands, Inc., Recalls Frozen Beef Shep...",4.31 lbs. cases of “MC Beef Shepherd’s Pie” wi...,"[Conagra Brands, Inc., Recalls Frozen Beef She..."
14,Product Contamination,"Johnsonville, LLC, Recalls Beddar With Cheddar...",•14-oz. vacuum-packed packages of “Johnsonvill...,"[Johnsonville, LLC, Recalls Beddar With Chedda..."
21,Product Contamination,FSIS Issues Public Health Alert for Fresh Sala...,•5.5-oz. clear plastic packages containing “Fr...,[FSIS Issues Public Health Alert for Fresh Sal...
22,Product Contamination,"Hoyo, SBC Recalls Frozen, Ready-To-Eat Beef Sa...",Bulk boxes containing 75 pieces of “HOYO Beef ...,"[Hoyo, SBC Recalls Frozen, Ready-To-Eat Beef S..."
34,Product Contamination,FSIS Issues Public Health Alert for Ground Bee...,•Varying weights of ground beef packed in butc...,[FSIS Issues Public Health Alert for Ground Be...
35,Product Contamination,FSIS Issues Public Health Alert for Fully Cook...,•1-lb. chubs containing “JET HIGH PRAIRIE MEAT...,[FSIS Issues Public Health Alert for Fully Coo...
36,Product Contamination,FSIS Issues Public Health Alert for Ground Be...,•1-lb. plastic vacuum-packed packages containi...,[ FSIS Issues Public Health Alert for Ground B...


In [215]:
from collections import Counter
import collections
import itertools
def word_count_dict_from_text_col(column_or_strlist):
    words = []
    for item in column_or_strlist:
        item = item.lower().split()
        words.append(item)           
    words = list(itertools.chain.from_iterable(words))
    counts = collections.Counter(x for x in words if x)
    return counts
w = word_count_dict_from_text_col(df["Reason"])
w
word_count_df = pd.DataFrame(data=w, index=[0])

Counter({'product': 33,
         'contamination': 33,
         'misbranding': 28,
         'produced': 11,
         'without': 11,
         'benefit': 11,
         'of': 11,
         'inspection': 11,
         'import': 8,
         'violation': 8,
         'unfit': 3,
         'for': 3,
         'human': 3,
         'consumption': 3,
         'processing': 3,
         'defect': 3,
         'unreported': 3,
         'allergens': 3,
         'insanitary': 2,
         'conditions': 2,
         'mislabeling': 2})