#WEB SCRAPING: INITIAL STEPS

**Import libraries.**

In [85]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

**Send request and get response.**

In [86]:
url = "https://www.fda.gov/drugs/drug-safety-and-availability/drug-recalls"
response = requests.get(url)

**Inspect the response.**

In [None]:
#get the status code
response.status_code 

In [None]:
#get the response text
response.text 

In [None]:
response.content

**Parse the response with Beautiful Soup.**

In [98]:
soup = BeautifulSoup(response.content, 'html.parser')
print(soup.title.text) #test

Drug Recalls | FDA


**To view HTML in a better way...**

In [None]:
#use prettify
print(soup.prettify()) 

#WEB SCRAPING: EXTRAPOLATING THE FULL TABLE

**Create a dataframe of the full table.**

In [None]:
target_table = soup.find_all("table", attrs={"id": "datatable"})
target_table

In [108]:
#Create the header/column names
full_table = target_table[0]
rows = full_table.find_all("tr")
head_row = rows[0] 
other_rows = rows[1:] 

#Loop through all header elements, convert them to text, and strip/remove "\n"
#and then append to create an empty list with headers defined
headings = []
for item in head_row.find_all("th"): 
    item = (item.text).rstrip("\n")
    headings.append(item)
print(headings)

['Date ', 'Brand Name(s) ', 'Product Description ', 'Recall Reason Description', 'Company Name ', 'Terminated Recall', 'Excerpt']


In [109]:
#Next, fill in the rows with the info by looping through all the entries
#and removing tags, \xa0 and \n and comma from the text
all_rows = [] 
for row_num in range(len(other_rows)): 
    row = [] 
    for row_item in other_rows[row_num].find_all("td"): 
        aa = re.sub("(\xa0)|(\n)|,","",row_item.text)
        row.append(aa)

    all_rows.append(row)

In [113]:
#Create the full table
df = pd.DataFrame(data=all_rows,columns=headings)
df.head()

Unnamed: 0,Date,Brand Name(s),Product Description,Recall Reason Description,Company Name,Terminated Recall,Excerpt
0,04/12/2022,Mylan,Insulin Glargine (Insulin glargine-yfgn) Injec...,Label may be missing on some vials ...,Mylan Pharmaceuticals Inc. a Viatris Company ...,,
1,04/01/2022,Mickey Mouse The Mandalorian,Hand Sanitizer,Presence of Methanol in Mickey Mouse and Prese...,Best Brands Consumer Products Inc. ...,,
2,04/01/2022,F&S Medical Supply dba Pink Toyz,Pink Pussycat Capsules,Undeclared Sildenafil,F&S Medical Supply dba Pink Toyz,,
3,03/30/2022,Suave,Aerosol Antiperspirants,Elevated levels of benzene,Unilever,,
4,03/29/2022,Teva Pharmaceuticals,IDArubicin Hydrochloride Injection USP ...,Potential Particulate Matter (silica and iron ...,Teva Pharmaceuticals,,


In [138]:
#You can drop empty/unnecessary columns such as Terminated Recall and Excerpt to have a cleaner view
df.drop(columns=['Terminated Recall','Excerpt'])

Unnamed: 0,Date,Brand Name(s),Product Description,Recall Reason Description,Company Name
0,04/12/2022,Mylan,Insulin Glargine (Insulin glargine-yfgn) Injec...,Label may be missing on some vials ...,Mylan Pharmaceuticals Inc. a Viatris Company ...
1,04/01/2022,Mickey Mouse The Mandalorian,Hand Sanitizer,Presence of Methanol in Mickey Mouse and Prese...,Best Brands Consumer Products Inc. ...
2,04/01/2022,F&S Medical Supply dba Pink Toyz,Pink Pussycat Capsules,Undeclared Sildenafil,F&S Medical Supply dba Pink Toyz
3,03/30/2022,Suave,Aerosol Antiperspirants,Elevated levels of benzene,Unilever
4,03/29/2022,Teva Pharmaceuticals,IDArubicin Hydrochloride Injection USP ...,Potential Particulate Matter (silica and iron ...,Teva Pharmaceuticals
5,03/24/2022,Major Pharmaceuticals,Magnesia Oral Suspension 2400 mg/30 mL Magnesi...,Microbial Contamination,Plastikon Healthcare LLC
6,03/22/2022,Sandoz,Orphenadrine Citrate 100 mg Extended Release (...,Presence of a Nitrosamine Impurity ...,Sandoz Inc.
7,03/22/2022,Adamis Pharmaceuticals Corporation,SYMJEPI (epinephrine) Injection 0.15 mg (0.15 ...,Potential clogging of the needle preventing th...,Adamis Pharmaceuticals Corporation ...
8,03/22/2022,Accuretic Greenstone Brand,Accuretic™ (quinapril HCl/hydrochlorothiazide)...,Presence of a nitrosamine N-nitroso-quinapril ...,Pfizer
9,03/10/2022,Olympia Pharmaceuticals,Compounded Injectables,Products are out of specification,Olympia Pharmacy


In [139]:
df.shape #(rows,columns)
#Notice that after dropping the two columns, it still shows we have 7 columns
#That's because it drops and NOT eliminate completely

(10, 7)

**Save the results.**

In [137]:
#save as Excel to Google Drive from Colab
from google.colab import drive
drive.mount('/gdrive')
df.to_excel("/gdrive/My Drive/WEB SCRAPING - FDA Drug Recalls Table.xlsx")

Mounted at /gdrive
