# Data Preparation
This notebook includes the code necessary to:
*   Import and preprocess the existing tabular dataset
*   Make HTTP requests to each recall report URL to pull the raw HTML
*   Convert the raw HTML into raw text
*   Parse the text and extract meaningful phrases to create a textual dataset
*   Save all data and objects needed to reproduce the results more efficiently later
*   Combine the existing tabular data with the extracted text data into a single dataset for analysis



In [1]:
import warnings
warnings.simplefilter(action='ignore')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import drive
import requests
from bs4 import BeautifulSoup as bs
import pickle
import sys

In [2]:
#mount google drive so the csv stored there can be read
drive.mount("/content/drive")

#read the recalls csv into a dataframe
#(a local csv path can be used here in exactly the same way)
recalls_csv_path = "/content/drive/MyDrive/Colab Notebooks/recalls.csv"
recalls = pd.read_csv(recalls_csv_path)

Mounted at /content/drive


In [None]:
#recalls.columns #columns of raw imported data

In [3]:
#convert both date columns to datetime
for date_column in ("start_date", "end_date"):
  recalls[date_column] = pd.to_datetime(recalls[date_column])

In [None]:
#count per year
#start_groups = recalls["start_date"].groupby(recalls.start_date.dt.year).value_counts().sum(level=0)

#start_groups.plot.line(ylabel="count", xlabel="start date year", rot="45");
#[(d, start_groups[d]) for d in start_groups.index]
#np.mean(start_groups)

In [None]:
#CELL ONLY RUN ONCE FIRST TIME 

#make requests and save the request responses for each recall report

#seconds to wait on a single request before it raises, so one stalled
#connection cannot hang the whole multi-hour run
REQUEST_TIMEOUT_SECONDS = 30
#retries for failed connections before the request raises
MAX_REQUEST_RETRIES = 3

recall_requests = []

#do not use ipv6, lead to faster request time
requests.packages.urllib3.util.connection.HAS_IPV6 = False

#start session to make many consecutive requests; the with-block closes the
#pooled connections even if a request raises part-way through the loop
with requests.Session() as session:
  retrying_adapter = requests.adapters.HTTPAdapter(max_retries=MAX_REQUEST_RETRIES)
  session.mount("http://", retrying_adapter)
  session.mount("https://", retrying_adapter)

  #make requests to gather text data from each recall report url
  #one response is appended per url, in the same order as recalls["url"]
  for url in recalls["url"]:
    recall_requests.append(session.get(url, timeout=REQUEST_TIMEOUT_SECONDS))

#save the list of request response objects as .dat file to load in later
#so do not need to wait hours to make requests each time
with open("/content/drive/MyDrive/Colab Notebooks/recall_requests.dat", "wb") as f:
  pickle.dump(recall_requests, f)

In [None]:
#USE THIS AFTER REQUESTS ARE MADE

#load the saved request response objects
#NOTE: pickle.load can execute arbitrary code from the file it reads;
#only load a .dat file that this notebook wrote itself
loaded_requests = []
try:
  #can also use local file path
  with open("/content/drive/MyDrive/Colab Notebooks/recall_requests.dat", "rb") as f:
    loaded_requests = pickle.load(f)
except Exception as err:
  #still best-effort (loaded_requests stays an empty list), but no longer
  #swallows KeyboardInterrupt/SystemExit, and the cause is reported so a
  #missing file can be told apart from a corrupt one
  print("unable to load requests data", f"({err!r})")

In [None]:
len(loaded_requests) #check length of loaded requests is 1338, equal to number of recalls

1338

In [None]:
#CELL ONLY RUN ONCE FIRST TIME 

#(start marker, end marker, include end marker) tried in order; the first
#pattern whose two markers are both present in the title decides the phrase
PRODUCT_PATTERNS = (
  ("recalls", "due", False),
  ("recalls", "that", False),
  ("recalls", "products", True),
  ("for", "products", True),
  ("for", "due", False),
)
REASON_PATTERNS = (
  ("to", "|", False),
  ("that", "|", False),
  ("products", "|", False),
)


def _phrase_between(tokens, start_token, end_token, include_end=False):
  """Join the tokens after the first start_token up to the first end_token.

  The end token itself is included only when include_end is True.
  Raises ValueError (from list.index) when either marker is missing.
  The result is "" when the end marker comes before the start marker.
  """
  start = tokens.index(start_token) + 1
  stop = tokens.index(end_token) + (1 if include_end else 0)
  return " ".join(tokens[start:stop])


def _first_matching_phrase(tokens, patterns):
  """Return the phrase for the first pattern with both markers present, else ""."""
  for start_token, end_token, include_end in patterns:
    try:
      return _phrase_between(tokens, start_token, end_token, include_end)
    except ValueError:
      #a marker is missing from this title, fall through to the next pattern
      continue
  return ""


def parse_recall_title(title_text):
  """Split a recall page title into (location, product, reason) phrases.

  The title is lower-cased and split on whitespace. Any part that cannot be
  found is returned as "". Titles that begin with "recall notification
  report" carry no useful information and give ("", "", "").
  """
  title_tokens = title_text.lower().split()

  #some titles have no useful information
  if title_tokens[:3] == ["recall", "notification", "report"]:
    return "", "", ""

  #get location or state data about firm if available (everything before "firm")
  try:
    location = " ".join(title_tokens[:title_tokens.index("firm")])
  except ValueError:
    location = ""

  #get information about the specific product recalled if available
  product = _first_matching_phrase(title_tokens, PRODUCT_PATTERNS)
  #get specific recall reason data if available
  reason = _first_matching_phrase(title_tokens, REASON_PATTERNS)
  return location, product, reason


#parse the title of each recall to extract text data
locations, products, reasons = [], [], []

for req in loaded_requests:
  soup = bs(req.text, "html.parser")
  #a page without a <title> tag gives soup.title == None; treat it as an
  #empty title so the three lists stay aligned with loaded_requests
  title_text = soup.title.text if soup.title is not None else ""

  location, product, reason = parse_recall_title(title_text)
  locations.append(location)
  products.append(product)
  reasons.append(reason)


#persist the parsed text lists; parsing and text gathering only needs to run once
#each list is pickled to a .dat file so it can be loaded and used later
pickle_targets = (
  ("/content/drive/MyDrive/Colab Notebooks/locations.dat", locations),
  ("/content/drive/MyDrive/Colab Notebooks/products.dat", products),
  ("/content/drive/MyDrive/Colab Notebooks/reasons.dat", reasons),
)
for dat_path, parsed_values in pickle_targets:
  with open(dat_path, "wb") as f:
    pickle.dump(parsed_values, f)

#each list is also written as a readable .csv: every item followed by a comma
#NOTE: items are written unquoted, so an item that itself contains a comma
#cannot be told apart from a separator when the file is read back
csv_targets = (
  ("/content/drive/MyDrive/Colab Notebooks/locations.csv", locations),
  ("/content/drive/MyDrive/Colab Notebooks/products.csv", products),
  ("/content/drive/MyDrive/Colab Notebooks/reasons.csv", reasons),
)
for csv_path, parsed_values in csv_targets:
  with open(csv_path, "w") as f:
    f.write("".join(item + "," for item in parsed_values))

In [None]:
#CELL ONLY RUN ONCE FIRST TIME 

#save raw text to txt file: the text of every response, each followed by a newline
#encoding is pinned to utf-8 so non-ascii characters in the scraped pages are
#written the same way regardless of the platform default encoding
with open("/content/drive/MyDrive/Colab Notebooks/raw_text.txt", "w", encoding="utf-8") as f:
  for req in loaded_requests:
    f.write(req.text + "\n")

In [4]:
def _load_pickled_list(path, label):
  """Load a pickled object from path; on any failure report it and return [].

  label is only used in the printed message. KeyboardInterrupt and SystemExit
  are not caught, so an interrupted cell still stops.
  NOTE: pickle.load can execute arbitrary code from the file it reads; only
  load .dat files that this notebook wrote itself.
  """
  try:
    with open(path, "rb") as f:
      return pickle.load(f)
  except Exception as err:
    print(f"unable to load {label} data", f"({err!r})")
    return []


#load parsed text data from saved files
locations = _load_pickled_list("/content/drive/MyDrive/Colab Notebooks/locations.dat", "locations")
products = _load_pickled_list("/content/drive/MyDrive/Colab Notebooks/products.dat", "products")
reasons = _load_pickled_list("/content/drive/MyDrive/Colab Notebooks/reasons.dat", "reasons")

#make sure each one is correct length loaded
print(len(locations), len(products), len(reasons))

1338 1338 1338


In [5]:
#augment generated text data with existing data

#the text columns are joined to recalls on the row index, so a failed or partial
#load of the text lists must stop here instead of silently writing a dataset
#with missing or NaN text columns
for label, values in (("products", products), ("reasons", reasons)):
  if len(values) != len(recalls):
    raise ValueError(
      f"{label} has {len(values)} entries but recalls has {len(recalls)} rows"
    )

products_df = pd.DataFrame(products)
#locations_df is built here but is not part of the concat below,
#which takes the "states" column from recalls instead
locations_df = pd.DataFrame(locations)
reasons_df = pd.DataFrame(reasons)

#keep the selected recalls columns and append the product and reason text columns
augmented_df = pd.concat([recalls[["start_date", "end_date", "risk_level", "quantity_recovered", "url", "states"]], products_df, reasons_df], axis=1)

#save as csv to use in analysis
augmented_df.to_csv("/content/drive/MyDrive/Colab Notebooks/augmented_dataset.csv")