### Run the Database Connection File

In [None]:
# Execute the companion notebook first so that the names it defines are available
# in this kernel. Presumably this is where connect_to_mongodb and the mongo*
# connection settings used further down come from — verify against that notebook.
%run ./db_connections.ipynb

### Import the Necessary Packages and Libraries

In [1]:
#!pip install pymongo
#!pip install psycopg2
#!pip install wordcloud
import pymongo
import requests
import pandas as pd
import numpy as np
import psycopg2
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sqlalchemy import create_engine

### Load the Data from the API to the MongoDB Collection

In [3]:
def check_and_load_fda_data(api_url, collection):
    """Reload ``collection`` from the FDA API, replacing any existing documents.

    Parameters
    ----------
    api_url : str
        Base URL of the FDA endpoint; passed unchanged to
        ``fetch_and_store_fda_data``.
    collection : pymongo-style collection
        Must provide ``count_documents`` and ``drop``.

    Notes
    -----
    The loader prints request and storage errors instead of raising them, so
    success is judged by whether the collection holds any documents afterwards
    rather than being reported unconditionally.
    """
    if collection.count_documents({}) > 0:
        # Start from a clean slate so a re-run does not insert duplicates.
        collection.drop()
    else:
        print("Collection is empty. Loading data from FDA API...")

    fetch_and_store_fda_data(api_url, collection)

    if collection.count_documents({}) > 0:
        print("Data loaded successfully from FDA API.")
    else:
        print("No data was loaded from FDA API.")

def fetch_fda_data(api_url, timeout=30):
    """Fetch one page of results from the FDA API.

    Parameters
    ----------
    api_url : str
        Fully formed request URL (including any query string).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    tuple
        ``(total, results)`` taken from ``meta.results.total`` and ``results``
        of the JSON payload, or ``(None, None)`` if the request failed or the
        payload did not have that shape.
    """
    try:
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()  # Raise an exception if response has error status code
        json_data = response.json()
        return json_data['meta']['results']['total'], json_data['results']
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching FDA data: {e}")
    except (ValueError, KeyError, TypeError) as e:
        # Body was not JSON, or lacked the expected meta/results keys
        # (for example an error document returned with a 200 status).
        print(f"Unexpected response format from FDA API: {e!r}")
    return None, None

def store_fda_data(collection, fda_data):
    """Insert a batch of FDA records into ``collection``.

    Parameters
    ----------
    collection : pymongo-style collection
        Must provide ``insert_many``.
    fda_data : list of dict
        Records to insert. An empty or ``None`` batch is a no-op.

    MongoDB errors are printed rather than raised, so a failed batch does not
    abort the caller.
    """
    if not fda_data:
        # insert_many rejects an empty batch with a TypeError, which the
        # PyMongoError handler below would not catch.
        return
    try:
        collection.insert_many(fda_data)
    except pymongo.errors.PyMongoError as e:
        print(f"Error occurred while storing FDA data in MongoDB: {e}")

def fetch_and_store_fda_data(api_url, collection, page_size=1000):
    """Page through the FDA API and store every page in ``collection``.

    Parameters
    ----------
    api_url : str
        Base endpoint URL without a query string; ``limit`` and ``skip`` are
        appended here.
    collection : pymongo-style collection
        Destination handed to ``store_fda_data``.
    page_size : int, optional
        Number of records requested per call (the ``limit`` parameter) and the
        amount ``skip`` advances between calls.

    Raises
    ------
    ValueError
        If ``page_size`` is not positive (the loop could never advance).

    Paging stops when a fetch fails, a page comes back empty, or ``skip``
    reaches the total reported by the API.

    NOTE(review): the API may cap how large ``skip`` can get; if the dataset
    grows past that cap the tail would be silently missed — confirm against
    the openFDA paging documentation.
    """
    if page_size <= 0:
        raise ValueError("page_size must be a positive integer")

    skip = 0
    while True:
        page_url = f'{api_url}?limit={page_size}&skip={skip}'
        total, results = fetch_fda_data(page_url)
        if total is None or results is None:
            # fetch_fda_data already reported the failure.
            break
        if not results:
            print("No more data available from FDA API")
            break
        # Each page is persisted immediately; nothing is accumulated in memory.
        store_fda_data(collection, results)
        skip += page_size
        if skip >= total:
            break

# Endpoint that is paged through to populate the MongoDB collection.
FDA_ENFORCEMENT_URL = 'https://api.fda.gov/food/enforcement.json'

# connect_to_mongodb and the mongo* settings are not defined in this notebook;
# presumably they come from db_connections.ipynb, which is %run at the top.
client, db, collection = connect_to_mongodb(
    mongoConnectionstring,
    mongoDatabasename,
    mongoCollectionname,
)
check_and_load_fda_data(FDA_ENFORCEMENT_URL, collection)

The connection has been established successfully
Collection is empty. Loading data from FDA API...
Data loaded successfully from FDA API.


### Load the Data from the MongoDB Collection to a Pandas Dataframe

In [None]:
#del fda_df_extracted

In [4]:
def load_data_to_dataframe(collection):
    """Read every document of ``collection`` into a pandas DataFrame.

    Returns the DataFrame, or ``None`` when the collection holds no documents
    or when anything goes wrong while reading (the error is printed).
    """
    try:
        if not (collection.count_documents({}) > 0):
            print("No data available in MongoDB collection.")
            return None

        # Materialise the cursor before handing the records to pandas.
        documents = list(collection.find())
        frame = pd.DataFrame(documents)
        print("Data loaded from MongoDB collection into a DataFrame.")
        return frame
    except Exception as e:
        print(f"Error occurred while loading data from MongoDB: {e}")
        return None

# Pull the stored FDA documents into a DataFrame. The result is None if the
# collection was empty or loading failed, in which case the cells below will fail.
fda_df_extracted = load_data_to_dataframe(collection)

Data loaded from MongoDB collection into a DataFrame.


In [5]:
# Confirm the DataFrame was created by displaying its first five rows.
fda_df_extracted.head()

Unnamed: 0,_id,country,city,address_1,reason_for_recall,address_2,product_quantity,code_info,center_classification_date,distribution_pattern,...,recall_number,initial_firm_notification,product_type,event_id,more_code_info,recall_initiation_date,postal_code,voluntary_mandated,status,termination_date
0,6443dcc4810d558c77686d94,United States,Davie,4131 SW 47th Ave Ste 1403,Recall initiated as a precautionary measure du...,,"1,990 bottles","UPC No. 632687615989; Lot No. 30661601, Exp. D...",20161025,"FL, MI, MS, and OH.",...,F-0276-2017,Letter,Food,75272,,20160808,33314-4036,Voluntary: Firm initiated,Ongoing,
1,6443dcc4810d558c77686d95,United States,Millbrae,375 Adrian Rd,"Mooncake products, manufactured and distribute...",,"2 cases (1 pc/bx, 48bx/cs)","FG-M1MOT-UW Best by Nov 1, 2016.",20170106,"CA, WA, OR.",...,F-0865-2017,"Two or more of the following: Email, Fax, Lett...",Food,75069,,20160831,94030-3104,Voluntary: Firm initiated,Terminated,20170111.0
2,6443dcc4810d558c77686d96,United States,Miami,13439 NW 19 LANE,Virginia State (VDACS) found Listeria monocyto...,,144 pieces,UPC 635349 000390 Best By dates: 07/01/14 thr...,20141202,"FL, GA. NC, and TN",...,F-0609-2015,"Two or more of the following: Email, Fax, Lett...",Food,69516,,20141010,33182,Voluntary: Firm initiated,Terminated,20170328.0
3,6443dcc4810d558c77686d97,United States,Pompano Beach,2300 NW 19th St,FreshPoint South Florida is recalling sliced f...,,7 cases,Item # 302940.,20120808,Products were distributed in South Florida.,...,F-1922-2012,"Two or more of the following: Email, Fax, Lett...",Food,62750,,20120727,33069-5227,Voluntary: Firm initiated,Terminated,20141008.0
4,6443dcc4810d558c77686d98,United States,Rancho Dominguez,2610 Homestead Pl,Firm was notified by supplier that Organic Gro...,,xx,Lot codes: 72746,20200413,"nationwide, Canada and Netherlands",...,F-0904-2020,"Two or more of the following: Email, Fax, Lett...",Food,85253,,20200224,90220-5610,Voluntary: Firm initiated,Terminated,20210202.0


### Data Cleaning and Pre-Processing

In [6]:
def replace_empty_strings_with_none(df, columns):
    """Turn empty-string cells into missing values in the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned, so callers
        that keep using their original reference see the change.
    columns : iterable of str
        Column labels to process; a label that is not in ``df`` raises
        ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for column in columns:
        # Assign the result back instead of calling replace(..., inplace=True) on
        # the selected column: an in-place edit of a column selection is not
        # guaranteed to reach the parent frame (pandas Copy-on-Write). The dict
        # form states explicitly that None is the replacement value.
        df[column] = df[column].replace({'': None})
    return df

# Columns in which empty strings are converted to missing values, so that the
# null counts and the dropna/fillna steps further down treat them as missing.
columns_to_replace_empty_strings = ["address_1","address_2", "more_code_info", "openfda", "_id","city", "country", "product_quantity", "recall_number", "reason_for_recall", "recalling_firm",
                                       "termination_date", "status", "voluntary_mandated", "initial_firm_notification",
                                       "product_type", "classification", "code_info", "state", "postal_code",
                                       "center_classification_date", "distribution_pattern", "product_description",
                                       "report_date", "event_id", "recall_initiation_date"]
# The helper returns the frame it was given, so `df` and `fda_df_extracted`
# refer to the same cleaned object after this line.
df = replace_empty_strings_with_none(fda_df_extracted, columns_to_replace_empty_strings)

In [7]:
def null_value_count_before_processing(df):
    """Print how many missing values each column of ``df`` has.

    The frame is not altered; it is returned so the call can be assigned.
    """
    print(df.isnull().sum())
    return df

# Show the per-column missing-value counts before any rows or columns are removed.
df = null_value_count_before_processing(fda_df_extracted)

_id                               0
country                           0
city                              0
address_1                         2
reason_for_recall                 0
address_2                     22357
product_quantity               1617
code_info                        22
center_classification_date        0
distribution_pattern              0
state                           313
product_description               0
report_date                       0
classification                    0
openfda                           0
recalling_firm                    0
recall_number                     1
initial_firm_notification         7
product_type                      0
event_id                          0
more_code_info                23739
recall_initiation_date            0
postal_code                     327
voluntary_mandated                6
status                            0
termination_date               1553
dtype: int64


In [8]:
def preprocess_fda_data(df):
    """Clean the raw FDA enforcement frame and return a new, processed frame.

    Steps, in order:

    1. drop the columns that are not used afterwards;
    2. drop rows that lack any of the required fields;
    3. fill gaps in the categorical columns with the column's most frequent value;
    4. fill gaps in the remaining text columns with ``"Unknown"``;
    5. parse the date columns to datetime (unparseable or missing -> ``NaT``);
    6. add ``recall_number_digits``, the digits of ``recall_number`` as an int.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The processed copy. Feeding the result back in is safe, so the
        notebook cell that overwrites its own input can be re-run.

    Raises
    ------
    KeyError
        If a required, categorical, text or date column is absent.
    ValueError
        If a ``recall_number`` contains no digits at all.

    NOTE(review): requiring ``termination_date`` removes every row without one,
    which looks like it excludes recalls that are still ongoing — confirm this
    is intended.
    """

    def drop_columns(df, columns):
        # errors='ignore' tolerates columns that are already gone (second run).
        return df.drop(columns=columns, errors='ignore')

    def drop_na_rows(df, columns):
        # .copy() gives an independent frame so the column assignments below
        # never act on a view of the caller's data.
        return df.dropna(subset=columns).copy()

    def fillna_with_mode(df, column):
        modes = df[column].mode()
        if modes.empty:
            # Column has no non-null value to derive a mode from; leave it as is.
            return df
        # Assign back rather than fillna(inplace=True) on a column selection,
        # which is not guaranteed to update the frame (pandas Copy-on-Write).
        df[column] = df[column].fillna(modes.iloc[0])
        return df

    def fillna_with_unknown(df, columns):
        df[columns] = df[columns].fillna("Unknown")
        return df

    def convert_to_datetime(df, columns):
        for column in columns:
            df[column] = pd.to_datetime(df[column], errors='coerce')
        return df

    def extract_digits_from_recall_numbers(df):
        df['recall_number_digits'] = df['recall_number'].str.replace(r'\D', '', regex=True).astype(int)
        return df

    # Drop unnecessary columns
    columns_to_drop = ['address_2', 'more_code_info', 'openfda', '_id']
    df = drop_columns(df, columns_to_drop)

    # Drop rows with missing values in specified columns
    columns_with_missing_values = ["recall_initiation_date", "recall_number", "product_quantity", "country", "city",
                                   "reason_for_recall", "recalling_firm", "termination_date", "status"]
    df = drop_na_rows(df, columns_with_missing_values)

    # Fill missing values with mode in specified columns
    columns_to_fillna_with_mode = ["voluntary_mandated", "initial_firm_notification", "product_type", "classification"]
    for column in columns_to_fillna_with_mode:
        df = fillna_with_mode(df, column)

    # Fill NaN values with "Unknown" in specified text columns. The date columns
    # are deliberately not included: a missing date must become NaT in the next
    # step, and writing "Unknown" into it first would only be coerced away again.
    columns_to_fillna_with_unknown = ["address_1", "code_info", "state", "postal_code",
                                      "distribution_pattern", "product_description", "event_id"]
    df = fillna_with_unknown(df, columns_to_fillna_with_unknown)

    # Convert columns to datetime
    columns_to_convert_to_datetime = ['recall_initiation_date', 'center_classification_date', 'report_date', 'termination_date']
    df = convert_to_datetime(df, columns_to_convert_to_datetime)

    # Extract digits from recall numbers and convert to integers
    df = extract_digits_from_recall_numbers(df)

    return df

# Update fda_df_extracted with the preprocessed data. The raw frame is
# overwritten here, so from this point on the name refers to the cleaned data.
fda_df_extracted = preprocess_fda_data(fda_df_extracted)

In [9]:
def null_value_count_after_processing(df):
    """Print the number of missing values per column and hand ``df`` back unchanged."""
    missing_per_column = df.isnull().sum()
    print(missing_per_column)
    return df

# Show the per-column missing-value counts again to verify the cleaning step.
df = null_value_count_after_processing(fda_df_extracted)

country                       0
city                          0
address_1                     0
reason_for_recall             0
product_quantity              0
code_info                     0
center_classification_date    0
distribution_pattern          0
state                         0
product_description           0
report_date                   0
classification                0
recalling_firm                0
recall_number                 0
initial_firm_notification     0
product_type                  0
event_id                      0
recall_initiation_date        1
postal_code                   0
voluntary_mandated            0
status                        0
termination_date              0
recall_number_digits          0
dtype: int64


In [29]:
#fda_df_extracted.dtypes

In [17]:
# Convert DataFrame to CSV
#fda_df_extracted.to_csv('fda_df_extracted.csv', index=False)