### Imports

In [1]:
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, CollectionInvalid
import pandas as pd
from pandas import json_normalize
import numpy as np
import psycopg2
from psycopg2.errors import *
from sqlalchemy import create_engine
import ipywidgets as widgets
import plotly.graph_objects as go
from IPython.core.display import display
from re import search
from plotly.subplots import make_subplots
import logging

In [2]:
logging.basicConfig(filename='yelp_business_errors.log', level=logging.INFO)

### Getting Data and Dumping in MongoDB

In [3]:
def mongo_connection(db_name, collection_name):
    '''Function to create a connection with MongoDB
    INPUTS: The function expects two input variables, namely:
        db_name: Name of the Database holding the collection
        collection_name: Name of the collection to retrieve
    OUTPUT:
        collection: An object of the given collection name
    '''
    try:
        client = MongoClient('mongodb://localhost:27017')
        db = client[db_name]
        collection = db[collection_name]
        return collection
    
    except CollectionInvalid:
        print('Unable to connect to Database, check Database name!')
        logging.error("Exception occurred at mongo_connection", exc_info=True)
    except CollectionInvalid:
        print("Invalid Collection Name provided, check Collection name!")
        logging.error("Exception occurred at mongo_connection", exc_info=True)
    except Exception as e:
        print('Something went wrong check logs for more info!')
        logging.error("Exception occurred at mongo_connection", exc_info=True)
    

In [4]:
collection = mongo_connection('dap', 'drugs_recalled')

In [10]:
import requests
import time
for skip_records in range(0,13000, 1000):
    url = 'https://api.fda.gov/drug/enforcement.json?api_key=jzBDRKRffIUITo0uKg9EEWqasttW2cDni2b7ncUB&limit=1000&skip={0}'.format(skip_records)
    response = requests.get(url)
    json_to_dump = response.json()['results']
    collection.insert_many(json_to_dump)
    
    


{'error': {'code': 'NOT_FOUND', 'message': 'No matches found!'}}


In [5]:
drugs_recalled_df = json_normalize(collection.find())

In [35]:
temp_df.columns

Index(['_id', 'country', 'city', 'address_1', 'reason_for_recall', 'address_2',
       'product_quantity', 'code_info', 'center_classification_date',
       'distribution_pattern', 'state', 'product_description', 'report_date',
       'classification', 'recalling_firm', 'recall_number',
       'initial_firm_notification', 'product_type', 'event_id',
       'termination_date', 'recall_initiation_date', 'postal_code',
       'voluntary_mandated', 'status', 'openfda.application_number',
       'openfda.brand_name', 'openfda.generic_name',
       'openfda.manufacturer_name', 'openfda.product_ndc',
       'openfda.product_type', 'openfda.route', 'openfda.substance_name',
       'openfda.rxcui', 'openfda.spl_id', 'openfda.spl_set_id',
       'openfda.package_ndc', 'openfda.is_original_packager', 'openfda.upc',
       'openfda.nui', 'openfda.pharm_class_epc', 'openfda.unii', 'brand_name'],
      dtype='object')

In [50]:
temp_df = drugs_recalled_df[~drugs_recalled_df['openfda.brand_name'].isnull()]

In [10]:
temp_df.head()

Unnamed: 0,_id,country,city,address_1,reason_for_recall,address_2,product_quantity,code_info,center_classification_date,distribution_pattern,...,openfda.package_ndc,openfda.is_original_packager,openfda.upc,openfda.nui,openfda.pharm_class_epc,openfda.pharm_class_cs,openfda.unii,openfda.pharm_class_moa,openfda.pharm_class_pe,openfda.original_packager_product_ndc
5,6075b50e8d54e9e57d563f53,United States,Davie,4955 Orange Dr,Failed Tablet/Capsule Specifications: Recall d...,,"36,538 bottles","Lot #: 605956A, 605958A, Exp. 9/30/2014; 64467...",20140930,Nationwide and Puerto Rico,...,"[0591-0397-19, 0591-0397-60, 0591-0398-60]",[True],"[0305910397609, 0305910398606, 305910398606]","[N0000175785, M0017811]",[Prostaglandin E1 Analog [EPC]],"[Prostaglandins E, Synthetic [CS]]","[0E43V0BB57, QTG126297Q]",,,
16,6075b50e8d54e9e57d563f5e,United States,Lake Forest,275 N Field Dr,Lack of assurance of sterility: ineffective cr...,,"72,300 vials","Lot: 23-505-EV, Exp. 11/14",20130828,Nationwide and Puerto Rico,...,"[0409-3213-11, 0409-3213-12]",[True],,"[N0000175694, M0002356]",[Benzodiazepine [EPC]],[Benzodiazepines [CS]],[Q3JTX2Q7TU],,,
23,6075b50e8d54e9e57d563f65,United States,Hawthorne,3 Skyline Dr,Failed Content Uniformity Specifications.,,1396 Bottles,"Lot #: 149400, Expiry: January 2016; Lot #: ...",20140912,"United States including: OH, HI, NH, MS, IL, T...",...,"[51672-4027-1, 51672-4027-3, 51672-4027-7, 516...",[True],"[0351672403119, 0351672403010, 0351672403218, ...",,,,[6153CWM0CL],,,
28,6075b50e8d54e9e57d563f6a,United States,Bridgewater,400 Somerset Corporate Blvd,Failed Dissolution Specifications: high out of...,,22 bottles,"Lots: a)18J020P, Exp 08/2020; b) 18J034P, Exp ...",20190214,Nationwide in the USA and Puerto Rico,...,"[0187-0795-30, 0187-0795-42, 0187-0795-49, 018...",[True],,,,,[OLH94387TE],,,
29,6075b50e8d54e9e57d563f6b,United States,Princeton,107 College Rd E,Failed dissolution specifications - low dissol...,,12132 cartons,"Lot # KB50877, KB50878, Exp 11/17; KB50878, 01...",20170302,U.S. Nationwide,...,"[55111-135-79, 55111-135-81, 55111-136-79, 551...",[True],,"[N0000175607, M0018962]",[Retinoid [EPC]],[Retinoids [CS]],[EH28UP18IF],,,


In [33]:
def drop_cols(df, null_percentage):
    '''The function drops the columns which have null percentage greater than given null percentage by the user,
    INPUTS:
        df = The dataframe from which columns need to be dropped
        null_percentage = A numerical figure provided by user which acts as a threshold for null_percenatge 
    OUTPUT
        The function returns True if all columns are dropped successfully else it returns False
    '''
    try:
        total_rows = len(df)
        for column in df.columns:
            if (df[column].isnull().sum() / total_rows)*100> null_percentage:
                df.drop(columns=column, inplace=True)
        return True
    except Exception as e:
        logging.error("Exception occurred at drop_cols", exc_info=True)
        return False
if drop_cols(temp_df, 80):
    print('Columns Dropped Successfully')
else:
    print('Unable to drop columns check logs for more info')

Columns Dropped Successfully


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [12]:
from pandas_profiling import ProfileReport

In [51]:
col_list = ['openfda.application_number',
       'openfda.brand_name', 'openfda.generic_name',
       'openfda.manufacturer_name', 'openfda.product_ndc',
       'openfda.product_type', 'openfda.route', 'openfda.substance_name',
       'openfda.rxcui', 'openfda.spl_id', 'openfda.spl_set_id',
       'openfda.package_ndc', 'openfda.is_original_packager', 'openfda.upc',
       'openfda.nui', 'openfda.pharm_class_epc', 'openfda.unii']
for column in col_list:
    col = column.split('.')[1]
    temp_df[col] = temp_df[column].astype('str').apply(lambda x: x.replace('[','').replace(']',''))
    temp_df.drop(columns = column, inplace=True)
    print(temp_df[col])


5        'ANDA201089'
16       'ANDA071583'
23       'ANDA040301'
28        'NDA020062'
29       'ANDA202099'
             ...     
12979    'ANDA088325'
12980    'ANDA040659'
12984             nan
12986    'ANDA074532'
12993     'NDA022433'
Name: application_number, Length: 2132, dtype: object
5        'DICLOFENAC SODIUM AND MISOPROSTOL'
16                                'DIAZEPAM'
23                         'WARFARIN SODIUM'
28                             'CARDIZEM CD'
29                                'ZENATANE'
                        ...                 
12979              'LIDOCAINE HYDROCHLORIDE'
12980              'MECLIZINE HYDROCHLORIDE'
12984                            'GONIOSOFT'
12986                            'CAPTOPRIL'
12993                             'BRILINTA'
Name: brand_name, Length: 2132, dtype: object
5        'DICLOFENAC SODIUM AND MISOPROSTOL'
16                                'DIAZEPAM'
23                         'WARFARIN SODIUM'
28                 'DILTIAZE

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df[col] = temp_df[column].astype('str').apply(lambda x: x.replace('[','').replace(']',''))


In [53]:
temp_df.drop(columns = ['_id', 'product_type'], inplace=True)

In [55]:
temp_df.to_csv('drugs_recalled.csv', index=False)

In [46]:
report = ProfileReport(temp_df)
report.to_file(output_file= 'report_temp.html')

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=52.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [20]:
drugs_recalled_df.dtypes

_id                           object
country                       object
city                          object
address_1                     object
reason_for_recall             object
address_2                     object
product_quantity              object
code_info                     object
center_classification_date    object
distribution_pattern          object
state                         object
product_description           object
report_date                   object
classification                object
recalling_firm                object
recall_number                 object
initial_firm_notification     object
product_type                  object
event_id                      object
termination_date              object
recall_initiation_date        object
postal_code                   object
voluntary_mandated            object
status                        object
dtype: object

In [19]:
dict(drugs_recalled_df['event_id'].value_counts())

{'70452': 442,
 '80601': 390,
 '71371': 390,
 '65690': 351,
 '65707': 338,
 '65479': 299,
 '62416': 117,
 '83265': 104,
 '79619': 104,
 '73960': 104,
 '77992': 104,
 '79149': 104,
 '74057': 104,
 '65876': 91,
 '69152': 91,
 '83074': 91,
 '84546': 91,
 '81002': 91,
 '62443': 91,
 '85992': 91,
 '70148': 91,
 '82554': 91,
 '72594': 91,
 '72520': 91,
 '75763': 91,
 '80750': 78,
 '85341': 78,
 '83136': 78,
 '74466': 78,
 '65360': 78,
 '74187': 78,
 '72179': 78,
 '76472': 78,
 '75289': 78,
 '64661': 78,
 '83427': 65,
 '70656': 65,
 '66566': 65,
 '78553': 65,
 '76644': 65,
 '72455': 65,
 '76912': 65,
 '78575': 65,
 '84055': 65,
 '72241': 65,
 '83669': 52,
 '85404': 52,
 '73662': 52,
 '77840': 52,
 '84978': 52,
 '86990': 52,
 '64188': 52,
 '67723': 52,
 '73925': 52,
 '81810': 52,
 '65019': 52,
 '64879': 52,
 '71618': 52,
 '84527': 52,
 '71806': 52,
 '71989': 52,
 '64957': 52,
 '73899': 52,
 '78063': 52,
 '80118': 52,
 '86123': 52,
 '82398': 52,
 '78227': 39,
 '84263': 39,
 '71104': 39,
 '68793

In [33]:
list(drugs_recalled_df['distribution_pattern'].apply(.unique())

['Nationwide',
 'Nationwide in the USA',
 'Nationwide to compounding pharmacies and research organizations.',
 'Distributed Nationwide in the USA',
 'Nationwide and Puerto Rico',
 'All product is sold to Ace Distributors, LLC in Orlando, FL, a retail store, who sells directly to owners of gas stations, convenience stores, and smoke shops where it can be further distributed nationwide.',
 'Domestic distribution currently under Investigation; International distribution includes, but may not be limited to the following countries : Mexico, France, Brazil, Belgium, Denmark, Switzerland, Spain, Canada, Italy, Japan, Ireland, Venezuela, Oman.',
 'U.S. Nationwide',
 'nationwide, specifically:  AK, AL, AZ, CA, CO, CT, DC, FL, GA, HI, IL, IN, KS, KY, LA, MA, MD, MN, MO, MS, NC,   NE, NH, NJ, NM, NV, NY, OH, OK, OR, PA, SC, TN, TX, UT, VA, WA, WI, and WV',
 'Repacked drugs were distributed in Arizona, California, Oregon, and Washington.',
 'Product distributed to NY, TX, SC, CO, MN and NJ',
 'Nat

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cheta\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [10]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)

filtered_sentence = [w for w in word_tokens if not w in stop_words]
print(filtered_sentence)

['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']
