In [1]:
#Import necessary packages:
import pandas as pd
import csv
import matplotlib.pyplot as plt   

In [2]:
#Loading .csv file to Pandas dataframe
adfood_eff= pd.read_csv('CAERS_ASCII_2004_2017Q2.csv')

In [3]:
adfood_eff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90786 entries, 0 to 90785
Data columns (total 12 columns):
RA_Report #                        90786 non-null int64
RA_CAERS Created Date              90786 non-null object
AEC_Event Start Date               53653 non-null object
PRI_Product Role                   90786 non-null object
PRI_Reported Brand/Product Name    90786 non-null object
PRI_FDA Industry Code              90786 non-null int64
PRI_FDA Industry Name              90786 non-null object
CI_Age at Adverse Event            52926 non-null float64
CI_Age Unit                        90786 non-null object
CI_Gender                          90786 non-null object
AEC_One Row Outcomes               90786 non-null object
SYM_One Row Coded Symptoms         90781 non-null object
dtypes: float64(1), int64(2), object(9)
memory usage: 8.3+ MB


In [4]:
#Removing unwanted columns which are not useful for my prediction
adfood_eff= adfood_eff.drop(['PRI_Reported Brand/Product Name', 'RA_CAERS Created Date', 'AEC_Event Start Date', 'PRI_Product Role', 'PRI_FDA Industry Code'], axis=1)

In [5]:
#Renaming the columns
adfood_eff.columns= ['Report Number', 'Industry Name', 'Age', 'Age Unit', 'Gender', 'Outcomes', 'Symptoms']

In [6]:
#Extracting the serious and non-serious outcomes
adfood_eff['outcome']=((adfood_eff['Outcomes'].str.contains("(?:\s|^)HOSPITALIZATION(?:\s|$)"))|
                   (adfood_eff['Outcomes'].str.contains("(?:\s|^)OTHER SERIOUS \(IMPORTANT MEDICAL EVENTS\)(?:\s|$)"))|
                   (adfood_eff['Outcomes'].str.contains("(?:\s|^)LIFE THREATENING(?:\s|$)"))|
                   (adfood_eff['Outcomes'].str.contains("(?:\s|^)SERIOUS INJURIES\/ ILLNESS(?:\s|$)"))|
                   (adfood_eff['Outcomes'].str.contains("(?:\s|^)DISABILITY(?:\s|$)"))|
                   (adfood_eff['Outcomes'].str.contains("(?:\s|^)DEATH(?:\s|$)"))|
                   (adfood_eff['Outcomes'].str.contains("(?:\s|^)CONGENITAL ANOMALY(?:\s|$)")))

In [7]:
#Converting the outcomes into binary values..60
adfood_eff['Serious']= [1 if o==True else 0 for o in adfood_eff['outcome']]

In [8]:
#Dropping the unnecessary columns
adfood_eff= adfood_eff.drop(['outcome'], axis=1)

In [9]:
#Drop NaN(missing age) values
adfood_eff=adfood_eff.dropna(subset=['Symptoms'])

In [10]:
#Age units
adfood_eff['Age Unit'].unique()

array(['Year(s)', 'Not Available', 'Month(s)', 'Week(s)', 'Day(s)',
       'Decade(s)'], dtype=object)

In [11]:
#Converting all age into years
adfood_eff.loc[adfood_eff['Age Unit']=='Month(s)', 'Age']=  adfood_eff['Age']/10
adfood_eff.loc[adfood_eff['Age Unit']=='Week(s)', 'Age']=  adfood_eff['Age']/52.1429
adfood_eff.loc[adfood_eff['Age Unit']=='Day(s)', 'Age']=  adfood_eff['Age']/365
adfood_eff.loc[adfood_eff['Age Unit']=='Decade(s)', 'Age']=  adfood_eff['Age']*10 
adfood_eff['Age Unit']= 'Year(s)'

In [12]:
#Dropping the ages greater than 110 years
adfood_eff= adfood_eff.drop(adfood_eff[adfood_eff['Age']>110].index)

In [13]:
adfood_eff['ABDOMINAL SYMPTOMS']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*DIARRHOEA,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*VOMITING,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*ABDOMINAL,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*PAIN,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*DYSGEUSIA,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*RASH,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*NAUSEA,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*GASTROINTESTINAL,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*GASTRIC,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*MALAISE,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*LOOSE STOOLS,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*BURNING SENSATION,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*FAECES,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*GASTROOESOPHAGEAL,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*GASTROENTERITIS,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*POTASSIUM,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*STOMACH,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*GASTRITIS,*(?:\s|$)")))

adfood_eff['CHEST/HEART RELATED SYMPTOMS']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*CHOKING,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*BLOOD BILIRUBIN,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*BLOOD PRESSURE,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*CHEST,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*DYSPNEA,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*LOSS OF CONSCIOUSNESS,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*HEART RATE,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*PHARYNGEAL OEDEMA,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*HYPERTENSION,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*SYNCOPE,*(?:\s|$)"))|
                                   (adfood_eff['Symptoms'].str.contains("(?:\s|^)*MYOCARDIAL INFARCTION,*(?:\s|$)"))|
                                (adfood_eff['Symptoms'].str.contains("(?:\s|^)*FALL,*(?:\s|$)"))|
                                    (adfood_eff['Symptoms'].str.contains("(?:\s|^)*DYSPHAGIA,*(?:\s|$)"))|
                                    (adfood_eff['Symptoms'].str.contains("(?:\s|^)*ATRIAL,*(?:\s|$)"))|
                                    (adfood_eff['Symptoms'].str.contains("(?:\s|^)*HYPOTENSION,*(?:\s|$)"))|
                                    (adfood_eff['Symptoms'].str.contains("(?:\s|^)*TACHYCARDIA,*(?:\s|$)"))|
                                    (adfood_eff['Symptoms'].str.contains("(?:\s|^)*PNEUMONIA,*(?:\s|$)"))|
                                    (adfood_eff['Symptoms'].str.contains("(?:\s|^)*ARRHYTHMIA,*(?:\s|$)"))|
                                    (adfood_eff['Symptoms'].str.contains("(?:\s|^)*CARDIAC,*(?:\s|$)")))
                                  

adfood_eff['COUGH/HEADACHE/DIZZINESS/NAUSEA']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*COUGH,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*VOMITING,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*DIZZINESS,*(?:\s|$)"))|
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*HEADACHE,*(?:\s|$)"))|                                                   
                                                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*NAUSEA,*(?:\s|$)"))|
                                              (adfood_eff['Symptoms'].str.contains("(?:\s|^)*WHEEZING,*(?:\s|$)"))|
                                              (adfood_eff['Symptoms'].str.contains("(?:\s|^)*FATIGUE,*(?:\s|$)"))|
                                              (adfood_eff['Symptoms'].str.contains("(?:\s|^)*RETCHING,*(?:\s|$)"))|
                                              (adfood_eff['Symptoms'].str.contains("(?:\s|^)*LETHARGY,*(?:\s|$)"))|
                                              (adfood_eff['Symptoms'].str.contains("(?:\s|^)*MAGRAINE,*(?:\s|$)")))

adfood_eff['CANCER']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*CANCER,*(?:\s|$)"))

adfood_eff['IMMUNE SYMPTOMS']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*COELIAC DISEASE,*(?:\s|$)"))|
                               (adfood_eff['Symptoms'].str.contains("(?:\s|^)*HYPERSENSITIVITY,*(?:\s|$)"))|
                              (adfood_eff['Symptoms'].str.contains("(?:\s|^)*TENDERNESS,*(?:\s|$)")))

adfood_eff['HAIR LOSS SYMPTOMS']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*ALOPECIA,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*HAIR,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*TRICHORRHEXIS,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*HYPOTRICHOSIS,*(?:\s|$)")))

adfood_eff['SKIN/BODY ALLERGIES']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*RASH,*(?:\s|$)"))|
                                   (adfood_eff['Symptoms'].str.contains("(?:\s|^)*URTICARIA,*(?:\s|$)"))|
                                   (adfood_eff['Symptoms'].str.contains("(?:\s|^)*PRURITUS,*(?:\s|$)"))|
                                   (adfood_eff['Symptoms'].str.contains("(?:\s|^)*SWELLING,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*ERYTHEMA,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*BODY TEMPERATURE,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*BLISTER,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*FLUSHING,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*SKIN,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*HOT,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*SWOLLEN,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*OEDEMA PERIPHERAL,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*INFECTION,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*INFLAMMATION,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*ALLERGY,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*HYPERAEMIA,*(?:\s|$)")))    

adfood_eff['ASTHENIA']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*ASTHENIA,*(?:\s|$)"))
adfood_eff['FOREIGN BODY TRAUMA']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*FOREIGN BODY,*(?:\s|$)"))
adfood_eff['DEHYDRATION']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*DEHYDRATION,*(?:\s|$)"))
adfood_eff['DYSPEPSIA']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*DYSPEPSIA,*(?:\s|$)"))
adfood_eff['PARAESTHESIA']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*PARAESTHESIA,*(?:\s|$)"))
adfood_eff['PALPITATIONS']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*PALPITATIONS,*(?:\s|$)"))
adfood_eff['HYPERHIDROSIS']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*HYPERHIDROSIS,*(?:\s|$)"))
adfood_eff['CONSTIPATION']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*CONSTIPATION,*(?:\s|$)"))
adfood_eff['THROAT SYMPTOMS']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*THROAT,*(?:\s|$)"))|
                            (adfood_eff['Symptoms'].str.contains("(?:\s|^)*PHARYNGITIS,*(?:\s|$)"))|
                            (adfood_eff['Symptoms'].str.contains("(?:\s|^)*DYSPHONIA,*(?:\s|$)"))|
                              (adfood_eff['Symptoms'].str.contains("(?:\s|^)*SPEECH,*(?:\s|$)"))|
                              (adfood_eff['Symptoms'].str.contains("(?:\s|^)*ORAL,*(?:\s|$)")))
    
adfood_eff['FLATULENCE']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*FLATULENCE,*(?:\s|$)"))
adfood_eff['CHILLS']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*CHILLS,*(?:\s|$)"))

adfood_eff['INSOMNIA']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*INSOMNIA,*(?:\s|$)"))|
                        (adfood_eff['Symptoms'].str.contains("(?:\s|^)*SOMNOLENCE,*(?:\s|$)")))

adfood_eff['TREMOR']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*TREMOR,*(?:\s|$)"))
adfood_eff['HYPOAESTHESIA']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*HYPOAESTHESIA,*(?:\s|$)"))
adfood_eff['CONVULSION']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*CONVULSION,*(?:\s|$)"))
adfood_eff['JAUNDICE']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*JAUNDICE,*(?:\s|$)"))
adfood_eff['WEIGHT ALLERGIES']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*WEIGHT,*(?:\s|$)"))
adfood_eff['HAEMATOCHEZIA']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*HAEMATOCHEZIA,*(?:\s|$)"))
adfood_eff['PYREXIA']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*PYREXIA,*(?:\s|$)"))
adfood_eff['APPETITE SYMPTOMS']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*APPETITE,*(?:\s|$)"))
adfood_eff['AMINOTRANSFERASE']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*AMINOTRANSFERASE,*(?:\s|$)"))
adfood_eff['ABNORMAL']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*ABNORMAL,*(?:\s|$)"))|
                        (adfood_eff['Symptoms'].str.contains("(?:\s|^)*FEAR,*(?:\s|$)"))|
                        (adfood_eff['Symptoms'].str.contains("(?:\s|^)*PANIC,*(?:\s|$)"))|
                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*IRRITABILITY,*(?:\s|$)"))|
                       (adfood_eff['Symptoms'].str.contains("(?:\s|^)*COLD SWEAT,*(?:\s|$)")))
                        

In [14]:
adfood_eff['HEPATIC ENZYME']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*HEPATIC ENZYME,*(?:\s|$)"))

adfood_eff['HAEMORRHAGE']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*HAEMORRHAGE,*(?:\s|$)"))|
                           (adfood_eff['Symptoms'].str.contains("(?:\s|^)*EPISTAXIS,*(?:\s|$)")))

adfood_eff['CEREBROVASCULAR SYSTOMS']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*CEREBROVASCULAR,*(?:\s|$)"))
adfood_eff['BACK BONES/MUSCLES']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*BACK,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*MYALGIA,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*ARTHRALGIA,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*MUSCULAR,*(?:\s|$)")))

adfood_eff['FOOD POISONING']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*FOOD POISONING,*(?:\s|$)"))
adfood_eff['VISION/EYE']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*VISION,*(?:\s|$)"))|
                         (adfood_eff['Symptoms'].str.contains("(?:\s|^)*EYE,*(?:\s|$)"))|
                         (adfood_eff['Symptoms'].str.contains("(?:\s|^)*VISUAL,*(?:\s|$)")))

adfood_eff['DEATH']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*DEATH,*(?:\s|$)"))|
                     (adfood_eff['Symptoms'].str.contains("(?:\s|^)*AGGRAVATED,*(?:\s|$)")))

adfood_eff['URINARY INFECTIONS']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*CHROMATURIA,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*URINE,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*URINARY,*(?:\s|$)")))
    
adfood_eff['CONFUSIONAL STATE']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*CONFUSIONAL,*(?:\s|$)"))
adfood_eff['DIABETES']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*BLOOD GLUCOSE,*(?:\s|$)"))
adfood_eff['HOSPITALISATION']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*HOSPITALISATION,*(?:\s|$)"))
adfood_eff['ANAPHYLACTIC REACTION']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*ANAPHYLACTIC,*(?:\s|$)"))  



adfood_eff['BLOOD RELATED SYSTOMS']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*BLOOD,*(?:\s|$)"))
adfood_eff['EMOTIONAL SYMPTOMS']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*EMOTIONAL,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*DEPRESSION,*(?:\s|$)")))
                                 

adfood_eff['INFLUENZA LIKE ILLNESS']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*INFLUENZA,*(?:\s|$)"))
adfood_eff['NERVOUSNESS']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*NERVOUSNESS,*(?:\s|$)"))
adfood_eff['GAIT DISTURBANCE']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*GAIT,*(?:\s|$)"))

adfood_eff['LIVER INFECTIONS']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*LIVER,*(?:\s|$)"))|
                                (adfood_eff['Symptoms'].str.contains("(?:\s|^)*HEPATITIS,*(?:\s|$)")))
                                
adfood_eff['DISCOMFORT']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*DISCOMFORT,*(?:\s|$)"))|
                          (adfood_eff['Symptoms'].str.contains("(?:\s|^)*BALANCE,*(?:\s|$)")))

adfood_eff['KIDNEY SYMPTOMS']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*RENAL,*(?:\s|$)"))|
                               (adfood_eff['Symptoms'].str.contains("(?:\s|^)*NEPHROLITHIASIS,*(?:\s|$)"))|
                              (adfood_eff['Symptoms'].str.contains("(?:\s|^)*KIDNEY,*(?:\s|$)"))|
                              (adfood_eff['Symptoms'].str.contains("(?:\s|^)*RHABDOMYOLYSIS,*(?:\s|$)")))

adfood_eff['DENTAL SYMPTOMS']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*TOOTH,*(?:\s|$)"))
adfood_eff['SKIN/BODY INJURY']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*LACERATION,*(?:\s|$)"))|
                                (adfood_eff['Symptoms'].str.contains("(?:\s|^)*INJURY,*(?:\s|$)")))

adfood_eff['BREATHING PROBLEMS']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*ASTHMA,*(?:\s|$)"))|
                                  (adfood_eff['Symptoms'].str.contains("(?:\s|^)*BREATH,*(?:\s|$)"))|
                                 (adfood_eff['Symptoms'].str.contains("(?:\s|^)*WHEEZING,*(?:\s|$)")))


adfood_eff['EXAMINATION']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*PHYSICAL,*(?:\s|$)"))
adfood_eff['MEMORY INJURY']=((adfood_eff['Symptoms'].str.contains("(?:\s|^)*MEMORY,*(?:\s|$)"))|
                             (adfood_eff['Symptoms'].str.contains("(?:\s|^)*AMNESIA,*(?:\s|$)")))

adfood_eff['BLOOD CLOTTING']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*THROMBOSIS,*(?:\s|$)"))

adfood_eff['DAILING LIVING']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*ACTIVITIES OF DAILY LIVING,*(?:\s|$)"))
adfood_eff['PANCREATITIS']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*PANCREATITIS,*(?:\s|$)"))
adfood_eff['SALMONELLA']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*SALMONELLA,*(?:\s|$)"))
adfood_eff['ABASIA']=(adfood_eff['Symptoms'].str.contains("(?:\s|^)*ABASIA,*(?:\s|$)"))


In [15]:
symptoms_list= ['DIARRHOEA','VOMITING','ABDOMINAL','PAIN','DYSGEUSIA','RASH','NAUSEA','GASTROINTESTINAL','GASTRIC','MALAISE'
                ,'LOOSE STOOLS','BURNING SENSATION','FAECES','GASTROOESOPHAGEAL','GASTROENTERITIS','POTASSIUM'
                ,'STOMACH','GASTRITIS','CHOKING','BLOOD BILIRUBIN','BLOOD PRESSURE','CHEST','DYSPNEA','LOSS OF CONSCIOUSNESS'
                ,'HEART RATE','PHARYNGEAL OEDEMA','HYPERTENSION','SYNCOPE','MYOCARDIAL INFARCTION','FALL','DYSPHAGIA','ATRIAL'
                ,'HYPOTENSION','TACHYCARDIA','PNEUMONIA','ARRHYTHMIA','CARDIAC','COUGH','VOMITING','DIZZINESS','HEADACHE'
                ,'NAUSEA','WHEEZING','FATIGUE','RETCHING','LETHARGY','MAGRAINE','CANCER','COELIAC DISEASE','HYPERSENSITIVITY'
                ,'TENDERNESS','ALOPECIA','HAIR','TRICHORRHEXIS','HYPOTRICHOSIS','RASH','URTICARIA','PRURITUS','SWELLING'
                ,'ERYTHEMA','BODY TEMPERATURE','BLISTER','FLUSHING','SKIN','HOT','SWOLLEN','OEDEMA PERIPHERAL','INFECTION'
                ,'INFLAMMATION','ALLERGY','HYPERAEMIA','ASTHENIA','FOREIGN BODY','DEHYDRATION','DYSPEPSIA','PARAESTHESIA'
                ,'PALPITATIONS','HYPERHIDROSIS','CONSTIPATION','THROAT','PHARYNGITIS','DYSPHONIA','SPEECH','ORAL','FLATULENCE'
                ,'CHILLS','INSOMNIA','SOMNOLENCE','TREMOR','HYPOAESTHESIA','CONVULSION','JAUNDICE','WEIGHT','HAEMATOCHEZIA'
                ,'PYREXIA','APPETITE','AMINOTRANSFERASE','ABNORMAL','FEAR','PANIC','IRRITABILITY','COLD SWEAT','HEPATIC ENZYME'
                ,'HAEMORRHAGE','EPISTAXIS','CEREBROVASCULAR','BACK','MYALGIA','ARTHRALGIA','MUSCULAR','FOOD POISONING','VISION'
                ,'EYE','VISUAL','DEATH','AGGRAVATED','CHROMATURIA','URINE','URINARY','CONFUSIONAL','BLOOD GLUCOSE'
                ,'HOSPITALISATION','ANAPHYLACTIC','BLOOD','EMOTIONAL','DEPRESSION','INFLUENZA','NERVOUSNESS','GAIT','LIVER'
                ,'HEPATITIS','DISCOMFORT','BALANCE','RENAL','NEPHROLITHIASIS','KIDNEY','RHABDOMYOLYSIS','TOOTH','LACERATION'
                ,'INJURY','ASTHMA','BREATH','WHEEZING','PHYSICAL','MEMORY','AMNESIA','THROMBOSIS','ACTIVITIES OF DAILY LIVING'
                ,'PANCREATITIS','SALMONELLA','ABASIA']

In [16]:
def stringChecker(stringList, word):
    truth_list = [string in word for string in stringList]
    return max(pd.Series(truth_list))

In [17]:
adfood_eff['symptoms_list']= adfood_eff['Symptoms'].apply(lambda x: stringChecker(symptoms_list, x))

In [18]:
adfood_eff['Other']= [False if symps==True else True for symps in adfood_eff['symptoms_list']]

In [19]:
#Dropping the unnecessary columns
adfood_eff= adfood_eff.drop(['symptoms_list'], axis=1)

In [20]:
adfood_eff.head()

Unnamed: 0,Report Number,Industry Name,Age,Age Unit,Gender,Outcomes,Symptoms,Serious,ABDOMINAL SYMPTOMS,CHEST/HEART RELATED SYMPTOMS,...,SKIN/BODY INJURY,BREATHING PROBLEMS,EXAMINATION,MEMORY INJURY,BLOOD CLOTTING,DAILING LIVING,PANCREATITIS,SALMONELLA,ABASIA,Other
0,65325,Bakery Prod/Dough/Mix/Icing,2.0,Year(s),Female,"VISITED AN ER, VISITED A HEALTH CARE PROVIDER,...","SWELLING FACE, RASH, WHEEZING, COUGH, HOSPITAL...",1,True,False,...,False,True,False,False,False,False,False,False,False,False
1,65325,Bakery Prod/Dough/Mix/Icing,2.0,Year(s),Female,"VISITED AN ER, VISITED A HEALTH CARE PROVIDER,...","SWELLING FACE, WHEEZING, COUGH, RASH, HOSPITAL...",1,True,False,...,False,True,False,False,False,False,False,False,False,False
2,65333,Ice Cream Prod,,Year(s),Female,VISITED AN ER,"NAUSEA, DYSGEUSIA, DIARRHOEA",0,True,False,...,False,False,False,False,False,False,False,False,False,False
3,65335,Baby Food Prod,0.3,Year(s),Not Available,NON-SERIOUS INJURIES/ ILLNESS,"GASTROINTESTINAL DISORDER, VOMITING",0,True,False,...,False,False,False,False,False,False,False,False,False,False
4,65336,Baby Food Prod,,Year(s),Not Available,VISITED A HEALTH CARE PROVIDER,"GASTROINTESTINAL DISORDER, PHYSICAL EXAMINATION",0,True,False,...,False,False,True,False,False,False,False,False,False,False


In [21]:
#Creating dummy variables
cat_vars=['Industry Name','Gender','Age Unit']
for var in cat_vars:
    cat_list='var'+'_'+var
    cat_list = pd.get_dummies(adfood_eff[var], prefix=var)
    adfood_eff1=adfood_eff.join(cat_list)
    adfood_eff=adfood_eff1

In [22]:
cat_vars=['Industry Name','Gender','Age Unit']
adfood_eff_vars=adfood_eff.columns.values.tolist()
to_keep=[i for i in adfood_eff_vars if i not in cat_vars]

In [23]:
#Final values and coulmns
adfood_eff_final=adfood_eff[to_keep]
adfood_eff_final.columns.values

array(['Report Number', 'Age', 'Outcomes', 'Symptoms', 'Serious',
       'ABDOMINAL SYMPTOMS', 'CHEST/HEART RELATED SYMPTOMS',
       'COUGH/HEADACHE/DIZZINESS/NAUSEA', 'CANCER', 'IMMUNE SYMPTOMS',
       'HAIR LOSS SYMPTOMS', 'SKIN/BODY ALLERGIES', 'ASTHENIA',
       'FOREIGN BODY TRAUMA', 'DEHYDRATION', 'DYSPEPSIA', 'PARAESTHESIA',
       'PALPITATIONS', 'HYPERHIDROSIS', 'CONSTIPATION', 'THROAT SYMPTOMS',
       'FLATULENCE', 'CHILLS', 'INSOMNIA', 'TREMOR', 'HYPOAESTHESIA',
       'CONVULSION', 'JAUNDICE', 'WEIGHT ALLERGIES', 'HAEMATOCHEZIA',
       'PYREXIA', 'APPETITE SYMPTOMS', 'AMINOTRANSFERASE', 'ABNORMAL',
       'HEPATIC ENZYME', 'HAEMORRHAGE', 'CEREBROVASCULAR SYSTOMS',
       'BACK BONES/MUSCLES', 'FOOD POISONING', 'VISION/EYE', 'DEATH',
       'URINARY INFECTIONS', 'CONFUSIONAL STATE', 'DIABETES',
       'HOSPITALISATION', 'ANAPHYLACTIC REACTION',
       'BLOOD RELATED SYSTOMS', 'EMOTIONAL SYMPTOMS',
       'INFLUENZA LIKE ILLNESS', 'NERVOUSNESS', 'GAIT DISTURBANCE',
      

In [24]:
from sklearn.model_selection import train_test_split
adfood_eff_train, adfood_eff_test = train_test_split(adfood_eff_final, test_size=0.3, random_state=42)

In [48]:
adfood_eff_Finaltest= adfood_eff_test.drop(['Serious', 'Symptoms', 'Outcomes'], axis=1)

In [25]:
X_training = adfood_eff_train.drop(['Serious', 'Symptoms', 'Outcomes'], axis=1)
y_training = adfood_eff_train['Serious']

### Recursive feature elimination to select important features

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import RFE
imp=Imputer(missing_values='NaN', strategy='mean', axis=0)
model= LogisticRegression()
rfe = RFE(model, 50)
steps= [('imputation', imp), ('rfe', rfe)]
pipeline= Pipeline(steps)

In [27]:
pipeline.fit(X_training, y_training)
print('Selected features: %s' % list(X_training.columns[rfe.support_]))

Selected features: ['Report Number', 'Age', 'ABDOMINAL SYMPTOMS', 'CHEST/HEART RELATED SYMPTOMS', 'COUGH/HEADACHE/DIZZINESS/NAUSEA', 'CANCER', 'SKIN/BODY ALLERGIES', 'FOREIGN BODY TRAUMA', 'CONSTIPATION', 'CHILLS', 'INSOMNIA', 'CONVULSION', 'JAUNDICE', 'WEIGHT ALLERGIES', 'AMINOTRANSFERASE', 'ABNORMAL', 'HEPATIC ENZYME', 'HAEMORRHAGE', 'CEREBROVASCULAR SYSTOMS', 'BACK BONES/MUSCLES', 'FOOD POISONING', 'DEATH', 'URINARY INFECTIONS', 'BLOOD RELATED SYSTOMS', 'EMOTIONAL SYMPTOMS', 'LIVER INFECTIONS', 'KIDNEY SYMPTOMS', 'DENTAL SYMPTOMS', 'Other', 'Industry Name_Baby Food Prod', 'Industry Name_Bakery Prod/Dough/Mix/Icing', 'Industry Name_Candy W/O Choc/Special/Chew Gum', 'Industry Name_Cereal Prep/Breakfast Food', 'Industry Name_Choc/Cocoa Prod', 'Industry Name_Coffee/Tea', 'Industry Name_Cosmetics', 'Industry Name_Fishery/Seafood Prod', 'Industry Name_Fruit/Fruit Prod', 'Industry Name_Ice Cream Prod', 'Industry Name_Milk/Butter/Dried Milk Prod', 'Industry Name_Mult Food Dinner/Grav/Sauce/

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score 
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc, log_loss



X_train, X_test, y_train, y_test = train_test_split(X_training, y_training, test_size=0.2, random_state=2)

# check classification scores of logistic regression
logreg = LogisticRegression()
imp=Imputer(missing_values='NaN', strategy='mean', axis=0)
steps= [('imputation', imp), ('Logistic Regression', logreg)]
pipeline= Pipeline(steps)
pipeline.fit(X_train, y_train)
y_pred= pipeline.predict(X_test)

print('Train/Test split results:')
print(logreg.__class__.__name__+" accuracy is %2.3f" % accuracy_score(y_test, y_pred))

Train/Test split results:
LogisticRegression accuracy is 0.574


In [31]:
X_training= imp.fit(X_training)


TypeError: float() argument must be a string or a number, not 'Imputer'

In [29]:
X_training["Age"].fillna(X_training["Age"].median(skipna=True), inplace=True)

In [30]:
# 10-fold cross-validation logistic regression
logreg = LogisticRegression()

scores_accuracy = cross_val_score(logreg, X_training, y_training, cv=10, scoring='accuracy')
scores_log_loss = cross_val_score(logreg, X_training, y_training, cv=10, scoring='neg_log_loss')
scores_auc = cross_val_score(logreg, X_training, y_training, cv=10, scoring='roc_auc')
print('K-fold cross-validation results:')
print(logreg.__class__.__name__+" average accuracy is %2.3f" % scores_accuracy.mean())
print(logreg.__class__.__name__+" average log_loss is %2.3f" % -scores_log_loss.mean())
print(logreg.__class__.__name__+" average auc is %2.3f" % scores_auc.mean())


K-fold cross-validation results:
LogisticRegression average accuracy is 0.570
LogisticRegression average log_loss is 0.670
LogisticRegression average auc is 0.667


In [35]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.model_selection import GridSearchCV



C = np.arange(1e-05, 5.5, 0.1)
scoring = {'Accuracy': 'accuracy', 'AUC': 'roc_auc', 'Log_loss': 'neg_log_loss'}
log_reg = LogisticRegression()


std_scale = StandardScaler(with_mean=False, with_std=False)
#std_scale = StandardScaler()



n_folds=5
n_repeats=5

rskfold = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_repeats, random_state=2)


log_clf_pipe = Pipeline(steps=[('scale',std_scale), ('clf',log_reg)])

log_clf = GridSearchCV(estimator=log_clf_pipe, cv=rskfold,
              scoring=scoring, return_train_score=True,
              param_grid=dict(clf__C=C), refit='Accuracy')

log_clf.fit(X_training, y_training)
results = log_clf.cv_results_
print('='*20)
print("best params: " + str(log_clf.best_estimator_))
print("best params: " + str(log_clf.best_params_))
print('best score:', log_clf.best_score_)
print('='*20)


best params: Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=False, with_std=False)), ('clf', LogisticRegression(C=1e-05, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])
best params: {'clf__C': 1e-05}
best score: 0.5697513377400063


In [50]:
adfood_eff_Finaltest["Age"].fillna(adfood_eff_Finaltest["Age"].median(skipna=True), inplace=True)

In [51]:

adfood_eff_Finaltest['Serious'] = log_clf.predict(adfood_eff_Finaltest)

Final_report = adfood_eff_Finaltest[['Report Number','Serious']]

Final_report.to_csv("Final_report.csv", index=False)

Final_report.tail(25)

Unnamed: 0,Report Number,Serious
51386,174758,1
52866,176105,1
78695,199561,1
5792,82698,1
75077,196689,1
37177,154918,1
31843,145578,1
17587,113363,1
89557,209439,1
4578,78571,1


In [52]:
Final_report

Unnamed: 0,Report Number,Serious
63301,185930,1
40521,160659,1
38309,156751,1
63859,186429,1
44051,166529,1
43133,165022,1
81576,201710,1
9408,94214,1
16640,111602,1
85840,205694,1
