In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Function that calculates the accuracy of the model
def get_accuracy(y_pred, y_test):
    """Print and return the percentage of predictions that match the true labels.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_test : array-like
        True labels, compared position by position with ``y_pred``.

    Returns
    -------
    float
        Accuracy as a percentage between 0 and 100.

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of labels.
    """
    # Flatten both inputs to NumPy arrays so the comparison is strictly
    # positional. Indexing a pandas Series with [i] is label-based, which
    # fails (or pairs the wrong rows) once the index has been shuffled.
    predicted = np.asarray(y_pred).ravel()
    actual = np.asarray(y_test).ravel()

    if predicted.size != actual.size:
        raise ValueError(
            "y_pred and y_test must have the same number of labels "
            "(got {} and {})".format(predicted.size, actual.size)
        )
    if predicted.size == 0:
        raise ValueError("cannot compute accuracy on empty input")

    correctly_classified = int(np.count_nonzero(predicted == actual))
    acc = (correctly_classified / predicted.size) * 100
    print("Accuracy on testset = {:.2f}".format(acc))
    return acc

In [3]:
# Load the dataset into a DataFrame. The file name is relative, so the CSV is
# looked up in the notebook's current working directory.
df = pd.read_csv("food-allergy-analysis-Zenodo.csv")

In [4]:
# Remove pandas' limit on displayed columns so that head() shows every column
# of the data set, including all of the allergy columns.
pd.options.display.max_columns = None
df.head()

Unnamed: 0,SUBJECT_ID,BIRTH_YEAR,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,AGE_START_YEARS,AGE_END_YEARS,SHELLFISH_ALG_START,SHELLFISH_ALG_END,FISH_ALG_START,FISH_ALG_END,MILK_ALG_START,MILK_ALG_END,SOY_ALG_START,SOY_ALG_END,EGG_ALG_START,EGG_ALG_END,WHEAT_ALG_START,WHEAT_ALG_END,PEANUT_ALG_START,PEANUT_ALG_END,SESAME_ALG_START,SESAME_ALG_END,TREENUT_ALG_START,TREENUT_ALG_END,WALNUT_ALG_START,WALNUT_ALG_END,PECAN_ALG_START,PECAN_ALG_END,PISTACH_ALG_START,PISTACH_ALG_END,ALMOND_ALG_START,ALMOND_ALG_END,BRAZIL_ALG_START,BRAZIL_ALG_END,HAZELNUT_ALG_START,HAZELNUT_ALG_END,CASHEW_ALG_START,CASHEW_ALG_END,ATOPIC_DERM_START,ATOPIC_DERM_END,ALLERGIC_RHINITIS_START,ALLERGIC_RHINITIS_END,ASTHMA_START,ASTHMA_END,FIRST_ASTHMARX,LAST_ASTHMARX,NUM_ASTHMARX
0,1,2006,S1 - Female,R1 - Black,E0 - Non-Hispanic,P1 - Medicaid,False,0.093087,3.164956,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2,1994,S1 - Female,R0 - White,E0 - Non-Hispanic,P0 - Non-Medicaid,False,12.232717,18.880219,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12.262834,18.880219,2.0
2,3,2006,S0 - Male,R0 - White,E1 - Hispanic,P0 - Non-Medicaid,True,0.010951,6.726899,,,,,1.002053,1.002053,,,,,,,,,,,,,,,,,,,,,,,,,,,4.884326,,3.917864,6.157426,5.127995,,1.404517,6.157426,4.0
3,4,2004,S0 - Male,R4 - Unknown,E1 - Hispanic,P0 - Non-Medicaid,False,2.398357,9.111567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,5,2006,S1 - Female,R1 - Black,E0 - Non-Hispanic,P0 - Non-Medicaid,False,0.013689,6.193018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [5]:
# Summarise each column's dtype and non-null count in the freshly loaded data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333200 entries, 0 to 333199
Data columns (total 50 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   SUBJECT_ID               333200 non-null  int64  
 1   BIRTH_YEAR               333200 non-null  int64  
 2   GENDER_FACTOR            333200 non-null  object 
 3   RACE_FACTOR              333200 non-null  object 
 4   ETHNICITY_FACTOR         333200 non-null  object 
 5   PAYER_FACTOR             333200 non-null  object 
 6   ATOPIC_MARCH_COHORT      333200 non-null  bool   
 7   AGE_START_YEARS          333200 non-null  float64
 8   AGE_END_YEARS            333200 non-null  float64
 9   SHELLFISH_ALG_START      5246 non-null    float64
 10  SHELLFISH_ALG_END        1051 non-null    float64
 11  FISH_ALG_START           1796 non-null    float64
 12  FISH_ALG_END             527 non-null     float64
 13  MILK_ALG_START           7289 non-null    float64
 14  MILK

## Preparing the data and cleaning it

In [6]:
# Coding the genders in numbers : 0 if female ; 1 if male.
# The replaced Series is assigned back to the column instead of calling
# replace(inplace=True) on df["GENDER_FACTOR"]: that chained in-place form acts
# on a temporary Series and leaves df unchanged under pandas Copy-on-Write.
# The labels are first mapped to digit strings; to_numeric() then converts them
# to integers and raises if an unmapped label is still present.
df["GENDER_FACTOR"] = pd.to_numeric(
    df["GENDER_FACTOR"].replace({"S1 - Female": "0", "S0 - Male": "1"})
)

In [7]:
# Listing the distinct race labels present in the dataframe, so that each one
# can be given a numeric code.
df["RACE_FACTOR"].unique()

array(['R1 - Black', 'R0 - White', 'R4 - Unknown',
       'R2 - Asian or Pacific Islander', 'R3 - Other'], dtype=object)

In [8]:
# Coding the races in numbers : 0 if White ; 1 if Black ; 2 if Asian or Pacific
# Islander ; 3 if Other ; 4 if Unknown.
# The replaced Series is assigned back to the column instead of calling
# replace(inplace=True) on df["RACE_FACTOR"]: that chained in-place form acts on
# a temporary Series and leaves df unchanged under pandas Copy-on-Write.
# to_numeric() converts the digit strings to integers and raises if an
# unmapped label is still present.
# NOTE(review): these codes are category labels, not an ordered scale; a model
# fed the raw integer will treat them as ordered -- consider one-hot encoding.
df["RACE_FACTOR"] = pd.to_numeric(
    df["RACE_FACTOR"].replace(
        {
            "R0 - White": "0",
            "R1 - Black": "1",
            "R2 - Asian or Pacific Islander": "2",
            "R3 - Other": "3",
            "R4 - Unknown": "4",
        }
    )
)

In [9]:
# Coding the ethnicities in numbers : 0 if Non-Hispanic ; 1 if Hispanic.
# The replaced Series is assigned back to the column instead of calling
# replace(inplace=True) on df["ETHNICITY_FACTOR"]: that chained in-place form
# acts on a temporary Series and leaves df unchanged under pandas Copy-on-Write.
# to_numeric() converts the digit strings to integers and raises if an
# unmapped label is still present.
df["ETHNICITY_FACTOR"] = pd.to_numeric(
    df["ETHNICITY_FACTOR"].replace({"E0 - Non-Hispanic": "0", "E1 - Hispanic": "1"})
)

In [10]:
# Coding the payer type in numbers : 0 if Non-Medicaid ; 1 if Medicaid.
# The replaced Series is assigned back to the column instead of calling
# replace(inplace=True) on df["PAYER_FACTOR"]: that chained in-place form acts
# on a temporary Series and leaves df unchanged under pandas Copy-on-Write.
# to_numeric() converts the digit strings to integers and raises if an
# unmapped label is still present.
df["PAYER_FACTOR"] = pd.to_numeric(
    df["PAYER_FACTOR"].replace({"P0 - Non-Medicaid": "0", "P1 - Medicaid": "1"})
)

In [11]:
# Cast the boolean atopic-march flag to an integer column (False -> 0, True -> 1).
df = df.astype({"ATOPIC_MARCH_COHORT": int})

In [12]:
# Remove the SUBJECT_ID column from the dataframe.
df.drop(columns="SUBJECT_ID", inplace=True)

In [13]:
# Remove the birth year column (the author observed that accuracy decreases to
# 50% when it is used as a feature).
df.drop(columns="BIRTH_YEAR", inplace=True)

In [14]:
# Remove the age start/end columns: the goal is to predict whether an allergy
# is present, not the age at which the allergies start or end.
df.drop(columns="AGE_START_YEARS", inplace=True)
df.drop(columns="AGE_END_YEARS", inplace=True)

In [15]:
# Remove the NUM_ASTHMARX count column.
# NOTE(review): the original comment calls this the number of asthma
# "infections"; the RX suffix suggests prescriptions -- confirm against the
# data dictionary.
df.drop(columns="NUM_ASTHMARX", inplace=True)

In [16]:
# Checking the first rows after the encoding and column-dropping steps, to
# confirm the new values of the remaining features.
df.head()

Unnamed: 0,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,SHELLFISH_ALG_START,SHELLFISH_ALG_END,FISH_ALG_START,FISH_ALG_END,MILK_ALG_START,MILK_ALG_END,SOY_ALG_START,SOY_ALG_END,EGG_ALG_START,EGG_ALG_END,WHEAT_ALG_START,WHEAT_ALG_END,PEANUT_ALG_START,PEANUT_ALG_END,SESAME_ALG_START,SESAME_ALG_END,TREENUT_ALG_START,TREENUT_ALG_END,WALNUT_ALG_START,WALNUT_ALG_END,PECAN_ALG_START,PECAN_ALG_END,PISTACH_ALG_START,PISTACH_ALG_END,ALMOND_ALG_START,ALMOND_ALG_END,BRAZIL_ALG_START,BRAZIL_ALG_END,HAZELNUT_ALG_START,HAZELNUT_ALG_END,CASHEW_ALG_START,CASHEW_ALG_END,ATOPIC_DERM_START,ATOPIC_DERM_END,ALLERGIC_RHINITIS_START,ALLERGIC_RHINITIS_END,ASTHMA_START,ASTHMA_END,FIRST_ASTHMARX,LAST_ASTHMARX
0,0,1,0,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12.262834,18.880219
2,1,0,1,0,1,,,,,1.002053,1.002053,,,,,,,,,,,,,,,,,,,,,,,,,,,4.884326,,3.917864,6.157426,5.127995,,1.404517,6.157426
3,1,4,1,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0,1,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [17]:
# Re-check column dtypes and non-null counts after encoding and dropping columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333200 entries, 0 to 333199
Data columns (total 45 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   GENDER_FACTOR            333200 non-null  int64  
 1   RACE_FACTOR              333200 non-null  int64  
 2   ETHNICITY_FACTOR         333200 non-null  int64  
 3   PAYER_FACTOR             333200 non-null  int64  
 4   ATOPIC_MARCH_COHORT      333200 non-null  int32  
 5   SHELLFISH_ALG_START      5246 non-null    float64
 6   SHELLFISH_ALG_END        1051 non-null    float64
 7   FISH_ALG_START           1796 non-null    float64
 8   FISH_ALG_END             527 non-null     float64
 9   MILK_ALG_START           7289 non-null    float64
 10  MILK_ALG_END             4580 non-null    float64
 11  SOY_ALG_START            2419 non-null    float64
 12  SOY_ALG_END              1431 non-null    float64
 13  EGG_ALG_START            6065 non-null    float64
 14  EGG_

In [18]:
# NaN marks the absence of an allergy for an individual, so every missing value
# is filled with 0 for later usage.
df = df.fillna(0)

In [19]:
# Confirm that the missing values now appear as 0.
df.head()

Unnamed: 0,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,SHELLFISH_ALG_START,SHELLFISH_ALG_END,FISH_ALG_START,FISH_ALG_END,MILK_ALG_START,MILK_ALG_END,SOY_ALG_START,SOY_ALG_END,EGG_ALG_START,EGG_ALG_END,WHEAT_ALG_START,WHEAT_ALG_END,PEANUT_ALG_START,PEANUT_ALG_END,SESAME_ALG_START,SESAME_ALG_END,TREENUT_ALG_START,TREENUT_ALG_END,WALNUT_ALG_START,WALNUT_ALG_END,PECAN_ALG_START,PECAN_ALG_END,PISTACH_ALG_START,PISTACH_ALG_END,ALMOND_ALG_START,ALMOND_ALG_END,BRAZIL_ALG_START,BRAZIL_ALG_END,HAZELNUT_ALG_START,HAZELNUT_ALG_END,CASHEW_ALG_START,CASHEW_ALG_END,ATOPIC_DERM_START,ATOPIC_DERM_END,ALLERGIC_RHINITIS_START,ALLERGIC_RHINITIS_END,ASTHMA_START,ASTHMA_END,FIRST_ASTHMARX,LAST_ASTHMARX
0,0,1,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.262834,18.880219
2,1,0,1,0,1,0.0,0.0,0.0,0.0,1.002053,1.002053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.884326,0.0,3.917864,6.157426,5.127995,0.0,1.404517,6.157426
3,1,4,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# z holds every column from position 5 onward (the allergy / condition columns
# that follow the five encoded factor columns), rebuilt from the raw values.
z = pd.DataFrame(df.iloc[:, 5:].to_numpy())
# A row gets Outcome = 1 when any of those values is non-zero (i.e. the
# individual had an entry recorded at some age) and Outcome = 0 otherwise;
# any() yields booleans, which are then cast to int.
df["Outcome"] = z.any(axis=1).astype(int)

In [21]:
# Checking the first ten rows to confirm that the new Outcome column is 1
# exactly where a row has a non-zero value in the columns from position 5 on.
df.head(10)

Unnamed: 0,GENDER_FACTOR,RACE_FACTOR,ETHNICITY_FACTOR,PAYER_FACTOR,ATOPIC_MARCH_COHORT,SHELLFISH_ALG_START,SHELLFISH_ALG_END,FISH_ALG_START,FISH_ALG_END,MILK_ALG_START,MILK_ALG_END,SOY_ALG_START,SOY_ALG_END,EGG_ALG_START,EGG_ALG_END,WHEAT_ALG_START,WHEAT_ALG_END,PEANUT_ALG_START,PEANUT_ALG_END,SESAME_ALG_START,SESAME_ALG_END,TREENUT_ALG_START,TREENUT_ALG_END,WALNUT_ALG_START,WALNUT_ALG_END,PECAN_ALG_START,PECAN_ALG_END,PISTACH_ALG_START,PISTACH_ALG_END,ALMOND_ALG_START,ALMOND_ALG_END,BRAZIL_ALG_START,BRAZIL_ALG_END,HAZELNUT_ALG_START,HAZELNUT_ALG_END,CASHEW_ALG_START,CASHEW_ALG_END,ATOPIC_DERM_START,ATOPIC_DERM_END,ALLERGIC_RHINITIS_START,ALLERGIC_RHINITIS_END,ASTHMA_START,ASTHMA_END,FIRST_ASTHMARX,LAST_ASTHMARX,Outcome
0,0,1,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.262834,18.880219,1
2,1,0,1,0,1,0.0,0.0,0.0,0.0,1.002053,1.002053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.884326,0.0,3.917864,6.157426,5.127995,0.0,1.404517,6.157426,1
3,1,4,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,1,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,0,1,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,1,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.221081,1.221081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.015743,1.015743,0.0,0.0,0.0,0.0,0.0,0.0,1
9,0,1,0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Using the Logistic Regression prediction model

In [22]:
# Features: the first five columns. Target: the last column of the dataframe.
x = df.iloc[:, :5].to_numpy()
y = df.iloc[:, -1].to_numpy()
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
# Create the logistic-regression classifier with its default settings.
sk_model = LogisticRegression()

In [25]:
# Fit the classifier on the training split.
sk_model.fit(x_train, y_train)

LogisticRegression()

In [26]:
# Predict a label for every row of the held-out test split.
y_pred = sk_model.predict(x_test)

In [27]:
# Print and return the percentage of test rows whose predicted label equals
# the true label.
get_accuracy(y_pred, y_test)

Accuracy on testset = 54.25


54.252701080432175