<a href="https://colab.research.google.com/github/SergioManuelJob/FluShot-Learning/blob/main/FluShot_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DrivenData Challenge Done for Study Purposes**
DrivenData. (2020). Flu Shot Learning: Predict H1N1 and Seasonal Flu Vaccines. Retrieved 11/16/2023 from https://www.drivendata.org/competitions/66/flu-shot-learning

In [1]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the training features and take a first look at them
features_path = "/content/drive/MyDrive/Retos-De-Datos/FluShot-Learn/training_set_features.csv"
data = pd.read_csv(features_path)
# Dimensions of the raw feature table as (rows, columns)
print(data.shape)
data.head(10)

In [None]:
# The author judged the fields employment_occupation, employment_industry and
# hhs_geo_region to be filled in incorrectly and of no use, so they are removed.
unused_columns = ['employment_occupation', 'employment_industry', 'hhs_geo_region']
data = data.drop(columns=unused_columns)
# The column count printed here confirms that the 3 columns are gone
print(data.shape)

In [None]:
# Show the first 10 rows of the current feature table
data.iloc[:10]

In [None]:
# Load the target labels of the training set and preview the first rows
labels_path = '/content/drive/MyDrive/Retos-De-Datos/FluShot-Learn/training_set_labels.csv'
data_targets = pd.read_csv(labels_path)
data_targets.head()

In [None]:
# Extract the rows that have at least one NaN value.
# isna() is used instead of an element-wise `x is np.nan` identity test: identity
# only matches the np.nan singleton object, so NaNs held in numeric (float)
# columns are not reliably detected and the number of affected rows is under-counted.
null_rows = data[data.isna().any(axis=1)]
# (rows with at least one NaN, number of columns).
# NOTE(review): a previous run of this cell, based on the identity test, reported
# 4827 of 26707 rows; expect a different (likely larger) count with isna().
print(null_rows.shape)
# In the original inspection most NaN values were in education, income_poverty,
# marital_status, rent_or_own and employment_status, with a few in other columns.
# The boolean mask below shows where the NaNs sit in the first 30 affected rows.
null_rows.isnull().head(30)

Strategy 1: Discard

In [62]:
# Strategy: respondents (rows) with a lot of NaN values are removed from the dataset.
# A row qualifies for removal when it has at least `num_of_nuls` missing values.
num_of_nuls = 5
nan_count_per_row = data.isnull().sum(axis=1)
too_sparse = nan_count_per_row >= num_of_nuls
indexes_of_nuls = data.loc[too_sparse].index
# Number of rows with 5 or more NaNs; the recorded run found 1367 such rows to delete
len(indexes_of_nuls)


1367

In [68]:
# Remove all rows selected in `indexes_of_nuls` with a single vectorized call.
# Dropping one label per loop iteration builds a new DataFrame every time, which
# is needlessly slow for more than a thousand labels; the result is the same.
# NOTE(review): `data_targets` is not filtered in this cell; if labels are later
# paired with features by position they will no longer line up - confirm that
# they are matched on a key such as respondent_id.
data = data.drop(index=indexes_of_nuls)

In [73]:
rows_left, columns_left = data.shape
# 26707 - 1367 = 25340 rows are left. The NaN values that remain still have to be filled.
print((rows_left, columns_left))

(25340, 33)


In [None]:
# Replace the remaining NaN values by forward fill, which propagates the last
# valid observation in each column down to the next valid one.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated in
# recent pandas releases; the filled values are identical.
# Caveats: a NaN in the first row(s) of a column has no earlier value to copy and
# stays NaN, and each filled value is taken from the preceding respondent's row.
data = data.ffill()
# With that the data should be mostly cleaned
data.head(16)

Strategy 2: Fill in the missing values without deleting any rows.

In [None]:
# We just fill the NaN values without taking care of rows with a lot of nulls
data.fillna(method='ffill', inplace = True)
data.head(35)

Transformation of Data.

In [97]:
# Study the categorical fields first, to decide which transformation each one needs.
# Every entry pairs the label that is printed with the column it describes; the
# trailing comment records the number of distinct values seen in the recorded run.
categorical_fields = [
    ('Races: ', 'race'),                     # 4 races evaluated
    ('Education: ', 'education'),            # 4 educational situations
    ('Sex: ', 'sex'),                        # binary value
    ('Poverty: ', 'income_poverty'),         # 3 poverty situations
    ('Marital: ', 'marital_status'),         # binary marital status
    ('Rent or Own: ', 'rent_or_own'),        # binary: renting or owning the house
    ('Employment: ', 'employment_status'),   # 3 employment statuses
    ('Census_MSA: ', 'census_msa'),          # 3 values for the census
]

for label, column in categorical_fields:
    distinct_values = data[column].unique()
    print(label + str(len(distinct_values)))

Races: 4
Education: 4
Sex: 2
Poverty: 3
Marital: 2
Rent or Own: 2
Employment: 3
Census_MSA: 3


In [98]:
# Transform all the two-valued columns to 0/1 with sklearn's LabelBinarizer
from sklearn import preprocessing

lb = preprocessing.LabelBinarizer()

# LabelBinarizer assigns the codes by sorted class order: the class that sorts
# first becomes 0 and the other becomes 1 (inspect `lb.classes_` after a fit).
# Expected encodings, assuming the usual label spellings - TODO confirm:
#   sex:            Female -> 0, Male -> 1
#   marital_status: Married -> 0, Not Married -> 1
#                   (this cell's comment previously stated the reverse)
#   rent_or_own:    Own -> 0, Rent -> 1
binary_columns = ['sex', 'marital_status', 'rent_or_own']
for column in binary_columns:
    # fit_transform returns an (n_samples, 1) array for a two-class column;
    # flatten it so that a plain 1-D vector is assigned to the column.
    data[column] = lb.fit_transform(data[column]).ravel()

data.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,< 12 Years,White,0,Below Poverty,1,0,Not in Labor Force,Non-MSA,0.0,0.0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,12 Years,White,1,Below Poverty,1,1,Employed,"MSA, Not Principle City",0.0,0.0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,College Graduate,White,1,"<= $75,000, Above Poverty",1,0,Employed,"MSA, Not Principle City",2.0,0.0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,12 Years,White,0,Below Poverty,1,1,Not in Labor Force,"MSA, Principle City",0.0,0.0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Some College,White,0,"<= $75,000, Above Poverty",0,0,Employed,"MSA, Not Principle City",1.0,0.0
