## Feature engineering to eliminate multicollinearity and to choose features based on target 

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

import numpy as np

import matplotlib as plt

In [None]:
# Load the cleaned flu-shot CSV, addressed relative to the notebook's location.
data_path = '../data/Flu_Shot_Data_cleaned_2.csv'
df = pd.read_csv(data_path)

In [None]:
# Print a structural summary of the freshly loaded frame.
df.info()

In [None]:
# Rank feature pairs by Spearman correlation to spot multicollinearity.
# Numeric/bool columns are selected explicitly: since pandas 2.0,
# DataFrame.corr defaults to numeric_only=False and raises on non-numeric
# columns instead of silently ignoring them.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr(method='spearman')

# Keep only the strict upper triangle so every pair shows up exactly once and
# the diagonal (self-correlation) is excluded. This replaces the earlier
# drop_duplicates()/s[1:10] approach, which would also discard distinct pairs
# that happen to share the same correlation value.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
s = (
    corr_matrix.where(upper_mask)
    .unstack()
    .dropna()
    .sort_values(kind="quicksort", ascending=False)
)

# Nine most positively correlated pairs.
print(s.head(9))

- This notebook deals with highly correlated features 
- We will apply two strategies:
    - binary features will be combined into categories (4 combinations)
    - non-binary features will be dropped according to the target variable

In [None]:
# TODO: consider applying a chi-squared test of independence to the binary variables

### Creating a new column based on recommendation (2 binary features)
- applying a function that tells whether both, neither, or only one of the two vaccines has been recommended

In [None]:
def get_reco(row):
    """Combine the two doctor-recommendation flags of ``row`` into one code.

    Parameters
    ----------
    row : mapping-like
        Must provide 'doctor_recc_seasonal' and 'doctor_recc_h1n1'.

    Returns
    -------
    int or None
        0 = neither vaccine recommended, 1 = both recommended,
        2 = only seasonal recommended, 3 = only H1N1 recommended,
        None when either flag is something other than 0 or 1 (e.g. NaN).
    """
    seasonal = row['doctor_recc_seasonal']

    if seasonal == 0:
        h1n1 = row['doctor_recc_h1n1']
        if h1n1 == 0:
            return 0  # none of the vaccines has been recommended
        if h1n1 == 1:
            return 3  # only H1N1 vaccine has been recommended
        return None

    if seasonal == 1:
        h1n1 = row['doctor_recc_h1n1']
        if h1n1 == 1:
            return 1  # both vaccines have been recommended
        if h1n1 == 0:
            return 2  # only seasonal vaccine has been recommended
        return None

    # seasonal flag is neither 0 nor 1
    return None

In [None]:
# Encode every row with get_reco, then store the codes as a categorical column.
reco_codes = df.apply(get_reco, axis=1)
df['reco_vaccines'] = reco_codes.astype('category')

In [None]:
# Frequency of each combined recommendation code.
df['reco_vaccines'].value_counts()

In [None]:
# Show the value distribution of the two behavioural flags that get combined next.
for flag_column in ('behavioral_large_gatherings', 'behavioral_outside_home'):
    print(df[flag_column].value_counts())

### Creating a new column based on 2 behavioural features

- next, we will create a function that combines behaviour at large gatherings and outside of home
- NOTE: we need to make an assumption that '0' refers to not having done it and '1' refers to having done it 

In [None]:
def get_behaviour(row):
    """Combine the two behavioural flags of ``row`` into one code.

    Parameters
    ----------
    row : mapping-like
        Must provide 'behavioral_large_gatherings' and
        'behavioral_outside_home'.

    Returns
    -------
    int or None
        0 = neither behaviour applies, 1 = both apply,
        2 = only the large-gatherings flag is set,
        3 = only the outside-home flag is set,
        None when either flag is something other than 0 or 1 (e.g. NaN).
    """
    gatherings = row['behavioral_large_gatherings']

    if gatherings == 0:
        outside = row['behavioral_outside_home']
        if outside == 0:
            return 0  # none of the behaviours applies
        if outside == 1:
            return 3  # only reduced contact with people outside of household applies
        return None

    if gatherings == 1:
        outside = row['behavioral_outside_home']
        if outside == 1:
            return 1  # both behaviours apply
        if outside == 0:
            return 2  # only reducing time at large gatherings applies
        return None

    # gatherings flag is neither 0 nor 1
    return None

In [None]:
# Encode every row with get_behaviour, then store the codes as a categorical column.
behaviour_codes = df.apply(get_behaviour, axis=1)
df['behaviour_gather_home'] = behaviour_codes.astype('category')

In [None]:
# Frequency of each combined behaviour code.
df.behaviour_gather_home.value_counts()

- we will now drop the columns that are no longer necessary 

In [None]:
# The four binary source columns that fed the two combined categorical features.
removal = [
    'doctor_recc_seasonal',
    'doctor_recc_h1n1',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
]

# Remove them from df in place.
df.drop(columns=removal, inplace=True)

### Removal of feature depending on target

- run the following cell only if H1N1 is the single target variable 
- it removes the features that are related to the seasonal flu/vaccine

In [None]:
# Opinion features that refer to the seasonal flu/vaccine.
list_seas = [
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
]

# Remove them from df in place.
df.drop(columns=list_seas, inplace=True)

- run the following cell only if the seasonal vaccine is the single target variable (do not run it together with the H1N1-target cell above)
- it removes the features that are related to the H1N1 flu/vaccine

In [None]:
# Target modelled in this run. The PCA section of this notebook uses
# 'h1n1_vaccine' as y, so that is the default; set it to 'seasonal_vaccine'
# to model the seasonal vaccine instead.
target_column = 'h1n1_vaccine'

# Opinion features that refer to the H1N1 flu/vaccine.
list_h1n1 = ['opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc']

# Only drop the H1N1 opinion features when the seasonal vaccine is the target.
# Without this guard a top-to-bottom run removes them even though H1N1 is the
# target, discarding the features most directly related to it.
if target_column == 'seasonal_vaccine':
    df.drop(list_h1n1, axis=1, inplace=True)

## Applying principal component analysis in order to reduce features

In [None]:
# Feature matrix: exclude BOTH vaccine columns in a single drop.
# Two successive `X = df.drop(...)` assignments would let the second one
# overwrite the first, leaving 'h1n1_vaccine' (the target below) inside X
# and leaking the label into the features.
X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

# Target: H1N1 vaccination status.
y = df['h1n1_vaccine']


In [None]:
# Hold out 30% of the rows as a test set; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# One-hot encode the training features, omitting the first level of each encoded column.
X_train_encoded = pd.get_dummies(data=X_train, drop_first=True)

In [None]:
# (rows, columns) of the training features after one-hot encoding.
X_train_encoded.shape

In [None]:
# PCA cannot be fitted on data that contains missing values
# instead of dropping them, we will impute missing values with the mode

In [None]:
# Turn the literal string 'nan' into a real NaN throughout df, in place.
# NOTE(review): this runs after X_train / X_train_encoded were already derived
# from df (split and get_dummies cells above), so it does not appear to reach
# the data that is imputed below -- confirm, and consider moving this
# replacement ahead of the train/test split.
df.replace({'nan': np.nan}, inplace=True)

In [None]:
# Fill missing entries of the encoded training features with each column's
# most frequent value.
imp_mode = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
X_train_encoded_filled = imp_mode.fit(X_train_encoded).transform(X_train_encoded)
X_train_encoded_filled

In [None]:
# Wrap the imputer output in a DataFrame again (default integer column labels)
# and print its summary.
df_encoded = pd.DataFrame(X_train_encoded_filled)
df_encoded.info()

In [None]:
# Fit a 5-component PCA on the imputed, encoded training features.
# random_state is pinned (same seed as the train/test split) so the fit is
# reproducible when scikit-learn auto-selects a randomized SVD solver.
# NOTE(review): the features are not standardized before PCA in this
# notebook section -- confirm whether scaling was intended.
pca = PCA(n_components=5, random_state=42)

# NOTE(review): per the scikit-learn docs, fit() returns the estimator itself,
# so df_encoded_pca is presumably the fitted PCA object rather than
# transformed data -- confirm.
df_encoded_pca = pca.fit(df_encoded)

In [None]:
# Project the training features onto the fitted principal components.
# NOTE(review): the name says "scaled", but no scaling step is visible in this
# part of the notebook -- confirm.
X_train_scaled_trans = pca.transform(df_encoded)


In [None]:
# Share of the total variance captured by each fitted component.
print(pca.explained_variance_ratio_)

In [None]:
# Wrap the PCA projection in a DataFrame and preview its first five rows.
X_train_scaled_trans = pd.DataFrame(X_train_scaled_trans)
X_train_scaled_trans.head()