# 🤖 Modeling

### Authors:
| Name                          | Github user                                        |
|-------------------------------|----------------------------------------------------|
| Sergio Herreros Fernández     | [@SergioHerreros](https://github.com/SERGI0HERREROS)|
| Francisco Javier Luna Ortiz   | [@Lunao01](https://github.com/Lunao01)|
| Carlos Romero Navarro         | [@KarManiatic](https://github.com/KarManiatic)|
| Tatsiana Shelepen             | [@Naschkatzee](https://github.com/Naschkatzee) | 

<br>

## 1. Data

In [7]:
# Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
warnings.filterwarnings("ignore")

# Data
training_set_features_df = pd.read_csv('gold/training_set_features_df.csv') # training set features

training_set_labels_df = pd.read_csv('data/training_set_labels.csv') # training set labels

test_set_features_df = pd.read_csv('data/test_set_features.csv') # test set features

<br>

## 2. Approach

### 2.1. RandomForestClassifier

Modelling.

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer

## RandomForestClassifier
rf_classifier_h1n1_vaccine = RandomForestClassifier(random_state = 0, n_estimators = 100,
                                    criterion = 'entropy')
rf_classifier_h1n1_vaccine.fit(training_set_features_df.iloc[:, 1:], training_set_labels_df['h1n1_vaccine'])

In [9]:
object_type_columns = test_set_features_df.select_dtypes(include='object').columns

for col in object_type_columns:
    unique_values = test_set_features_df[col].unique()
    
    # Create a mapping dictionary
    mapping = {value: idx for idx, value in enumerate(unique_values)}
    mapping[np.nan] = -1 

    print(mapping) # Print the mapping dictionary

    # Apply the mapping
    test_set_features_df[col] = test_set_features_df[col].map(mapping)

    # Replace -1 values back to NaN 
    test_set_features_df[col] = test_set_features_df[col].replace(-1, np.nan)


imp_knn = KNNImputer(n_neighbors=70,weights="distance")
imputed_data = imp_knn.fit_transform(test_set_features_df)
test_set_features_df = pd.DataFrame(imputed_data, columns=test_set_features_df.columns)
test_set_features_df.head(70)


# Se eliminan las columnas con muchos valores nulos
test_set_features_df.drop(columns=['employment_occupation', 'employment_industry', 'health_insurance'], inplace=True)

{'35 - 44 Years': 0, '18 - 34 Years': 1, '55 - 64 Years': 2, '65+ Years': 3, '45 - 54 Years': 4, nan: -1}
{'College Graduate': 0, '12 Years': 1, 'Some College': 2, '< 12 Years': 3, nan: -1}
{'Hispanic': 0, 'White': 1, 'Black': 2, 'Other or Multiple': 3, nan: -1}
{'Female': 0, 'Male': 1, nan: -1}
{'> $75,000': 0, 'Below Poverty': 1, '<= $75,000, Above Poverty': 2, nan: -1}
{'Not Married': 0, 'Married': 1, nan: -1}
{'Rent': 0, 'Own': 1, nan: -1}
{'Employed': 0, 'Not in Labor Force': 1, 'Unemployed': 2, nan: -1}
{'mlyzmhmf': 0, 'bhuqouqj': 1, 'lrircsnp': 2, 'lzgpxyit': 3, 'fpwskwrf': 4, 'oxchjgsf': 5, 'dqpwygqj': 6, 'qufhixun': 7, 'kbazzjca': 8, 'atmpeygn': 9, nan: -1}
{'MSA, Not Principle  City': 0, 'Non-MSA': 1, 'MSA, Principle City': 2, nan: -1}
{'atmlpfrs': 0, 'nduyfdeo': 1, nan: -1, 'fcxhlnwr': 3, 'pxcmvdjn': 4, 'arjwrbjb': 5, 'mfikgejo': 6, 'rucpziij': 7, 'wxleyezf': 8, 'haxffmxo': 9, 'ldnlellj': 10, 'vjjrobsf': 11, 'cfqqtusy': 12, 'xicduogh': 13, 'dotnnunm': 14, 'xqicxuve': 15, 'wl