# **Prepare**

## Display setting

In [None]:
# %pip install pandas

In [None]:
import pandas as pd

In [None]:
# Remove the cap on displayed columns so wide frames are shown in full.
pd.set_option('display.max_columns', None)

## Import dataset

Dataset path

In [None]:
# Relative locations of the CSV files used by this notebook.
# train_feature_path / train_label_path are read in the next cell;
# test_path and submission_path are only referenced by the commented-out
# submission cells at the end of the notebook.
train_feature_path = "./dataset/training-set-features.csv"
train_label_path = "./dataset/training-set-labels.csv"
test_path = "./dataset/test-set-features.csv"
submission_path = "./dataset/submission-format.csv"

Import train data

In [None]:
# Load the training features and labels; both frames use the
# 'respondent_id' column as their row index.
x_data = pd.read_csv(train_feature_path, index_col='respondent_id')
y_data = pd.read_csv(train_label_path, index_col='respondent_id')

Create a backup of train data

In [None]:
# Keep independent copies of the freshly loaded frames so they can be
# restored later without re-reading the CSV files.
x_data_backup = x_data.copy()
y_data_backup = y_data.copy()

Restore train data from backup data

In [None]:
# Reset the working frames to the saved copies (copied again so the backups
# themselves stay untouched).
x_data = x_data_backup.copy()
y_data = y_data_backup.copy()

## Dataset info

Some samples in train data

In [None]:
x_data

In [None]:
y_data

Information about train data (column names, data types, number of non-null values)

In [None]:
x_data.info()

In [None]:
y_data.info()

Lists of values of each column

In [None]:
# Print the frequency table of every feature column, each followed by a
# blank line.
for feature_name in x_data.columns:
    print(x_data[feature_name].value_counts(), end="\n\n")

In [None]:
# Print the frequency table of every label column, each followed by a
# blank line.
for label_name in y_data.columns:
    print(y_data[label_name].value_counts(), end="\n\n")

Numbers of missing values in each column

In [None]:
# One row per column (label columns first, then feature columns) holding
# the column name and how many of its entries are missing, computed as
# total rows minus non-null rows.
missValData = pd.DataFrame(
    [
        [column, len(frame) - frame[column].count()]
        for frame in (y_data, x_data)
        for column in frame.columns
    ],
    columns=['column_name', 'missing_value_count'],
)
missValData

Lists of necessary variables

In [None]:
# Map every column name (features first, then labels) to the sorted list of
# distinct non-null values observed in that column.
cat_values_list = {}
for frame in (x_data, y_data):
    cat_values_list.update(
        {
            name: list(frame[name].value_counts().index.sort_values())
            for name in frame.columns
        }
    )

# Shallow snapshot of the lookup, used later to reset cat_values_list.
CAT_VALUES_LIST = cat_values_list.copy()

# **Proposed system**

## Data wrangler

### "Cat2Num" converter

In [None]:
from sklearn.preprocessing import FunctionTransformer

In [None]:
def convertToNum(data: pd.DataFrame):
    """Encode every column of ``data`` as integer category codes.

    Each value is replaced by its position in the list stored under the
    column's name in the module-level ``cat_values_list``. Values that do
    not appear in that list (missing values included) are left unchanged.

    The conversion is done in a single pass per column through a
    value -> code lookup, so a freshly written code can never be mistaken
    for a raw value that is still waiting to be converted.

    Side effect: for every processed column, ``cat_values_list[col]`` is
    overwritten with ``[0, 1, ..., n - 1]`` so that later steps see the
    codes instead of the original labels. The lookup therefore has to be
    restored before this function is applied to raw data again.

    Args:
        data: Frame whose column names are all keys of ``cat_values_list``.

    Returns:
        A new DataFrame with the encoded columns; ``data`` itself is not
        modified.

    Raises:
        KeyError: If a column of ``data`` has no entry in
            ``cat_values_list``.
    """
    # Work on a copy: in-place edits through ``data[col]`` touch the
    # caller's frame and are not reliable under pandas Copy-on-Write.
    converted = data.copy()

    for col in data.columns:
        values_list = cat_values_list.get(col)
        if values_list is None:
            raise KeyError(f"no category list registered for column {col!r}")

        codes = {value: code for code, value in enumerate(values_list)}
        converted[col] = data[col].map(
            lambda value, codes=codes: codes.get(value, value)
        )

        # Downstream steps read the category list of each column from this
        # same lookup, so it must now describe the codes.
        cat_values_list[col] = list(range(len(values_list)))

    return converted

# Wrap convertToNum so it can be used as a scikit-learn pipeline step.
Cat2NumConverter = FunctionTransformer(convertToNum)

### Missing values imputer

In [None]:
from sklearn.impute import KNNImputer
import numpy as np

In [None]:
def missing_values_imputer(data: pd.DataFrame):
    """Fill the NaN entries of ``data`` with a 1-nearest-neighbour imputer.

    A new ``KNNImputer`` is created and fitted on ``data`` on every call, so
    the neighbours are always taken from the frame being transformed, not
    from an earlier call.

    Args:
        data: Numeric frame in which missing entries are ``np.nan``.

    Returns:
        A new DataFrame holding the imputed values, with the same column
        labels and the same row index as ``data``.
    """
    imputer = KNNImputer(missing_values=np.nan, n_neighbors=1)
    imputed_values = imputer.fit_transform(data)

    # Carry the row index over as well as the column labels so the rows stay
    # identifiable after imputation.
    # NOTE(review): the column labels only line up if the imputer keeps
    # every column of ``data`` - confirm no column can be entirely NaN.
    return pd.DataFrame(imputed_values, columns=data.columns, index=data.index)

# Wrap the imputer function so it can be used as a pipeline step.
Imputer = FunctionTransformer(missing_values_imputer)

### Values standardizer

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
def standardizer(data: pd.DataFrame):
    """One-hot encode every column of ``data``.

    Despite its name, this step does not rescale anything. It expands each
    column into indicator columns, using the category list stored under the
    column's name in the module-level ``cat_values_list``. Columns with
    exactly two categories are reduced to one indicator
    (``drop='if_binary'``).

    A new encoder is created and fitted on ``data`` on every call; because
    the categories are passed explicitly, the output layout depends only on
    ``cat_values_list``.

    Args:
        data: Frame whose columns all have an entry in ``cat_values_list``.

    Returns:
        A dense DataFrame of indicator columns with default integer column
        labels and the same row index as ``data``.
    """
    categories_list = [cat_values_list.get(col) for col in data.columns]

    # The dense-output switch was renamed from ``sparse`` to
    # ``sparse_output``; try the current spelling first and fall back to the
    # old one so both old and new scikit-learn releases are supported.
    try:
        encoder = OneHotEncoder(
            categories=categories_list, sparse_output=False, drop='if_binary'
        )
    except TypeError:
        encoder = OneHotEncoder(
            categories=categories_list, sparse=False, drop='if_binary'
        )

    return pd.DataFrame(encoder.fit_transform(data), index=data.index)

# Wrap the encoder function so it can be used as a pipeline step.
Standardizer = FunctionTransformer(standardizer)

### Completed component

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Preprocessing chain, applied in this order: category-to-code conversion,
# missing-value imputation, then one-hot encoding.
DataWrangler = Pipeline(
    [
        ('converter', Cat2NumConverter),
        ('imputer', Imputer),
        ('standardizer', Standardizer),
    ]
)

In [None]:
# DataWrangler.fit_transform(x_data, y_data)

## Features selector

## Predictor

In [None]:
# from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
# One probability-enabled SVC per target column, wrapped in a multi-output
# classifier.
estimators = MultiOutputClassifier(SVC(probability=True))

## Completed system

In [None]:
# Full system: the preprocessing chain followed by the multi-output
# classifier.
completed_pl = Pipeline(
    [
        ("preprocessor", DataWrangler),
        ("estimator", estimators),
    ]
)

Split dataset into train, test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Fixed seed so the split below is reproducible.
RANDOM_SEED = 6

# Hold out 30% of the rows for evaluation; the rows are shuffled and the
# split is stratified on the label frame.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=RANDOM_SEED,
    shuffle=True,
    stratify=y_data,
)

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
# Running the 'converter' step rewrites entries of cat_values_list as a
# side effect of convertToNum.
completed_pl.fit(x_train, y_train)

In [None]:
# convertToNum replaced the per-column value lists with integer codes while
# fitting; restore the original lists from the snapshot so the raw values of
# the next frame can be converted again.
cat_values_list = CAT_VALUES_LIST.copy()

In [None]:
# Class probabilities for the held-out split. The next cells index the
# result as preds[0] and preds[1], i.e. one probability array per target.
preds = completed_pl.predict_proba(x_test)

In [None]:
preds

In [None]:
# Collect column 1 of each target's probability array into a frame aligned
# with the held-out labels.
# NOTE(review): assumes preds[0] / preds[1] follow the column order
# h1n1_vaccine, seasonal_vaccine of the label frame and that column 1 is the
# positive class - confirm against the fitted estimators' classes_.
y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": preds[0][:, 1],
        "seasonal_vaccine": preds[1][:, 1],
    },
    index = y_test.index
)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC of the predicted probabilities against the held-out labels,
# computed over both target columns at once.
roc_auc_score(y_test, y_preds)

In [None]:
completed_pl.get_params()


(Function) Convert non-numeric values into numeric ones

In [None]:
# def convertToNum(data: pd.DataFrame, colList: list):
#     for col in colList:
#         value_list = data[col].value_counts().index.sort_values(ascending=True)
#         for i in range(len(value_list)):
#             data[col].replace(to_replace=value_list[i], value=i, inplace=True)

In [None]:
# convertTable = {}
# for col in x_data.select_dtypes('object').columns:
#     convertTable.update({col: x_data[col].value_counts().index.sort_values(ascending=True)}) 

In [None]:
# convertToNum(train_data, nnum)

Visualize result

In [None]:
# y_preds = pd.DataFrame(
#     {
#         "h1n1_vaccine": preds[0][:, 1],
#         "seasonal_vaccine": preds[1][:, 1],
#     },
#     index = y_eval.index
# )
# print("y_preds.shape:", y_preds.shape)
# y_preds.head()

In [None]:
# import matplotlib.pyplot as plt

In [None]:
# def plot_roc(y_true, y_score, label_name, ax):
#     fpr, tpr, thresholds = roc_curve(y_true, y_score)
#     ax.plot(fpr, tpr)
#     ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
#     ax.set_ylabel('TPR')
#     ax.set_xlabel('FPR')
#     ax.set_title(
#         f"{label_name}: AUC = {roc_auc_score(y_true, y_score):.4f}"
#     )

In [None]:
# fig, ax = plt.subplots(1, 2, figsize=(7, 3.5))

# plot_roc(
#     y_eval['h1n1_vaccine'], 
#     y_preds['h1n1_vaccine'], 
#     'h1n1_vaccine',
#     ax=ax[0]
# )
# plot_roc(
#     y_eval['seasonal_vaccine'], 
#     y_preds['seasonal_vaccine'], 
#     'seasonal_vaccine',
#     ax=ax[1]
# )
# fig.tight_layout()

In [None]:
# roc_auc_score(y_eval, y_preds)

Make submission

In [None]:
# test_features_df = pd.read_csv(test_path, index_col="respondent_id")

In [None]:
# preprocessing(test_features_df)
# test_probas = full_pipeline.predict_proba(test_features_df[h1n1_corel])

In [None]:
# submission_df = pd.read_csv(submission_path, index_col="respondent_id")

In [None]:
# np.testing.assert_array_equal(test_features_df.index.values, 
#                               submission_df.index.values)

In [None]:
# submission_df["h1n1_vaccine"] = test_probas[0][:, 1]
# submission_df["seasonal_vaccine"] = test_probas[1][:, 1]

In [None]:
# submission_df.to_csv('my_submission.csv', index=True)