# This notebook is derived from the fantastic notebook by Andrada
https://www.kaggle.com/andradaolteanu/wids-datathon-rapids-fancy-impute-xgboost

Please upvote that notebook if you like this!

### Libraries 📚

In [None]:
# CPU Libraries
import os
import random
import warnings
import pandas as pd
import numpy as np

import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from fancyimpute import KNN, IterativeImputer

# Seed Python's and NumPy's global random generators with the same value.
seed = 123
for seed_rng in (random.seed, np.random.seed):
    seed_rng(seed)

# Suppress deprecation, user and future warnings (registered in this order).
for noisy_category in (DeprecationWarning, UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=noisy_category)

# Pastel colour scheme (hex codes) for the notebook; rendered as a swatch strip.
icecream = [
    "#CCD4BF",
    "#E7CBA9",
    "#EEBAB2",
    "#F5F3E7",
    "#F5E2E4",
]
icecream_palette = sns.color_palette(icecream)
sns.palplot(icecream_palette)

# 2. Import Data

* `TrainingWiDS2021.csv` - train data.
* `UnlabeledWiDS2021.csv` - test data; the variable to predict is `diabetes_mellitus` (on encounters).
* `SolutionTemplateWiDS2021.csv` - a list of all the rows (and encounters) that should be in your submissions.
* `DataDictionaryWiDS2021.csv` - columns descriptors.

> There are 181 columns in total, with 160/181 containing some sort of missing information. Moreover, 74 of these 160 columns have data missing in more than 50% of cases. Hence, we'll drop these, as attempting any sort of imputation might introduce a large bias into the data.

In [None]:
# Read in data
# Both competition CSVs live in the same input directory; naming that directory
# once avoids repeating the prefix and makes it easy to repoint.
data_dir = "../input/widsdatathon2021"
trainfile = f"{data_dir}/TrainingWiDS2021.csv"
testfile = f"{data_dir}/UnlabeledWiDS2021.csv"

In [None]:
# Load the unlabeled (test) data and keep its encounter ids as a NumPy array
# so they can be attached to the predictions later.
test = pd.read_csv(testfile)
encounter_id_test = test['encounter_id'].to_numpy()

# 3. Feature Engineering and Selection using featurewiz library

> There are 180 columns in the data set - let's try selecting the best features!

In [None]:
# Name of the column to predict.
target = "diabetes_mellitus"

# Now install featurewiz to see which features to select

In [None]:
!pip install featurewiz

In [None]:
!pip install xlrd

In [None]:
from featurewiz import featurewiz

In [None]:
# Run featurewiz directly on the train/test CSV paths. The result is indexed
# further down as (train frame, test frame).
# feature_engg=['target'] requests target-encoding-based feature engineering.
# NOTE(review): corr_limit=0.70 is presumably the correlation threshold used
# during selection, and category_encoders='' presumably disables extra
# category encoders - confirm against the featurewiz documentation.
output = featurewiz(
    dataname=trainfile,
    target=target,
    corr_limit=0.70,
    verbose=2,
    sep=',',
    header=0,
    test_data=testfile,
    feature_engg=['target'],
    category_encoders='',
)

# featurewiz engineered 170 additional features using target encoding and, out of the 325 total features, selected 78 in less than 10 minutes

In [None]:
## The featurewiz output is a tuple with two parts:
# the first is the new train dataset and the second is the new test dataset.
len(output)

# 4. Let us now use Auto_ViML on the new train and test datasets with the 78 selected features

In [None]:
!pip install autoviml

In [None]:
from autoviml.Auto_ViML import Auto_ViML

## Let us take the features selected by featurewiz and feed them to Auto_ViML



In [None]:
# Name of the identifier column; it is used to pull the ids out of the test
# data and as the id column of the submission frame.
idcol = "encounter_id"

In [None]:
# Encounter ids of the test rows, kept aside to label the predictions later.
encounter_id_test = test[idcol].to_numpy()

In [None]:
# Split the featurewiz result: element 0 is the train frame, element 1 the test frame.
traindf = output[0]
testdf = output[1]
print(traindf.shape, testdf.shape)

In [None]:
# Display the name of the target column as a quick check before modelling.
target

In [None]:
### we will turn off feature_reduction since we have already done that!
m, features, trainm, testm = Auto_ViML(traindf, target, testdf,
                            sample_submission='',
                            scoring_parameter='', KMeans_Featurizer=False,
                            hyper_param='RS',feature_reduction=False,
                             Boosting_Flag='CatBoost', Binning_Flag=False,
                            Add_Poly=0, Stacking_Flag=False,Imbalanced_Flag=False,
                            verbose=2)

# Auto_ViML took just 3 minutes to build a model with an AUC of 0.87.

# Let's make a submission

In [None]:
# Peek at the first row of the test frame returned by Auto_ViML to see which
# prediction columns it contains.
testm.head(1)

In [None]:
# Add encounter_id and diabetes_mellitus back to the dataframes
subm = pd.DataFrame([], columns = [idcol,target])
subm[target] = testm[target+'_proba_1'].values
subm[idcol] = encounter_id_test
subm.head()

In [None]:
# Persist the submission (without the index column) to the /kaggle/working directory.
submission_path = '/kaggle/working/submission.csv'
subm.to_csv(submission_path, index=False)

# Download the submission file and submit it!

## Obtained a score of 0.8512 and rank 194 on Kaggle WiDS 2021 - not bad for automated ML!