In [None]:
# Turn off Jedi-based tab completion in IPython's completer.
%config Completer.use_jedi = False

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### EDA

Features, statistics and correlations.

In [None]:
# Location of the heart-failure clinical records CSV inside the Kaggle input directory.
filepath ="../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
# Load the whole file into a DataFrame; later cells add engineered columns to this same frame.
heart_features = pd.read_csv(filepath)
# Preview the first three rows.
heart_features.head(3)

In [None]:
# Summary statistics for the columns of the frame.
heart_features.describe()

In [None]:
# Column dtypes and non-null counts, to check for missing values.
heart_features.info()

In [None]:
# Plot the `time` column against the row index.
heart_features['time'].plot()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
# Use matplotlib's 'ggplot' style sheet as the default look for the figures drawn below.
plt.style.use('ggplot')

from scipy import stats
from scipy.stats import norm
# Columns fed to the correlation matrix, in the order they appear on the heatmap axes.
numeric_columns = [
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time',
    'DEATH_EVENT',
]

In [None]:
# Pairwise correlation matrix of the selected columns (target included).
corr_data = heart_features[numeric_columns].corr()

# Annotated heatmap: each cell shows the coefficient with three decimals.
plt.figure(figsize=(20, 12))
sns.heatmap(
    corr_data,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    square=True,
)
plt.show()

### Feature Engineering

This is the most important part: here we apply PCA and create the *vulnerability* and *stress* features. With these additions, the model improves from 74% to 81%.

In [None]:
# Engineered feature: `age` divided by `time`, computed row by row.
heart_features['vulnerability'] = heart_features.age / heart_features.time


In [None]:
# Engineered feature: sum of `smoking`, `serum_creatinine` and `high_blood_pressure`,
# divided by `time`.
stress_numerator = (
    heart_features.smoking
    + heart_features.serum_creatinine
    + heart_features.high_blood_pressure
)
heart_features['stress'] = stress_numerator / heart_features.time


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Standardize every column, then keep the first two principal components.
pca_steps = [
    ('scaler', StandardScaler()),
    ('reducer', PCA(n_components=2)),
]
my_pipe = Pipeline(pca_steps)

# The target column is dropped so the components are built from the features only.
# NOTE(review): the pipeline is fitted on all rows, i.e. before the train/test split
# performed later in the notebook — confirm this is acceptable for the reported scores.
hf_red = my_pipe.fit_transform(heart_features.drop(columns=['DEATH_EVENT']))

In [None]:
# Give the PCA components explicit string names. A bare pd.DataFrame(hf_red) would
# label them 0 and 1, producing a frame whose column names mix integers and strings,
# which recent scikit-learn versions refuse to fit on.
pca_columns = [f'pca_{i}' for i in range(hf_red.shape[1])]

# Reuse the source index so the side-by-side concat aligns row for row.
hf_pca = pd.DataFrame(hf_red, columns=pca_columns, index=heart_features.index)
hf_full = pd.concat([hf_pca, heart_features], axis=1)

### Modelling

In [None]:
# Feature matrix: every column except the target.
X = hf_full.drop(['DEATH_EVENT'], axis=1)
# Target as a 1-D Series (single brackets). A one-column DataFrame makes scikit-learn
# estimators emit a column-vector warning when fitting.
y = hf_full["DEATH_EVENT"]

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle and hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    shuffle=True,
    test_size=0.33,
)

In [None]:
# Count how many training rows carry each label and print one (label, count) row per class.
unique, counts = np.unique(y_train, return_counts=True)
label_count_table = np.asarray((unique, counts)).T
print(label_count_table)

In [None]:
# Ratio between the count of the first (smallest) label and the count of the second one
# in the training set; computed on the data as-is, without oversampling.
prop=counts[0]/counts[1]
prop

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
# Baseline: histogram-based gradient boosting with default hyper-parameters.
clf = HistGradientBoostingClassifier()
clf.fit(X_train, y_train)

# Score of the baseline on the held-out rows (displayed as the cell output).
clf.score(X_test, y_test)

### XGBoost


In [None]:
from xgboost import XGBClassifier
import xgboost as xgb
# Wrap each split in XGBoost's native DMatrix container (features plus labels).
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:

# Booster parameters for the native `xgb.train` API.
# Changes versus the earlier version of this cell:
#   * `scale_pos_weight` now uses the class ratio `prop` computed from this training
#     set instead of a hard-coded 174.85 that did not come from this data;
#   * `verbose_eval` is passed to `xgb.train` (it is a training argument, not a
#     booster parameter);
#   * `silent` is replaced by `verbosity`;
#   * `n_estimators` (the number of rounds is `num_boost_round` here) and
#     `sample_type` (a DART-only option) were dropped because they have no effect.
xgb_params = {
    'eta': 0.1,                    # learning rate
    'colsample_bytree': 1,         # author's note: 1 gave the best ROC among the values tried
    'min_child_weight': 1,         # 1 is the default
    'max_delta_step': 1,           # author's note: 1 worked best among the values tried in [1, 10]
    'max_depth': 20,               # author's note: deeper trees kept improving the model in earlier trials
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'scale_pos_weight': float(prop),  # class ratio measured on y_train
    'num_parallel_tree': 2,        # author's note: 2 gave fewer false positives than 1
    'gamma': 10,
    'alpha': 20,
    'lambda': 50,                  # strong L2 regularization against overfitting
    'verbosity': 0,                # keep the library quiet
    'tree_method': 'hist',
    'grow_policy': 'depthwise',    # default policy
    'max_bin': 200,                # default is 256; more bins give finer splits but cost more compute
}

# Both splits are passed as evaluation sets; no early stopping is used, so the test
# metrics are only tracked and do not influence training.
trained_model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=100,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    verbose_eval=False,
)

In [None]:
# Raw model outputs for the test rows: continuous scores, not 0/1 labels
# (the booster was trained with the 'binary:logistic' objective).
prediction = trained_model.predict(dtest)

In [None]:
from xgboost import plot_importance, plot_tree

# Plot the feature importances of the trained booster, limited to the top 12 features.
plot_importance(trained_model, max_num_features=12)

### Threshold selection

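Find the decision threshold that maximizes the G-mean, i.e. the geometric mean of the true positive rate and the true negative rate, along the ROC curve.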

In [None]:
from sklearn.metrics import roc_curve
from matplotlib import pyplot

# ROC curve of the continuous model scores against the true test labels.
fpr, tpr, thresholds = roc_curve(y_test, prediction)

# G-mean = sqrt(TPR * (1 - FPR)) for every candidate threshold; keep the index of the largest.
gmeans = np.sqrt(tpr * (1 - fpr))
ix = gmeans.argmax()
best_threshold = thresholds[ix]
best_gmean = gmeans[ix]
print('Best Threshold=%f, G-Mean=%.3f' % (best_threshold, best_gmean))

# Draw the chance diagonal, the model's ROC curve, and mark the selected operating point.
pyplot.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
pyplot.plot(fpr, tpr, marker='.', label='XGBoost')
pyplot.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')

pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

In [None]:
def threshold(predictions,th):
    """Binarize continuous prediction scores at a cut-off.

    Moving the cut-off makes the resulting classifier more lenient or more
    cautious.

    Parameters
    ----------
    predictions : array-like of float
        One-dimensional continuous scores (not booleans). Any sequence that
        NumPy can convert is accepted, including a pandas Series whatever its
        index.
    th : float
        Cut-off value. Scores less than or equal to ``th`` map to 0, all
        other scores map to 1.

    Returns
    -------
    numpy.ndarray
        Float array of 0.0 / 1.0 values with the same length as ``predictions``.
    """
    scores = np.asarray(predictions)
    # Testing `<= th` (rather than `> th`) keeps the original loop's behaviour for
    # NaN scores: the comparison is False for NaN, so they fall into the 1 branch.
    return np.where(scores <= th, 0.0, 1.0)
# Turn the continuous test-set scores into 0/1 labels using the threshold selected
# from the ROC curve (index `ix` of `thresholds`).
predictions = threshold(prediction,thresholds[ix])

### Metrics

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC measures how well the scores rank positives above negatives, so it is
# computed on the continuous model outputs (`prediction`), not on the thresholded
# 0/1 labels (`predictions`); with hard labels the curve collapses to a single
# operating point.
# NOTE(review): the figures previously noted for this cell (0.719, and 0.805 with the
# 2 PCA components and engineered features) appear to have been obtained with the
# thresholded labels, so they are not comparable with the value printed here.
auc = roc_auc_score(y_test, prediction)
print("Auc en el test : ", auc)

In [None]:
from sklearn.metrics import accuracy_score
# Share of test rows whose thresholded label equals the true label.
acc = accuracy_score(y_test, predictions)
# Author's recorded results: 0.727; 0.808 with the 2 PCA components and the engineered features.
print("Accuracy on the test: ", acc)

In [None]:
from sklearn.metrics import confusion_matrix 
# Confusion matrix of the true test labels against the thresholded predictions.
confusion_matrix(y_test, predictions)