In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold

import warnings

In [2]:
# Suppress every warning for the rest of the session so the rendered outputs stay clean.
# NOTE(review): this also hides deprecation/numerical warnings raised by the libraries
# used below — consider narrowing the filter to a specific category.
warnings.filterwarnings('ignore')

In [3]:
import os

# The project helpers live in ../functions and are imported by bare module name,
# so the working directory is switched there for the duration of the imports.
_notebook_dir = os.getcwd()
os.chdir("../functions")
try:
    from feature import (
        binary_features,
        features_chi2,
        features_equal_mean,
        features_equal_var_mean,
        features_high_corr,
    )
    from plot import labs, plot_ecdf, plot_proportion_target
finally:
    # Always go back to the directory the notebook started in, even if an import
    # fails; otherwise every relative path used later (e.g. ../data/...) would
    # silently resolve from ../functions.
    os.chdir(_notebook_dir)

In [4]:
# Print the help text of the plotting helper; the output below shows only its
# signature. NOTE(review): development-time introspection — consider removing it
# from the final notebook.
help(plot_proportion_target)

Help on function plot_proportion_target in module plot:

plot_proportion_target(data, target, col, ax, kwargs_labs={})



In [5]:
# Print the help text of binary_features; the output below shows only its signature.
# NOTE(review): development-time introspection — consider removing it from the
# final notebook.
help(binary_features)

Help on function binary_features in module feature:

binary_features(data)



In [6]:
# Load the preprocessed dataset; `data` is the frame every later cell works on.
data = pd.read_csv('../data/preprocessed_data/Kaggle_Sirio_Libanes_ICU_Prediction.csv')

In [7]:
# Show the feature names returned by the project helper features_equal_var_mean.
# NOTE(review): presumably features whose mean and variance do not differ between
# the ICU groups — confirm against functions/feature.py.
features_equal_var_mean(data)

['ALBUMIN_DIFF',
 'BE_ARTERIAL_DIFF',
 'BE_VENOUS_DIFF',
 'BIC_ARTERIAL_DIFF',
 'BIC_VENOUS_DIFF',
 'BILLIRUBIN_DIFF',
 'BLAST_DIFF',
 'CALCIUM_DIFF',
 'CREATININ_DIFF',
 'FFA_DIFF',
 'GGT_DIFF',
 'GLUCOSE_DIFF',
 'HEMATOCRITE_DIFF',
 'HEMOGLOBIN_DIFF',
 'INR_DIFF',
 'LACTATE_DIFF',
 'LEUKOCYTES_DIFF',
 'LINFOCITOS_DIFF',
 'NEUTROPHILES_DIFF',
 'P02_ARTERIAL_DIFF',
 'P02_VENOUS_MEDIAN',
 'P02_VENOUS_MEAN',
 'P02_VENOUS_MIN',
 'P02_VENOUS_MAX',
 'P02_VENOUS_DIFF',
 'PC02_ARTERIAL_DIFF',
 'PC02_VENOUS_DIFF',
 'PCR_DIFF',
 'PH_ARTERIAL_MEDIAN',
 'PH_ARTERIAL_MEAN',
 'PH_ARTERIAL_MIN',
 'PH_ARTERIAL_MAX',
 'PH_ARTERIAL_DIFF',
 'PH_VENOUS_MEDIAN',
 'PH_VENOUS_MEAN',
 'PH_VENOUS_MIN',
 'PH_VENOUS_MAX',
 'PH_VENOUS_DIFF',
 'PLATELETS_DIFF',
 'POTASSIUM_DIFF',
 'SAT02_ARTERIAL_DIFF',
 'SAT02_VENOUS_MEDIAN',
 'SAT02_VENOUS_MEAN',
 'SAT02_VENOUS_MIN',
 'SAT02_VENOUS_MAX',
 'SAT02_VENOUS_DIFF',
 'SODIUM_DIFF',
 'TGO_DIFF',
 'TGP_DIFF',
 'TTPA_DIFF',
 'UREA_DIFF',
 'DIMER_DIFF']

## Análise das features binárias

In [None]:
sns.set_style('darkgrid')

# Binary features to plot; the target itself is excluded before detection.
binary_cols = list(binary_features(data.drop('ICU', axis=1)))

# Size the grid from the number of features instead of hard-coding 6x2, so the
# cell keeps working when the set of binary columns changes.
n_cols = 2
n_rows = max(1, (len(binary_cols) + n_cols - 1) // n_cols)
# The height scales with the row count; 6 rows gives the original 30x50 figure.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(30, 50 * n_rows / 6), squeeze=False)
sns.set_palette(['blue', 'red'])

# Fill the panels row by row, one feature per panel.
panels = ax.ravel()
for panel, feature in zip(panels, binary_cols):
    plot_proportion_target(data, 'ICU', feature, panel, kwargs_labs={'title': f'{feature}'})

# Remove whatever panels are left over (e.g. the last one for an odd count).
for unused_panel in panels[len(binary_cols):]:
    fig.delaxes(unused_panel)
plt.show()

In [None]:
# Run the project helper features_chi2 with alpha=0.05 and display its result.
# NOTE(review): presumably a chi-square test of each categorical feature against
# the target — confirm against functions/feature.py.
features_chi2(data, alpha=0.05)

In [None]:
# Same helper call as the previous cell, with a stricter alpha of 0.025, to compare
# the two results.
features_chi2(data, alpha=0.025)

## Análise de correlação entre as variáveis

In [None]:
# Pairwise correlation matrix of the float64 columns.
coor = data.select_dtypes('float64').corr()

# Boolean mask covering the upper triangle (diagonal included); seaborn leaves the
# masked cells blank, so each pair is drawn only once.
mask = np.triu(np.ones(coor.shape, dtype=bool))

# Diverging colour map for the correlation values.
cmap = sns.diverging_palette(240, 10, as_cmap=True)

# Draw the heat map on a white background.
with sns.axes_style("white"):
    fig, ax = plt.subplots(figsize=(50, 20))
    ax = sns.heatmap(
        coor,
        mask=mask,
        cmap=cmap,
        square=True,
        linewidths=.5,
        ax=ax,
    )

# Render the figure.
plt.show()

In [None]:
# Collect the features flagged by the project helper features_high_corr; the result
# is used below as the list of columns to drop.
# NOTE(review): the correlation threshold is whatever the helper defaults to —
# confirm against functions/feature.py.
feat_high_corr = features_high_corr(data)

In [None]:
# Show which features were flagged as highly correlated.
print(feat_high_corr)

In [None]:
# Remove the highly correlated features from the working frame.
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError.
data = data.drop(columns=feat_high_corr, errors='ignore')

In [None]:
# Display RESPIRATORY_RATE_MEAN after the drop above; this lookup raises KeyError if
# the column was among the removed features.
data['RESPIRATORY_RATE_MEAN']

## Análise da Variância das features

In [None]:
# Variance of each remaining feature, sorted ascending so constant (zero-variance)
# features appear first. Binary features and the patient identifier are left out.
# numeric_only=True makes the computation independent of the pandas version:
# pandas >= 2.0 raises on non-numeric columns instead of skipping them.
var = (
    data
    .drop(columns=binary_features(data))
    .drop(columns='PATIENT_VISIT_IDENTIFIER')
    .var(numeric_only=True)
    .sort_values()
)
var.head()

In [None]:
# Bar chart of the per-feature variances computed above.
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(20, 10))
# Draw explicitly on the axes that was just created.
var.plot.bar(ax=ax)
labs(
    ax,
    title='Variância das features',
    subtitle='VARIÂNCIA CALCULADA PARA CADA FEATURE',
    xlabel='Features',
    ylabel='Variância',
)
plt.show()

In [None]:
# Persist the names of the zero-variance features, one per line.
with open('../features_drop_txt/var_equal_0.txt', 'w') as f:
    f.writelines(feature + '\n' for feature in var[var == 0].index)

In [None]:
# Remove the zero-variance features from the working frame.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: `var` still lists the columns after they are dropped, so a second
# in-place drop would raise KeyError.
data = data.drop(columns=var[var == 0].index, errors='ignore')

In [None]:
sns.set_style('darkgrid')

# Float columns to plot: every float64 column except those with exactly two
# distinct values (the same filter the original loop applied).
continuous_cols = [
    column for column in data.select_dtypes('float64')
    if len(data[column].unique()) != 2
]

# Split by target once instead of re-querying the frame four times per column.
icu_0 = data.query('ICU == 0')
icu_1 = data.query('ICU == 1')

# One row per feature, sized from the data rather than hard-coded to 46 rows;
# 46 rows gives the original 20x150 figure.
n_rows = max(1, len(continuous_cols))
fig, ax = plt.subplots(n_rows, 2, figsize=(20, 150 * n_rows / 46), squeeze=False)

for j, column in enumerate(continuous_cols):
    # Shared x-range for both panels of a row, padded by 0.01 on each side, so the
    # two ECDFs are directly comparable.
    xlim = [np.minimum(icu_0[column].min(), icu_1[column].min()) - 0.01,
            np.maximum(icu_0[column].max(), icu_1[column].max()) + 0.01]
    plot_ecdf(data, column, 0, 'blue', ax[j, 0], xlim=xlim)
    plot_ecdf(data, column, 1, 'red', ax[j, 1], xlim=xlim)
fig.suptitle(y=0.89, t='ECDFs DAS FEATURES CONTÍNUAS', fontsize=25)
plt.show()

In [None]:
# Collect the features returned by the project helper features_equal_mean, called
# with write=False, and display them.
# NOTE(review): presumably features whose mean does not differ significantly
# between the ICU groups — confirm against functions/feature.py.
eq_mean = features_equal_mean(data, write=False)
eq_mean

In [None]:
# Preview the columns that would remain after dropping the eq_mean features;
# `data` itself is not modified.
data.drop(eq_mean, axis=1).columns

In [None]:
# NOTE(review): this cell repeats the previous cell's expression verbatim —
# candidate for removal.
data.drop(eq_mean, axis=1).columns

In [None]:
# Display RESPIRATORY_RATE_MEAN from the current working frame.
data['RESPIRATORY_RATE_MEAN']

In [None]:
# Re-run the helper with alpha=0.1, overwriting the earlier eq_mean.
# NOTE(review): `write` is left at its default here, unlike the earlier call with
# write=False — check whether this call writes a file as a side effect.
eq_mean = features_equal_mean(data, alpha=0.1)
eq_mean

In [None]:
# Preview the columns that would remain after dropping the alpha=0.1 selection;
# `data` itself is not modified.
data.drop(eq_mean, axis=1).columns

In [None]:
from statsmodels.stats.weightstats import CompareMeans, DescrStatsW

In [None]:
# Descriptive-statistics wrappers for RESPIRATORY_RATE_MEAN, one per ICU value
# (group 1: ICU == 0, group 2: ICU == 1).
group_1_test, group_2_test = (
    DescrStatsW(data.loc[data['ICU'] == icu_value, 'RESPIRATORY_RATE_MEAN'])
    for icu_value in (0, 1)
)

# Two-sample comparison of the group means.
test = CompareMeans(group_1_test, group_2_test)

# p-value of the z-test whose null hypothesis is that the two means are equal
# (second element of the result).
p_value = test.ztest_ind()[1]

In [None]:
# Display the full z-test result; its second element is the p-value stored above.
test.ztest_ind()

In [None]:
# Display the p-value extracted from the z-test.
p_value

In [None]:
from scipy.stats import levene

In [None]:
# Levene test for equal variances of RESPIRATORY_RATE_MEAN between the ICU == 0 and
# ICU == 1 groups.
levene(data.query('ICU == 0')['RESPIRATORY_RATE_MEAN'], data.query('ICU == 1')['RESPIRATORY_RATE_MEAN'])

In [None]:
# Same Levene test as the previous cell, applied to PC02_ARTERIAL_MEDIAN.
# NOTE(review): copy of the previous cell with a different column — a small helper
# taking the column name would avoid the duplication.
levene(data.query('ICU == 0')['PC02_ARTERIAL_MEDIAN'], data.query('ICU == 1')['PC02_ARTERIAL_MEDIAN'])

In [None]:
from statsmodels.stats.oneway import equivalence_oneway

In [None]:
# One-way equivalence test of RESPIRATORY_RATE_MEAN between the two ICU groups,
# with use_var='equal'.
# NOTE(review): the second positional argument (0) appears to be the equivalence
# margin; a zero margin looks degenerate for an equivalence test — confirm the
# intended value against the statsmodels documentation.
equivalence_oneway([data.query('ICU == 0')['RESPIRATORY_RATE_MEAN'], data.query('ICU == 1')['RESPIRATORY_RATE_MEAN']],0, use_var='equal')