In [1]:
import pandas as pd

In [3]:
df = pd.read_csv("/content/drive/MyDrive/heart_disease_health_indicators/heart.csv")

In [4]:
# Mostrar los datos de nuestro dataset
display(df.head(10))
print(20*"-","COLUMNS",20*"-",'\n',df.columns.tolist(),"\n\n",20*"-","INFO",20*"-")
display(df.info())
print(f"------------\nShape of dataframe: {df.shape}")

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


-------------------- COLUMNS -------------------- 
 ['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output'] 

 -------------------- INFO --------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  303 non-null    int64  
 8   exng      303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  output    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3

None

------------
Shape of dataframe: (303, 14)


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

target='output'
X = df.drop(target, axis = 1)
y = df[target]

n = len(X)
valid = (X[:int(0.3*n)+1],y[:int(0.3*n)+1]) # saparamos nuestros datos para validación

#separamos nuestros datos para entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X[:int(0.7*n)], y[:int(0.7*n)], test_size = 0.3, random_state = 42)

In [13]:
# tratamiento de datos faltantes (si hay)
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train),columns=X.columns)
X_test = pd.DataFrame(imputer.transform(X_test),columns=X.columns)

In [14]:
# Devuelve las variables binarias
binary_vars = [col for col in df.columns if df[col].nunique() == 2]

# Devuelve las variables categoricas
categorical_vars = [col for col in df.columns if (df[col].nunique() < 11 and col not in binary_vars)]

# devuelve las columnas que no estan escaladas (excluyendo las binarias y categoricas)
cols2scale = [col for col in df.columns if col not in binary_vars + categorical_vars]

# Escalado de caracteristicas
scaler = RobustScaler()
X_train[cols2scale] = scaler.fit_transform(X_train[cols2scale])
X_test[cols2scale] = scaler.transform(X_test[cols2scale])

In [16]:
#usando RF obtenemos las mejores caracteristicas, aun usando todas
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rfc.feature_importances_})

top_features = importance_df.sort_values(by='Importance', ascending=False).head(14)
print(top_features)

     Feature  Importance
0    oldpeak    0.163341
2         cp    0.132794
4        caa    0.125067
1      thall    0.106240
3   thalachh    0.099884
5        age    0.075551
6       chol    0.069762
7       exng    0.063102
8     trtbps    0.055448
10       slp    0.043694
9        sex    0.033436
11   restecg    0.020542
12       fbs    0.011139


In [18]:
from sklearn.feature_selection import mutual_info_classif

mi_scores = mutual_info_classif(X, y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)

print(mi_scores)

thall       0.172654
caa         0.138622
cp          0.119642
exng        0.101545
oldpeak     0.094359
chol        0.079485
thalachh    0.070124
sex         0.058485
slp         0.048567
age         0.007548
trtbps      0.000000
fbs         0.000000
restecg     0.000000
Name: MI Scores, dtype: float64
