# Load csv file

In [1]:
import pandas as pd

# Load the CSV file
heart_data = pd.read_csv("heart.csv")

# Display the few rows of the dataset
heart_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


# Generate new features for existing ones

In [3]:
# Example new features:
# 1. Age buckets: Categorize age into different age groups
# 2. Cholesterol to age ratio

# Create age buckets
heart_data['age_bucket'] = pd.cut(heart_data['age'], bins=[29, 40, 50, 60, 70, 80], labels=['30-40', '40-50', '50-60', '60-70', '70-80'])

# Cholesterol to age ratio
heart_data['chol_age_ratio'] = heart_data['chol'] / heart_data['age']

# Display the new features
heart_data[['age', 'age_bucket', 'chol', 'chol_age_ratio']].head()

Unnamed: 0,age,age_bucket,chol,chol_age_ratio
0,52,50-60,212,4.076923
1,53,50-60,203,3.830189
2,70,60-70,174,2.485714
3,61,60-70,203,3.327869
4,62,60-70,294,4.741935


# Apply PCA

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Select relevant columns for PCA (excluding non-numeric and target columns)
features = heart_data.drop(columns=['age_bucket', 'target'])

# Standardize the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Apply PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_features)

# Create a DataFrame with principal components
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

# Display explained variance ratio to understand the importance of the components
explained_variance = pca.explained_variance_ratio_
explained_variance


array([0.20369041, 0.12974445])

# Optimize feature sets

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Prepare the dataset for Random Forest
X = heart_data.drop(columns=['age_bucket', 'target'])
y = heart_data['target']

# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Get feature importances
feature_importances = rf_model.feature_importances_

# Create a DataFrame for feature importances
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Display the feature importances
print(feature_importance_df)


           Feature  Importance
2               cp    0.122262
11              ca    0.118346
9          oldpeak    0.113846
12            thal    0.108594
7          thalach    0.100788
0              age    0.082732
3         trestbps    0.066425
13  chol_age_ratio    0.065550
4             chol    0.064767
8            exang    0.052301
10           slope    0.041426
1              sex    0.036890
6          restecg    0.017958
5              fbs    0.008114
