**PCA**

In [1]:
import pandas as pd

df = pd.read_csv('/content/sample_data/heart.csv')
display(df.head())

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [2]:
print("Shape of the DataFrame:", df.shape)

print("\nData Types of each column:\n", df.dtypes)

print("\nMissing Values:\n", df.isnull().sum())

print("\nDescriptive statistics for numerical features:\n", df.describe())

Shape of the DataFrame: (918, 12)

Data Types of each column:
 Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

Missing Values:
 Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

Descriptive statistics for numerical features:
               Age   RestingBP  Cholesterol   FastingBS       MaxHR  \
count  918.000000  918.000000   918.000000  918.000000  918.000000   
mean    53.510893  132.396514   198.799564    0.233115  136.809368   
std      9.432617   18.514154   109.384145    0.423046   25.460334   
min     28.

In [3]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd
import numpy as np

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

for col in categorical_cols:
    if col == 'HeartDisease':
        if df['HeartDisease'].dtype == 'object':
            df['HeartDisease'] = LabelEncoder().fit_transform(df['HeartDisease'])
        continue
    if col in ['Sex', 'ExerciseAngina']:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
    else:
        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_features = ohe.fit_transform(df[[col]])
        encoded_df = pd.DataFrame(encoded_features, columns=ohe.get_feature_names_out([col]))
        df = pd.concat([df, encoded_df], axis=1).drop(col, axis=1)

display(df.head())
print(df.dtypes)

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,0,160,180,0,156,0,1.0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,1,130,283,0,98,0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,48,0,138,214,0,108,1,1.5,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,54,1,150,195,0,122,0,0.0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


Age                    int64
Sex                    int64
RestingBP              int64
Cholesterol            int64
FastingBS              int64
MaxHR                  int64
ExerciseAngina         int64
Oldpeak              float64
HeartDisease           int64
ChestPainType_ASY    float64
ChestPainType_ATA    float64
ChestPainType_NAP    float64
ChestPainType_TA     float64
RestingECG_LVH       float64
RestingECG_Normal    float64
RestingECG_ST        float64
ST_Slope_Down        float64
ST_Slope_Flat        float64
ST_Slope_Up          float64
dtype: object


In [4]:
from sklearn.model_selection import train_test_split

X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

svm_model = svm.SVC()
lr_model = LogisticRegression()
rf_model = RandomForestClassifier()

svm_model.fit(X_train_scaled, y_train)
lr_model.fit(X_train_scaled, y_train)
rf_model.fit(X_train_scaled, y_train)

trained_models = {
    'svm': svm_model,
    'logistic_regression': lr_model,
    'random_forest': rf_model
}

In [7]:
from sklearn.metrics import accuracy_score

accuracy_scores = {}

for model_name, model in trained_models.items():
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores[model_name] = accuracy
    print(f"Accuracy of {model_name}: {accuracy}")

Accuracy of svm: 0.8967391304347826
Accuracy of logistic_regression: 0.8858695652173914
Accuracy of random_forest: 0.8695652173913043


In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)

pca.fit(X_train_scaled)

X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Number of components selected by PCA: {pca.n_components_}")
print(f"Explained variance ratio: {sum(pca.explained_variance_ratio_)}")

Number of components selected by PCA: 14
Explained variance ratio: 0.9769812049153127


In [9]:
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

svm_model_pca = svm.SVC()
lr_model_pca = LogisticRegression()
rf_model_pca = RandomForestClassifier()

svm_model_pca.fit(X_train_pca, y_train)
lr_model_pca.fit(X_train_pca, y_train)
rf_model_pca.fit(X_train_pca, y_train)

trained_models_pca = {
    'svm_pca': svm_model_pca,
    'logistic_regression_pca': lr_model_pca,
    'random_forest_pca': rf_model_pca
}

In [10]:
from sklearn.metrics import accuracy_score

accuracy_scores_pca = {}
for model_name, model in trained_models_pca.items():
    y_pred_pca = model.predict(X_test_pca)
    accuracy_pca = accuracy_score(y_test, y_pred_pca)
    accuracy_scores_pca[model_name] = accuracy_pca
    print(f"Accuracy of {model_name}: {accuracy_pca}")

print("\nAccuracy Comparison (Before vs. After PCA):")
for model_name in trained_models_pca:
    original_model_name = model_name.replace("_pca", "")
    if original_model_name in accuracy_scores:
      accuracy_diff = accuracy_scores_pca[model_name] - accuracy_scores[original_model_name]
      print(f"{model_name}:")
      print(f"  Accuracy before PCA: {accuracy_scores[original_model_name]}")
      print(f"  Accuracy after PCA: {accuracy_scores_pca[model_name]}")
      print(f"  Difference: {accuracy_diff}")
    else:
      print(f"Warning: Could not find original model '{original_model_name}' in accuracy_scores dictionary.")


Accuracy of svm_pca: 0.8913043478260869
Accuracy of logistic_regression_pca: 0.8858695652173914
Accuracy of random_forest_pca: 0.8913043478260869

Accuracy Comparison (Before vs. After PCA):
svm_pca:
  Accuracy before PCA: 0.8967391304347826
  Accuracy after PCA: 0.8913043478260869
  Difference: -0.005434782608695676
logistic_regression_pca:
  Accuracy before PCA: 0.8858695652173914
  Accuracy after PCA: 0.8858695652173914
  Difference: 0.0
random_forest_pca:
  Accuracy before PCA: 0.8695652173913043
  Accuracy after PCA: 0.8913043478260869
  Difference: 0.021739130434782594


In [11]:
pca_components = pca.components_

original_features = X_train.columns

pca_component_df = pd.DataFrame(pca_components, columns=original_features)

display(pca_component_df)

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,-0.243408,-0.148614,-0.124472,0.114883,-0.156725,0.314372,-0.343272,-0.285119,-0.339271,0.297388,0.11709,0.02416,-0.033613,0.130874,-0.128051,-0.11746,-0.355813,0.417806
1,-0.239518,0.099566,-0.163713,-0.137874,-0.079674,-0.11886,0.100004,-0.044997,0.174459,0.042202,-0.158308,-0.170021,-0.503079,0.637329,-0.279686,-0.090258,0.125663,-0.082344
2,-0.094558,-0.195402,0.076076,0.523515,-0.354407,0.209982,0.142886,0.327661,0.098743,0.039686,-0.184159,0.04674,0.310516,0.095787,-0.433288,0.09975,0.083427,-0.133707
3,0.051103,-0.10468,0.104341,0.007204,-0.040545,-0.014542,-0.057144,-0.0531,-0.363932,-0.212782,0.590518,0.100209,0.037819,0.126255,-0.19471,-0.385628,0.417006,-0.230628
4,0.143574,0.119593,-0.02649,-0.190461,0.255371,0.012217,-0.068143,0.242024,-0.212634,-0.13308,0.229953,0.280433,0.016086,0.251348,-0.327638,0.60188,-0.266924,-0.028125
5,0.342868,-0.307687,0.558824,0.139523,0.0472,-0.165589,0.027191,0.090607,-0.229999,0.373237,-0.167872,0.167171,-0.346602,0.162381,0.150065,0.007871,-0.001717,-0.00216
6,-0.130947,0.133708,-0.066067,-0.135934,0.211004,0.10755,-0.194474,-0.120281,-0.01596,0.078686,-0.446462,0.703053,0.116562,-0.014272,-0.100431,-0.209711,0.224904,-0.123528
7,-0.357078,-0.029191,-0.172404,0.17707,-0.176786,0.250082,0.113815,0.300906,-0.118627,-0.169504,0.133742,0.309698,-0.418057,-0.059217,0.496958,0.141902,0.031034,-0.101613
8,0.029557,0.555127,0.452103,0.096114,-0.351453,-0.083499,0.052829,-0.061029,0.114984,-0.326823,0.044362,0.227581,-0.044657,0.060485,-0.029672,-0.161694,-0.214216,0.296574
9,-0.129867,0.63655,0.040593,0.043541,0.044795,0.074418,0.173201,0.145254,-0.357066,0.536647,0.038034,-0.204211,0.080513,-0.080348,0.017944,0.029224,0.138436,-0.1544
