In [None]:
pip install pandas==1.2.5




**HEART DISEASE**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the heart-disease table from the mounted Drive folder.
csv_path = "/content/drive/MyDrive/internshipProject/heart_disease/Heart Disease/dataset.csv"
df = pd.read_csv(csv_path)

# Short summary of what was loaded: size, column names and a preview.
print("Dataset Loaded")
print(f" Dataset size: {df.shape[0]} rows × {df.shape[1]} columns")
print("\n Column Names:", list(df.columns))
print("\n First 5 rows:")
print(df.head())

# The "target" column is the label; every other column is a predictor.
y = df["target"]
X = df.drop(columns=["target"])

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Baseline classifier: a 100-tree random forest trained on the training split.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model = model.fit(X_train, y_train)

# Score the held-out rows and report plain accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n Model Accuracy: {accuracy:.4f}")


Dataset Loaded
 Dataset size: 1190 rows × 12 columns

 Column Names: ['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol', 'fasting blood sugar', 'resting ecg', 'max heart rate', 'exercise angina', 'oldpeak', 'ST slope', 'target']

 First 5 rows:
   age  sex  chest pain type  resting bp s  cholesterol  fasting blood sugar  \
0   40    1                2           140          289                    0   
1   49    0                3           160          180                    0   
2   37    1                2           130          283                    0   
3   48    0                4           138          214                    0   
4   54    1                3           150          195                    0   

   resting ecg  max heart rate  exercise angina  oldpeak  ST slope  target  
0            0             172                0      0.0         1       0  
1            0             156                0      1.0         2       1  
2            1              98

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/internshipProject/heart_disease/Heart Disease/dataset.csv")

# Dataset details
print(" Dataset Loaded")
print(f" Dataset size: {df.shape[0]} rows × {df.shape[1]} columns")
print("\n Column Names:", list(df.columns))
print("\n First 5 rows:")
print(df.head())

# Features and Target
X = df.drop(columns=["target"])
y = df["target"]

# Stratified 80/20 hold-out split, seeded so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# XGBoost model with hand-picked hyperparameters.
# `use_label_encoder` is deliberately not passed: the installed XGBoost reports it as
# an unused parameter, so it only added a warning to every fit.
model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    gamma=0.1,
    random_state=42,
    eval_metric="logloss"
)

# Train the model
model.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\n Model Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1
print("\n Classification Report:")
print(classification_report(y_test, y_pred))


 Dataset Loaded
 Dataset size: 1190 rows × 12 columns

 Column Names: ['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol', 'fasting blood sugar', 'resting ecg', 'max heart rate', 'exercise angina', 'oldpeak', 'ST slope', 'target']

 First 5 rows:
   age  sex  chest pain type  resting bp s  cholesterol  fasting blood sugar  \
0   40    1                2           140          289                    0   
1   49    0                3           160          180                    0   
2   37    1                2           130          283                    0   
3   48    0                4           138          214                    0   
4   54    1                3           150          195                    0   

   resting ecg  max heart rate  exercise angina  oldpeak  ST slope  target  
0            0             172                0      0.0         1       0  
1            0             156                0      1.0         2       1  
2            1              9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 Model Accuracy: 0.9244

 Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       112
           1       0.94      0.92      0.93       126

    accuracy                           0.92       238
   macro avg       0.92      0.92      0.92       238
weighted avg       0.92      0.92      0.92       238



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/internshipProject/heart_disease/Heart Disease/dataset.csv")

print("Dataset Loaded")
print(f"Dataset size: {df.shape[0]} rows × {df.shape[1]} columns")
print("\nColumn Names:", list(df.columns))
print("\nFirst 5 rows:")
print(df.head())

# Features & Target
X = df.drop(columns=["target"])
y = df["target"]

# Split BEFORE scaling so the test rows never influence the scaler's statistics.
# Same stratified/seeded split arguments as before, applied to the unscaled frame.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features for models like SVC & Logistic Regression.
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Base models whose predictions feed the meta-learner
estimators = [
    ('lr', LogisticRegression(max_iter=200)),
    ('svc', SVC(probability=True)),
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42))
]

# Stacking Classifier: logistic-regression meta-learner; passthrough=True also hands
# the scaled input features to the meta-learner alongside the base predictions.
stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    passthrough=True
)

# Train
stack_model.fit(X_train, y_train)

# Predictions
y_pred = stack_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Dataset Loaded
Dataset size: 1190 rows × 12 columns

Column Names: ['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol', 'fasting blood sugar', 'resting ecg', 'max heart rate', 'exercise angina', 'oldpeak', 'ST slope', 'target']

First 5 rows:
   age  sex  chest pain type  resting bp s  cholesterol  fasting blood sugar  \
0   40    1                2           140          289                    0   
1   49    0                3           160          180                    0   
2   37    1                2           130          283                    0   
3   48    0                4           138          214                    0   
4   54    1                3           150          195                    0   

   resting ecg  max heart rate  exercise angina  oldpeak  ST slope  target  
0            0             172                0      0.0         1       0  
1            0             156                0      1.0         2       1  
2            1              98   

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import numpy as np

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/internshipProject/heart_disease/Heart Disease/dataset.csv")

print("Dataset Loaded")
print(f"Dataset size: {df.shape[0]} rows × {df.shape[1]} columns")
print("\nColumn Names:", list(df.columns))
print("\nFirst 5 rows:")
print(df.head())

# Features & Target
X = df.drop(columns=["target"])
y = df["target"]

# Split first so the held-out rows stay unseen by every fitted preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: statistics come from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# XGBoost Model.
# `use_label_encoder` is not passed: the installed XGBoost reports it as unused,
# so it only generated a warning on each fit of the search.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)

# Candidate values sampled by the randomized search
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_lambda': [1, 1.5, 2]
}

# RandomizedSearchCV: 50 sampled configurations, 5-fold CV on the training split,
# selected by accuracy, run on all available cores.
random_search = RandomizedSearchCV(
    xgb, param_distributions=param_dist,
    n_iter=50, scoring='accuracy', cv=5,
    verbose=1, random_state=42, n_jobs=-1
)

# Train with tuning
random_search.fit(X_train, y_train)

# Best model found by the search
best_xgb = random_search.best_estimator_
print("\nBest Parameters:", random_search.best_params_)

# Predictions on the held-out rows
y_pred = best_xgb.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Dataset Loaded
Dataset size: 1190 rows × 12 columns

Column Names: ['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol', 'fasting blood sugar', 'resting ecg', 'max heart rate', 'exercise angina', 'oldpeak', 'ST slope', 'target']

First 5 rows:
   age  sex  chest pain type  resting bp s  cholesterol  fasting blood sugar  \
0   40    1                2           140          289                    0   
1   49    0                3           160          180                    0   
2   37    1                2           130          283                    0   
3   48    0                4           138          214                    0   
4   54    1                3           150          195                    0   

   resting ecg  max heart rate  exercise angina  oldpeak  ST slope  target  
0            0             172                0      0.0         1       0  
1            0             156                0      1.0         2       1  
2            1              98   

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/internshipProject/heart_disease/Heart Disease/dataset.csv")

print("Dataset Loaded")
print(f"Dataset size: {df.shape[0]} rows × {df.shape[1]} columns")
print("\nColumn Names:", list(df.columns))
print("\nFirst 5 rows:")
print(df.head())

# Features & target
X = df.drop(columns=["target"])
y = df["target"]

# Train-test split on the raw features, done before any fitted preprocessing
# so the test rows cannot leak into the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale: fit on the training split, apply the same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Base models
xgb = XGBClassifier(
    n_estimators=300, max_depth=5, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, gamma=0.2,
    reg_lambda=1, eval_metric="logloss", random_state=42
)

lgb = LGBMClassifier(
    n_estimators=300, max_depth=5, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1, random_state=42
)

cat = CatBoostClassifier(
    iterations=300, depth=5, learning_rate=0.05, l2_leaf_reg=3,
    subsample=0.8, verbose=0, random_state=42
)

rf = RandomForestClassifier(
    n_estimators=300, max_depth=8, random_state=42
)

# Ensemble Voting Classifier: 'soft' voting averages the members' predicted probabilities.
ensemble = VotingClassifier(
    estimators=[('xgb', xgb), ('lgb', lgb), ('cat', cat), ('rf', rf)],
    voting='soft'
)

# Train
ensemble.fit(X_train, y_train)

# Predict
y_pred = ensemble.predict(X_test)

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"\nEnsemble Model Accuracy: {acc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Dataset Loaded
Dataset size: 1190 rows × 12 columns

Column Names: ['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol', 'fasting blood sugar', 'resting ecg', 'max heart rate', 'exercise angina', 'oldpeak', 'ST slope', 'target']

First 5 rows:
   age  sex  chest pain type  resting bp s  cholesterol  fasting blood sugar  \
0   40    1                2           140          289                    0   
1   49    0                3           160          180                    0   
2   37    1                2           130          283                    0   
3   48    0                4           138          214                    0   
4   54    1                3           150          195                    0   

   resting ecg  max heart rate  exercise angina  oldpeak  ST slope  target  
0            0             172                0      0.0         1       0  
1            0             156                0      1.0         2       1  
2            1              98   



In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the full heart-disease table.
csv_path = "/content/drive/MyDrive/internshipProject/heart_disease/Heart Disease/dataset.csv"
df = pd.read_csv(csv_path)

# Label column versus predictor columns.
y = df["target"]
X = df.drop(columns=["target"])

# Deliberate overfitting demonstration: a 1-nearest-neighbour model is fitted on
# every row and then scored on those very same rows (no hold-out split at all).
model = KNeighborsClassifier(n_neighbors=1)
y_pred = model.fit(X, y).predict(X)

# Metrics computed against the training labels themselves.
acc = accuracy_score(y, y_pred)
print(f"Accuracy (Overfit): {acc:.4f}")
print("\nClassification Report:")
print(classification_report(y, y_pred))


Accuracy (Overfit): 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       561
           1       1.00      1.00      1.00       629

    accuracy                           1.00      1190
   macro avg       1.00      1.00      1.00      1190
weighted avg       1.00      1.00      1.00      1190



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/internshipProject/heart_disease/Heart Disease/dataset.csv")

# Features & target
X = df.drop(columns=["target"])
y = df["target"]

# Train-test split FIRST, on the original rows.
# Resampling or scaling before the split would put SMOTE-generated rows (built from
# training neighbours) and train-derived statistics into the evaluation set, which
# inflates the reported score. The test set below contains only real, untouched rows.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Polynomial features (can improve non-linear separability)
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train_raw)
X_test_poly = poly.transform(X_test_raw)

# Scale features: fit on the training split only, reuse the transform for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test = scaler.transform(X_test_poly)

# Handle class imbalance on the training split only; the test split keeps its natural class ratio.
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train_scaled, y_train_raw)

# Base models
rf = RandomForestClassifier(n_estimators=500, max_depth=15, random_state=42)
et = ExtraTreesClassifier(n_estimators=500, max_depth=15, random_state=42)
gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42)
hgb = HistGradientBoostingClassifier(max_iter=500, learning_rate=0.05, max_depth=6, random_state=42)
xgb = XGBClassifier(n_estimators=500, max_depth=6, learning_rate=0.05, subsample=0.9, colsample_bytree=0.9, eval_metric="logloss", random_state=42)
lgb = LGBMClassifier(n_estimators=500, max_depth=6, learning_rate=0.05, subsample=0.9, colsample_bytree=0.9, random_state=42)
cat = CatBoostClassifier(iterations=500, depth=6, learning_rate=0.05, verbose=0, random_state=42)

# Voting ensemble (hard voting: majority of the members' predicted labels)
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('et', et),
        ('gb', gb),
        ('hgb', hgb),
        ('xgb', xgb),
        ('lgb', lgb),
        ('cat', cat)
    ],
    voting='hard',  # Can change to 'soft' if you want probability-based voting
    n_jobs=-1
)

# Train
voting_clf.fit(X_train, y_train)

# Predict
y_pred = voting_clf.predict(X_test)

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"\nFinal Voting Ensemble Accuracy: {acc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Final Voting Ensemble Accuracy: 0.9365

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.92      0.94       126
           1       0.92      0.95      0.94       126

    accuracy                           0.94       252
   macro avg       0.94      0.94      0.94       252
weighted avg       0.94      0.94      0.94       252



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/internshipProject/heart_disease/Heart Disease/dataset.csv")

# Features & target
X = df.drop(columns=["target"])
y = df["target"]

# Train-test split FIRST, on the original rows, so that neither the scaler's
# statistics nor SMOTE's synthetic rows can carry training information into the
# evaluation set. The test set below holds only real, unresampled rows.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Feature expansion (polynomial)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_raw)
X_test_poly = poly.transform(X_test_raw)

# Scale features: fitted on the training split, applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test = scaler.transform(X_test_poly)

# Handle imbalance on the training split only
sm = SMOTE(random_state=42, k_neighbors=5)
X_train, y_train = sm.fit_resample(X_train_scaled, y_train_raw)

# Base models
xgb = XGBClassifier(
    n_estimators=800, max_depth=6, learning_rate=0.03, subsample=0.9,
    colsample_bytree=0.9, gamma=0.2, reg_lambda=1, eval_metric="logloss", random_state=42
)

lgb = LGBMClassifier(
    n_estimators=800, max_depth=6, learning_rate=0.03, subsample=0.9,
    colsample_bytree=0.9, reg_lambda=1, random_state=42
)

cat = CatBoostClassifier(
    iterations=800, depth=6, learning_rate=0.03, l2_leaf_reg=3,
    subsample=0.9, verbose=0, random_state=42
)

rf = RandomForestClassifier(
    n_estimators=800, max_depth=12, min_samples_split=2, min_samples_leaf=1, random_state=42
)

et = ExtraTreesClassifier(
    n_estimators=800, max_depth=12, min_samples_split=2, min_samples_leaf=1, random_state=42
)

gb = GradientBoostingClassifier(
    n_estimators=800, learning_rate=0.03, max_depth=6, random_state=42
)

# Stacking model: six base learners feeding a logistic-regression meta-learner,
# which also receives the input features directly (passthrough=True).
stacking = StackingClassifier(
    estimators=[('xgb', xgb), ('lgb', lgb), ('cat', cat), ('rf', rf), ('et', et), ('gb', gb)],
    final_estimator=LogisticRegression(max_iter=1000),
    passthrough=True
)

# Soft voting with stacking inside: the stack votes alongside rf / et / gb.
ensemble = VotingClassifier(
    estimators=[('stack', stacking), ('rf', rf), ('et', et), ('gb', gb)],
    voting='soft'
)

# Train
ensemble.fit(X_train, y_train)

# Predict
y_pred = ensemble.predict(X_test)

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"\nFinal Accuracy: {acc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


[LightGBM] [Info] Number of positive: 503, number of negative: 503
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001006 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5620
[LightGBM] [Info] Number of data points in the train set: 1006, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 402, number of negative: 402
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000857 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5254
[LightGBM] [Info] Number of data points in the train set: 804, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 402, number of negative: 403
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the over

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import StackingClassifier

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/internshipProject/heart_disease/Heart Disease/dataset.csv")

# Features & target
X = df.drop(columns=["target"])
y = df["target"]

# Train-test split FIRST, on the original rows. Scaling or resampling before the
# split would let test rows shape the scaler and would place SMOTE-generated rows
# in the evaluation set, so the reported score would be optimistic.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale features: fitted on the training split only (X_train_raw is a DataFrame,
# so the scaler also records the feature names it was fitted with).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Handle imbalance on the training split only; the test split keeps its real class ratio.
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train_scaled, y_train_raw)

# Base models.
# `use_label_encoder` is not passed to XGBClassifier: the installed XGBoost reports
# it as an unused parameter, so it only produced a warning.
xgb = XGBClassifier(
    n_estimators=300, max_depth=5, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, random_state=42,
    eval_metric='logloss'
)
lgbm = LGBMClassifier(n_estimators=300, learning_rate=0.05, max_depth=-1, random_state=42)
rf = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=42)
gb = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, max_depth=5, random_state=42)

# Stacking Ensemble: random-forest meta-learner trained on 5-fold out-of-fold base
# predictions, plus the scaled input features themselves (passthrough=True).
stack = StackingClassifier(
    estimators=[('xgb', xgb), ('lgbm', lgbm), ('rf', rf), ('gb', gb)],
    final_estimator=RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42),
    cv=5,
    passthrough=True
)

# Train
stack.fit(X_train, y_train)

# Predict
y_pred = stack.predict(X_test)

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"\nFinal Accuracy: {acc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


[LightGBM] [Info] Number of positive: 503, number of negative: 503
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000091 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 1006, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 402, number of negative: 402
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 402
[LightGBM] [Info] Number of data points in the train set: 804, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 402, number of negative: 403
[Lig

In [None]:
import joblib

# Persist the fitted stacking model and its scaler to Drive, in that order.
# Keys are destination paths; values are the in-memory objects to serialise.
artifacts = {
    "/content/drive/MyDrive/internshipProject/heart_disease/heart_disease_stack_model.pkl": stack,
    "/content/drive/MyDrive/internshipProject/heart_disease/scaler.pkl": scaler,
}
for destination, fitted_object in artifacts.items():
    joblib.dump(fitted_object, destination)

print("✅ Model and Scaler saved successfully!")


✅ Model and Scaler saved successfully!


In [None]:
import pandas as pd
import joblib
import random

# Restore the persisted stacking model and the scaler from Drive.
model_path = "/content/drive/MyDrive/internshipProject/heart_disease/heart_disease_stack_model.pkl"
scaler_path = "/content/drive/MyDrive/internshipProject/heart_disease/scaler.pkl"
model = joblib.load(model_path)
scaler = joblib.load(scaler_path)

# Function to generate random patient data
def generate_random_input():
    """Return one randomly drawn patient record as an 11-element list.

    Values are drawn, in this order, from hard-coded inclusive ranges: age, sex,
    chest pain type, resting bp s, cholesterol, fasting blood sugar, resting ecg,
    max heart rate, exercise angina, oldpeak, ST slope. All entries are integers
    except oldpeak, which is a float rounded to one decimal place.
    """
    age = random.randint(29, 77)
    sex = random.randint(0, 1)
    chest_pain_type = random.randint(1, 4)
    resting_bp = random.randint(90, 200)
    cholesterol = random.randint(126, 564)
    fasting_blood_sugar = random.randint(0, 1)
    resting_ecg = random.randint(0, 2)
    max_heart_rate = random.randint(71, 202)
    exercise_angina = random.randint(0, 1)
    oldpeak = round(random.uniform(0.0, 6.2), 1)
    st_slope = random.randint(0, 2)

    return [
        age, sex, chest_pain_type, resting_bp, cholesterol,
        fasting_blood_sugar, resting_ecg, max_heart_rate,
        exercise_angina, oldpeak, st_slope,
    ]

# Feature columns in the order the training table lists them (everything except "target").
feature_names = ['age', 'sex', 'chest pain type', 'resting bp s',
                 'cholesterol', 'fasting blood sugar', 'resting ecg',
                 'max heart rate', 'exercise angina', 'oldpeak', 'ST slope']

# Generate one random patient and scale it.
# The scaler was fitted on a DataFrame, so it is given a named single-row DataFrame
# here: this makes the column order explicit and avoids scikit-learn's
# "X does not have valid feature names" warning that a bare list would trigger.
random_data = generate_random_input()
patient_frame = pd.DataFrame([random_data], columns=feature_names)
scaled_data = scaler.transform(patient_frame)

# Prediction for the single row (1 is reported as high risk, anything else as low risk)
prediction = model.predict(scaled_data)[0]

print("\nRandom Patient Data:", random_data)
if prediction == 1:
    print(" High risk of heart disease")
else:
    print(" Low risk of heart disease")



Random Patient Data: [51, 1, 3, 193, 148, 1, 2, 102, 1, 0.5, 1]
 High risk of heart disease


In [None]:
import pandas as pd
import joblib
import random
import os

# Where the persisted artefacts live, and where predictions get logged as CSV.
model_path = "/content/drive/MyDrive/internshipProject/heart_disease/heart_disease_stack_model.pkl"
scaler_path = "/content/drive/MyDrive/internshipProject/heart_disease/scaler.pkl"
log_file = "/content/drive/MyDrive/internshipProject/heart_disease/prediction_log.csv"

# Restore the fitted model and scaler from Drive.
model = joblib.load(model_path)
scaler = joblib.load(scaler_path)

# Function to generate random patient data
def generate_random_input():
    """Build one random patient record and return its values as a list.

    Each field is sampled from a fixed inclusive range; the returned list keeps
    the field order shown in the mapping below (11 values, oldpeak being the only
    float, rounded to one decimal).
    """
    record = {
        "age": random.randint(29, 77),
        "sex": random.randint(0, 1),
        "chest pain type": random.randint(1, 4),
        "resting bp s": random.randint(90, 200),
        "cholesterol": random.randint(126, 564),
        "fasting blood sugar": random.randint(0, 1),
        "resting ecg": random.randint(0, 2),
        "max heart rate": random.randint(71, 202),
        "exercise angina": random.randint(0, 1),
        "oldpeak": round(random.uniform(0.0, 6.2), 1),
        "ST slope": random.randint(0, 2),
    }
    # Dict values come back in insertion order, i.e. the order written above.
    return list(record.values())

# CSV layout: the eleven feature columns followed by the textual prediction.
columns = ['age', 'sex', 'chest pain type', 'resting bp s',
           'cholesterol', 'fasting blood sugar', 'resting ecg',
           'max heart rate', 'exercise angina', 'oldpeak', 'ST slope', 'Prediction']
feature_names = columns[:-1]

# Generate one random patient and scale it.
# The scaler was fitted on a DataFrame, so it receives a named single-row DataFrame:
# the column order is explicit and scikit-learn's feature-name warning is avoided.
random_data = generate_random_input()
scaled_data = scaler.transform(pd.DataFrame([random_data], columns=feature_names))

# Prediction (1 is reported as high risk, anything else as low risk)
prediction = model.predict(scaled_data)[0]
risk = "High risk" if prediction == 1 else "Low risk"

print("\nRandom Patient Data:", random_data)
print("Prediction:", risk)

# One-row frame holding the raw (unscaled) inputs plus the prediction label
df_row = pd.DataFrame([random_data + [risk]], columns=columns)

# Append to the CSV log; append mode creates the file if it is missing, and the
# header row is written only in that case.
df_row.to_csv(log_file, mode='a', header=not os.path.exists(log_file), index=False)

print(f" Data saved to {log_file}")



Random Patient Data: [76, 1, 4, 164, 330, 1, 0, 106, 1, 0.6, 0]
Prediction: High risk
 Data saved to /content/drive/MyDrive/internshipProject/heart_disease/prediction_log.csv


In [None]:
import pandas as pd
import joblib
import random
import os
import time

# Paths of the persisted artefacts and of the CSV file that collects predictions.
log_file = "/content/drive/MyDrive/internshipProject/heart_disease/prediction_log.csv"
model_path = "/content/drive/MyDrive/internshipProject/heart_disease/heart_disease_stack_model.pkl"
scaler_path = "/content/drive/MyDrive/internshipProject/heart_disease/scaler.pkl"

# Bring the fitted model and scaler back into memory.
model = joblib.load(model_path)
scaler = joblib.load(scaler_path)

# Function to generate random patient data
def generate_random_input():
    """Sample a random 11-value patient record.

    The first nine values and the last one are integers drawn from fixed inclusive
    ranges; the tenth (oldpeak) is a float in [0.0, 6.2] rounded to one decimal.
    Order: age, sex, chest pain type, resting bp s, cholesterol, fasting blood
    sugar, resting ecg, max heart rate, exercise angina, oldpeak, ST slope.
    """
    # Inclusive (low, high) bounds for the nine integer fields that precede oldpeak.
    leading_int_ranges = (
        (29, 77),    # age
        (0, 1),      # sex
        (1, 4),      # chest pain type
        (90, 200),   # resting bp s
        (126, 564),  # cholesterol
        (0, 1),      # fasting blood sugar
        (0, 2),      # resting ecg
        (71, 202),   # max heart rate
        (0, 1),      # exercise angina
    )
    values = [random.randint(low, high) for low, high in leading_int_ranges]
    values.append(round(random.uniform(0.0, 6.2), 1))  # oldpeak
    values.append(random.randint(0, 2))                # ST slope
    return values

# Column names for CSV: the eleven feature columns followed by the textual prediction.
columns = ['age', 'sex', 'chest pain type', 'resting bp s',
           'cholesterol', 'fasting blood sugar', 'resting ecg',
           'max heart rate', 'exercise angina', 'oldpeak', 'ST slope', 'Prediction']
feature_names = columns[:-1]

# Optional cap on the number of simulated predictions. None keeps the loop running
# until the kernel is interrupted; set an integer so the cell can finish on its own
# (e.g. under "Run All").
MAX_PREDICTIONS = None

# Real-time simulation loop: one random patient every 5 seconds.
predictions_made = 0
try:
    while MAX_PREDICTIONS is None or predictions_made < MAX_PREDICTIONS:
        # Generate a random patient and scale it. The scaler was fitted on a
        # DataFrame, so it is given a named single-row DataFrame: the column order is
        # explicit and scikit-learn's feature-name warning is avoided.
        random_data = generate_random_input()
        scaled_data = scaler.transform(pd.DataFrame([random_data], columns=feature_names))

        # Prediction (1 is reported as high risk, anything else as low risk)
        prediction = model.predict(scaled_data)[0]
        risk = "High risk" if prediction == 1 else "Low risk"

        # Show in terminal
        print("\nRandom Patient Data:", random_data)
        print("Prediction:", risk)

        # Save to CSV: append mode creates the file when missing, and the header
        # row is written only in that case.
        df_row = pd.DataFrame([random_data + [risk]], columns=columns)
        df_row.to_csv(log_file, mode='a', header=not os.path.exists(log_file), index=False)

        print(f"Data saved to {log_file}")

        predictions_made += 1

        # Wait before next prediction
        time.sleep(5)  # 5 seconds delay

except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop an uncapped run.
    print("\n Real-time simulation stopped.")



Random Patient Data: [36, 0, 2, 191, 474, 1, 2, 87, 1, 2.4, 1]
Prediction: High risk
Data saved to /content/drive/MyDrive/internshipProject/heart_disease/prediction_log.csv

Random Patient Data: [62, 1, 1, 177, 495, 0, 2, 139, 1, 0.7, 1]
Prediction: High risk
Data saved to /content/drive/MyDrive/internshipProject/heart_disease/prediction_log.csv

Random Patient Data: [39, 1, 1, 182, 494, 1, 2, 116, 0, 5.4, 1]
Prediction: High risk
Data saved to /content/drive/MyDrive/internshipProject/heart_disease/prediction_log.csv

Random Patient Data: [69, 0, 2, 137, 516, 0, 2, 71, 1, 3.0, 0]
Prediction: High risk
Data saved to /content/drive/MyDrive/internshipProject/heart_disease/prediction_log.csv

Random Patient Data: [52, 1, 2, 97, 249, 0, 0, 195, 0, 6.1, 2]
Prediction: Low risk
Data saved to /content/drive/MyDrive/internshipProject/heart_disease/prediction_log.csv

Random Patient Data: [37, 0, 4, 160, 210, 1, 2, 179, 0, 5.8, 2]
Prediction: High risk
Data saved to /content/drive/MyDrive/inter