In [181]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings("ignore")

# Preprocessing Data

In [182]:
# Load the dataset (assuming it's in a CSV file format)
file_path = '..\\data\\heart-2.csv'
df = pd.read_csv(file_path)

df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


## Separate Features

In [183]:
# Assuming 'cardio' is the target variable
X = df.drop(columns=['target'])
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2


## Separate Target

In [184]:
y = df['target']
y

0       0
1       0
2       0
3       0
4       0
       ..
1020    1
1021    0
1022    0
1023    1
1024    0
Name: target, Length: 1025, dtype: int64

In [185]:
# Keeping the index for later use
indices = df.index.tolist()

## Split Data into 70-30 ratio

In [186]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(
    X, y, indices, test_size=0.2, random_state=42
)

In [187]:
pd.concat([X_train, y_train], axis=1)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
835,49,1,2,118,149,0,0,126,0,0.8,2,3,2,0
137,64,0,0,180,325,0,1,154,1,0.0,2,0,2,1
534,54,0,2,108,267,0,0,167,0,0.0,2,0,2,1
495,59,1,0,135,234,0,1,161,0,0.5,1,0,3,1
244,51,1,2,125,245,1,0,166,0,2.4,1,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,41,1,2,130,214,0,0,168,0,2.0,1,0,2,1
71,61,1,0,140,207,0,0,138,1,1.9,2,1,3,0
106,51,1,0,140,299,0,1,173,1,1.6,2,0,3,0
270,43,1,0,110,211,0,1,161,0,0.0,2,0,3,1


In [188]:
# trains model until it reaches atleast 95% accuracy by excluding data
threshold = 0.95
# no more training will take place is remaining dataset size reduces to 0.1% main dataset size
cutoff = len(X_train) * 0.01 * 0.1 

In [189]:
class TrainerModel:
    def __init__(self, X, y, threshold, model_number):
        self.X = X
        self.y = y
        self.threshold = threshold
        self.model_number = model_number  # Keep track of the model number
        self.model = None
        self.trained_indices = []  # To store indices of correctly trained samples
        self.X_baad = pd.DataFrame()  # DataFrame for incorrectly predicted samples
        self.y_baad = pd.Series()  # Series for incorrectly predicted labels

    def train(self):
        while True:  # Continue until the model meets the accuracy threshold
            # Create a new instance of LogisticRegression for each training iteration
            self.model = LogisticRegression(max_iter=1000)
            self.model.fit(self.X, self.y)

            # Calculate accuracy on the training data
            y_pred = self.model.predict(self.X)
            accuracy = accuracy_score(self.y, y_pred)
            print(f"Model {self.model_number} Accuracy: {accuracy:.4f} on {len(self.X)} data")
            
            self.separate_data(y_pred)
            
            # Check if the accuracy meets the threshold
            if accuracy >= self.threshold:
                print(f"Model {self.model_number} reached threshold of {self.threshold}.")
                break  # Exit the loop if accuracy meets the threshold
        print(f'X_baad size: {len(self.X_baad)}')
        return accuracy  # Return accuracy
    
    
    def separate_data(self, y_pred):
        # Append incorrectly predicted samples to self.X_baad and self.y_baad
        X_incorrect = self.X[y_pred != self.y]
        y_incorrect = self.y[y_pred != self.y]
        self.X_baad = pd.concat([self.X_baad, X_incorrect], axis=0)
        self.y_baad = pd.concat([self.y_baad, y_incorrect], axis=0)
        # Store indices for samples that this model trained on correctly
        self.trained_indices = self.X.index[y_pred == self.y].tolist()

        # Exclude incorrectly predicted samples from the training data
        self.X = self.X[y_pred == self.y]
        self.y = self.y[y_pred == self.y]

In [190]:
class Trainer:
    def __init__(self, X, y, threshold, cutoff):
        self.models = []
        self.X = X
        self.y = y
        self.threshold = threshold
        self.cutoff = cutoff
        self.training_tracker = {}  # Dictionary to store which model trained on each sample

    def train(self, num=10):
        for i in range(num):
            print(f"Training model {i + 1}/{num}")
            model = TrainerModel(self.X, self.y, self.threshold, i + 1)
            
            # Train the model
            try:
                accuracy = model.train()
            except Exception as e:
                print(f"Error during training: {e}")
                break

            # Track which model trained each index
            for idx in model.trained_indices:
                if idx in self.training_tracker:
                    self.training_tracker[idx].append(i + 1)  # Append model number
                else:
                    self.training_tracker[idx] = [i + 1]

            # Update the training data
            self.X = model.X_baad
            self.y = model.y_baad
            self.models.append(model.model)

            print(f"Model {i + 1} trained. Remaining data: {len(self.X)} samples.")

            # Check if we reached the cutoff
            if len(self.X) < self.cutoff:
                print("Cutoff reached. Stopping training.")
                break

        print(f"Training completed. Total models trained: {len(self.models)}.")

In [191]:
# Training phase
trainer = Trainer(X_train, y_train, threshold, cutoff)  # Adjust threshold and cutoff as needed
trainer.train()

Training model 1/10
Model 1 Accuracy: 0.8720 on 820 data
Model 1 Accuracy: 0.9888 on 715 data
Model 1 reached threshold of 0.95.
X_baad size: 113
Model 1 trained. Remaining data: 113 samples.
Training model 2/10
Model 2 Accuracy: 0.9469 on 113 data
Model 2 Accuracy: 1.0000 on 107 data
Model 2 reached threshold of 0.95.
X_baad size: 6
Model 2 trained. Remaining data: 6 samples.
Training model 3/10
Model 3 Accuracy: 1.0000 on 6 data
Model 3 reached threshold of 0.95.
X_baad size: 0
Model 3 trained. Remaining data: 0 samples.
Cutoff reached. Stopping training.
Training completed. Total models trained: 3.


In [192]:
import re

class Predictor:
    def __init__(self, models, X_test, y_test, test_indices, file_path):
        self.models = models
        self.X_test = X_test
        self.y_test = y_test
        self.test_indices = test_indices
        self.file_path = file_path
        self.results_df = None
         

    def predict(self):
        results = []  # Temporary list to build the DataFrame
        y_pred_final = []  # List to store final predictions for accuracy metrics

        for index, x, y in zip(self.test_indices, self.X_test.to_numpy(), self.y_test):  # Using .to_numpy() to ensure we get the array
            row = {"Index": index}  # Initialize row with index and expected output
            probas = [model.predict_proba(x.reshape(1, -1))[0] for model in self.models]  # Probabilities for each model
            
            # Collect probabilities with model-specific naming
            for i, p in enumerate(probas):
                row[f"model{i}"] = (p[0], p[1])  # Tuple of probabilities (prob_0, prob_1) for each model
            
            # Calculate confidence and final prediction
            confidence = [abs(p[1] - 0.5) for p in probas]
            max_confidence_index = 0
            maxx = -1
            for i, p in enumerate(probas):
                if maxx > p[1]:
                    maxx = max(maxx, p[1])
                    max_confidence_index = i
            
            verdict = 1 if probas[max_confidence_index][1] > 0.5 else 0
            row["Expected"] = self.y_test[index]
            row["Prediction"] = verdict
            row["matches?"] = "YES" if verdict == y else "NO"

            y_pred_final.append(row["Prediction"])  # Append final prediction for metrics calculation
            
            results.append(row)

        self.results_df = pd.DataFrame(results)

        # Calculate and display testing metrics
        accuracy = accuracy_score(self.y_test, y_pred_final)
        report = classification_report(self.y_test, y_pred_final, output_dict=True)
        print(f"\nTesting Accuracy: {accuracy}")
        
        strs = re.split(r"[\\/\.]", file_path)
        # print("\nDetailed Classification Report:\n", pd.DataFrame(report).transpose())

        # Save results to CSV excluding "Trained_in_Model" column
        self.results_df.drop(columns=["Trained_in_Model"], errors="ignore", inplace=True)
        self.results_df.to_excel(f"..\\outputs\\{strs[-2]}_predictions_results.xlsx", index=False)
        print(f"Results saved to \\outputs\\{strs[-2]}_predictions_results.xlsx")

In [193]:
# Prediction phase
predictor = Predictor(trainer.models, X_test, y_test, test_indices, file_path)
predictor.predict()


Testing Accuracy: 0.7951219512195122
Results saved to \outputs\csv_predictions_results.xlsx
