In [196]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings("ignore")

# Preprocessing Data

In [197]:
# Load the dataset (assuming it's in a CSV file format)
file_path = '..\\data\\cardio.csv'
df = pd.read_csv(file_path)

df

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,target
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


## Separate Features

In [198]:
# Assuming 'cardio' is the target variable
X = df.drop(columns=['target'])
X

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,0,18393,2,168,62.0,110,80,1,1,0,0,1
1,1,20228,1,156,85.0,140,90,3,1,0,0,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0


## Separate Target

In [199]:
y = df['target']
y

0        0
1        1
2        1
3        1
4        0
        ..
69995    0
69996    1
69997    1
69998    1
69999    0
Name: target, Length: 70000, dtype: int64

In [200]:
# Keeping the index for later use
indices = df.index.tolist()

## Split Data into 70-30 ratio

In [201]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(
    X, y, indices, test_size=0.2, random_state=42
)

# Necessary classes for the process

## ModelTrainer class
it is built to train individual models until it gives accuracy atleast equals to the threshold

In [204]:
class ModelTrainer:
    def __init__(self, X, y, threshold, model_number):
        self.X = X
        self.y = y
        self.threshold = threshold
        self.model_number = model_number  # Keep track of the model number
        self.model = None
        self.trained_indices = []  # To store indices of correctly trained samples
        self.X_baad = pd.DataFrame()  # DataFrame for incorrectly predicted samples
        self.y_baad = pd.Series()  # Series for incorrectly predicted labels

    def train(self):
        while True:  # Continue until the model meets the accuracy threshold
            # Create a new instance of LogisticRegression for each training iteration
            self.model = LogisticRegression(max_iter=1000)
            self.model.fit(self.X, self.y)

            # Calculate accuracy on the training data
            y_pred = self.model.predict(self.X)
            accuracy = accuracy_score(self.y, y_pred)
            print(f"Model {self.model_number} Accuracy: {accuracy:.4f} on {len(self.X)} data")
            
            self.separate_data(y_pred)
            
            # Check if the accuracy meets the threshold
            if accuracy >= self.threshold:
                print(f"Model {self.model_number} reached threshold of {self.threshold}.")
                break  # Exit the loop if accuracy meets the threshold
        print(f'X_baad size: {len(self.X_baad)}')
        return accuracy  # Return accuracy
    
    
    def separate_data(self, y_pred):
        # Append incorrectly predicted samples to self.X_baad and self.y_baad
        X_incorrect = self.X[y_pred != self.y]
        y_incorrect = self.y[y_pred != self.y]
        self.X_baad = pd.concat([self.X_baad, X_incorrect], axis=0)
        self.y_baad = pd.concat([self.y_baad, y_incorrect], axis=0)
        # Store indices for samples that this model trained on correctly
        self.trained_indices = self.X.index[y_pred == self.y].tolist()

        # Exclude incorrectly predicted samples from the training data
        self.X = self.X[y_pred == self.y]
        self.y = self.y[y_pred == self.y]

## Trainer class
This class is to train multiple models with the given dataset

In [205]:
class Trainer:
    def __init__(self, X, y, threshold, cutoff):
        self.models = []
        self.X = X
        self.y = y
        self.threshold = threshold
        self.cutoff = cutoff
        self.training_tracker = {}  # Dictionary to store which model trained on each sample

    def train(self, num=10):
        for i in range(num):
            print(f"Training model {i + 1}/{num}")
            model = ModelTrainer(self.X, self.y, self.threshold, i + 1)
            
            # Train the model
            try:
                accuracy = model.train()
            except Exception as e:
                print(f"Error during training: {e}")
                break

            # Track which model trained each index
            for idx in model.trained_indices:
                if idx in self.training_tracker:
                    self.training_tracker[idx].append(i + 1)  # Append model number
                else:
                    self.training_tracker[idx] = [i + 1]

            # Update the training data
            self.X = model.X_baad
            self.y = model.y_baad
            self.models.append(model.model)

            print(f"Model {i + 1} trained. Remaining data: {len(self.X)} samples.")

            # Check if we reached the cutoff
            if len(self.X) < self.cutoff:
                print("Cutoff reached. Stopping training.")
                break

        print(f"Training completed. Total models trained: {len(self.models)}.")

In [None]:
# trains model until it reaches atleast 95% accuracy by excluding data
threshold = 0.95
# no more training will take place is remaining dataset size reduces to 0.1% main dataset size
cutoff = len(X_train) * 0.01 * 0.1 

In [206]:
# Training phase
trainer = Trainer(X_train, y_train, threshold, cutoff)  # Adjust threshold and cutoff as needed
trainer.train()

Training model 1/10
Model 1 Accuracy: 0.7055 on 56000 data
Model 1 Accuracy: 0.9969 on 39510 data
Model 1 reached threshold of 0.95.
X_baad size: 16613
Model 1 trained. Remaining data: 16613 samples.
Training model 2/10
Model 2 Accuracy: 0.9898 on 16613 data
Model 2 reached threshold of 0.95.
X_baad size: 170
Model 2 trained. Remaining data: 170 samples.
Training model 3/10
Model 3 Accuracy: 0.9000 on 170 data
Model 3 Accuracy: 0.9739 on 153 data
Model 3 reached threshold of 0.95.
X_baad size: 21
Model 3 trained. Remaining data: 21 samples.
Cutoff reached. Stopping training.
Training completed. Total models trained: 3.


In [207]:
import re

class Predictor:
    def __init__(self, models, X_test, y_test, test_indices, file_path):
        self.models = models
        self.X_test = X_test
        self.y_test = y_test
        self.test_indices = test_indices
        self.file_path = file_path
        self.results_df = None
         

    def predict(self):
        results = []  # Temporary list to build the DataFrame
        y_pred_final = []  # List to store final predictions for accuracy metrics

        for index, x, y in zip(self.test_indices, self.X_test.to_numpy(), self.y_test):  # Using .to_numpy() to ensure we get the array
            row = {"Index": index}  # Initialize row with index and expected output
            probas = [model.predict_proba(x.reshape(1, -1))[0] for model in self.models]  # Probabilities for each model
            
            # Collect probabilities with model-specific naming
            for i, p in enumerate(probas):
                row[f"model{i}"] = (p[0], p[1])  # Tuple of probabilities (prob_0, prob_1) for each model
            
            # Calculate confidence and final prediction
            confidence = [abs(p[1] - 0.5) for p in probas]
            max_confidence_index = 0
            maxx = -1
            for i, p in enumerate(probas):
                if maxx > p[1]:
                    maxx = max(maxx, p[1])
                    max_confidence_index = i
            
            verdict = 1 if probas[max_confidence_index][1] > 0.5 else 0
            row["Expected"] = self.y_test[index]
            row["Prediction"] = verdict
            row["matches?"] = "YES" if verdict == y else "NO"

            y_pred_final.append(row["Prediction"])  # Append final prediction for metrics calculation
            
            results.append(row)

        self.results_df = pd.DataFrame(results)

        # Calculate and display testing metrics
        accuracy = accuracy_score(self.y_test, y_pred_final)
        report = classification_report(self.y_test, y_pred_final, output_dict=True)
        print(f"\nTesting Accuracy: {accuracy}")
        
        strs = re.split(r"[\\/\.]", file_path)
        # print("\nDetailed Classification Report:\n", pd.DataFrame(report).transpose())

        # Save results to CSV excluding "Trained_in_Model" column
        self.results_df.drop(columns=["Trained_in_Model"], errors="ignore", inplace=True)
        self.results_df.to_excel(f"..\\outputs\\{strs[-2]}_predictions_results.xlsx", index=False)
        print(f"Results saved to \\outputs\\{strs[-2]}_predictions_results.xlsx")

In [208]:
# Prediction phase
predictor = Predictor(trainer.models, X_test, y_test, test_indices, file_path)
predictor.predict()


Testing Accuracy: 0.7140714285714286
Results saved to \outputs\cardio_predictions_results.xlsx
