# Heart Arrhythmia

## Preprocessing

In [1]:
import pandas as pd
import numpy as np
from dateutil import parser
from dateutil.parser import parse

# Paths of the per-second heart-rate CSV that is read here and of the
# per-minute CSV that this cell writes at the end.
input_file, output_file = (
    r'..Arrhythmia\Data\Seconds\heartrate_seconds_merged.csv',
    r'..Arrhythmia\Data\Minutes\minuteHeartrate.csv',
)

# Read the per-second readings into a DataFrame
df = pd.read_csv(input_file)

# Function to parse timestamps with multiple formats
def parse_timestamp(timestamp):
    """Parse one "Time" cell with dateutil, returning ``pd.NaT`` on failure.

    Parameters
    ----------
    timestamp : object
        Value to parse; normally a string.

    Returns
    -------
    datetime.datetime or pd.NaT
        The parsed value, or ``pd.NaT`` when the value cannot be parsed.
    """
    try:
        return parser.parse(timestamp)
    except (ValueError, TypeError, OverflowError):
        # ValueError: unrecognised format (dateutil's ParserError subclasses
        #   ValueError, so this also works on releases without ParserError).
        # TypeError: non-string input, e.g. NaN read from an empty CSV field.
        # OverflowError: parsed date exceeds the platform's largest C integer.
        return pd.NaT

# Convert every raw "Time" entry with parse_timestamp (failures become NaT)
df['Time'] = df['Time'].apply(parse_timestamp)

# Keep only the rows whose timestamp could be parsed
df = df.loc[df['Time'].notna()]

# Mean heart rate per Id within one-minute bins
minute_bins = pd.Grouper(key='Time', freq='1Min')
minute_means = df.groupby(['Id', minute_bins])['Value'].mean()
result = minute_means.reset_index()

# Store the per-minute means as whole numbers
result['Value'] = result['Value'].round().astype(int)

# Write the per-minute table to output_file without the index column
result.to_csv(output_file, index=False)

In [4]:
# Load the per-minute heart-rate file written by the preprocessing cell.
# The name matches the file saved there exactly (including letter case) so
# the cell also runs on case-sensitive file systems.
heart_rate_df = pd.read_csv(r'..Arrhythmia\Data\Minutes\minuteHeartrate.csv')

# Standardize the date format in the 'Time' column using dateutil.parser
heart_rate_df['Time'] = heart_rate_df['Time'].apply(parse)

# Load the minute-level sleep log and parse its 'Time' column the same way
minute_sleep_df = pd.read_csv(r'..Arrhythmia\Data\Minutes\minuteSleep.csv')
minute_sleep_df['Time'] = minute_sleep_df['Time'].apply(parse)

# Collect every (Id, Time) pair present in the sleep log once, so each
# heart-rate row needs a single set lookup instead of a scan over the whole
# sleep table.
sleep_keys = set(zip(minute_sleep_df['Id'], minute_sleep_df['Time']))

# 'Sleep' is 1 when the row's (Id, Time) pair appears in the sleep log, else 0
heart_rate_df['Sleep'] = [
    int(key in sleep_keys)
    for key in zip(heart_rate_df['Id'], heart_rate_df['Time'])
]

In [7]:
# Every row starts out flagged as abnormal ("Abheartrate" = 1)
heart_rate_df['Abheartrate'] = 1

# Condition 1: not asleep (Sleep == 0) and heart rate within 60..100 inclusive
condition1 = heart_rate_df['Value'].between(60, 100) & (heart_rate_df['Sleep'] == 0)

# Condition 2: asleep (Sleep == 1) and heart rate within 40..50 inclusive
condition2 = heart_rate_df['Value'].between(40, 50) & (heart_rate_df['Sleep'] == 1)

# Clear the flag (set it to 0) on rows that satisfy either condition
heart_rate_df.loc[condition1 | condition2, 'Abheartrate'] = 0

In [8]:
from dateutil.parser import parse
# Read the wide intensity table: each row carries an Id, an ActivityHour
# and the columns Intensity00 .. Intensity59
df = pd.read_csv(r"..Arrhythmia\Intensities\minuteIntensitiesWide_merged.csv")

# Column name for each minute offset 0..59
minute_columns = [f'Intensity{minute:02d}' for minute in range(60)]

# Long-format records, collected as [Id, minute timestamp, intensity]
transformed_data = []

for _, row in df.iterrows():
    id_value = row['Id']
    activity_hour = parse(row['ActivityHour'], fuzzy=True)

    for minute, minute_str in enumerate(minute_columns):
        intensity_value = row[minute_str]

        # Minutes with zero intensity are left out of the long table
        if intensity_value == 0:
            continue

        # Timestamp of this minute = start of the hour + minute offset
        minute_timestamp = activity_hour + pd.Timedelta(minutes=minute)
        transformed_data.append([id_value, minute_timestamp, intensity_value])

# Assemble the collected records into a DataFrame
transformed_df = pd.DataFrame(transformed_data, columns=['Id', 'Time', 'Intensity'])

In [10]:
# Long-format intensity records (Id, Time, Intensity)
df1 = transformed_df

# Per-minute heart-rate table with the Sleep and Abheartrate columns
df2 = heart_rate_df

# Left-merge on "Id" and "Time": every heart-rate row is kept, and rows
# without a matching intensity record get NaN in "Intensity"
merged_df = pd.merge(df2, df1, on=["Id", "Time"], how="left")

# Reorder columns to place "Intensity" before "Abheartrate"
merged_df = merged_df[["Id", "Time", "Value", "Sleep", "Intensity", "Abheartrate"]]

# Fill blanks in the "Intensity" column with zero. The result is assigned
# back to the column: calling fillna(..., inplace=True) on a column selection
# is deprecated and does not update the frame under pandas Copy-on-Write.
merged_df["Intensity"] = merged_df["Intensity"].fillna(0)

# Make anything greater than zero in the "Intensity" column as 1
merged_df["Intensity"] = (merged_df["Intensity"] > 0).astype(int)

# Set "Abheartrate" values to 0 where "Intensity" is 1
merged_df.loc[merged_df["Intensity"] == 1, "Abheartrate"] = 0

# Rename columns to the names used by the modelling cells
merged_df = merged_df.rename(columns={'Value': 'Bpm', 'Abheartrate': 'Arrhythmia'})

# Save the merged dataset to a new CSV file
merged_df.to_csv(r"..Arrhythmia\Data\heartrate_final.csv", index=False)

## Model Implementation

### Hyperparameter tuning

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Load the labelled per-minute dataset
data = pd.read_csv(r"..Arrhythmia\Data\heartrate_final.csv")

# Target (y) and feature columns (X)
y = data["Arrhythmia"]
X = data[["Bpm", "Sleep", "Intensity"]]

# Candidate hyperparameter values; the search tries every combination
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# 5-fold cross-validated grid search scored by accuracy
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X, y)

# Best combination found and its mean cross-validated score
best_params, best_accuracy = grid_search.best_params_, grid_search.best_score_


In [14]:
# Report the best hyperparameter combination and its cross-validated score
for label, value in (("Best Parameters:", best_params), ("Best Accuracy:", best_accuracy)):
    print(label, value)

Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy: 1.0


### Model Training

In [16]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the labelled dataset and discard the 'Id' and 'Time' columns
data = pd.read_csv(r"..Arrhythmia\Data\heartrate_final.csv").drop(columns=["Id", 'Time'])

# Feature matrix (X) and target vector (y)
X = data[["Bpm", "Sleep", "Intensity"]]
y = data["Arrhythmia"]

# Hold out 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest with balanced class weights
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=30,
    class_weight='balanced',
    random_state=42,
)

# Fit on the training split, then predict the held-out rows
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

# Hold-out metrics
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Arrhythmia Model - Accuracy: ", accuracy)
print("Arrhythmia Model - Confusion Matrix:\n", confusion)
print("Arrhythmia Model - Classification Report:\n", report)

# 5-fold cross-validation over the full feature/target set
cv_scores = cross_val_score(rf_classifier, X, y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


Arrhythmia Model - Accuracy:  1.0
Arrhythmia Model - Confusion Matrix:
 [[49541     0]
 [    0 17143]]
Arrhythmia Model - Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     49541
           1       1.00      1.00      1.00     17143

    accuracy                           1.00     66684
   macro avg       1.00      1.00      1.00     66684
weighted avg       1.00      1.00      1.00     66684

Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean CV Accuracy: 1.0


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the labelled dataset and discard the 'Id' and 'Time' columns
data = pd.read_csv(r"..Arrhythmia\Data\heartrate_final.csv").drop(columns=["Id", 'Time'])

# Feature matrix (X) and target vector (y)
X = data[["Bpm", "Sleep", "Intensity"]]
y = data["Arrhythmia"]

# Hold out 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models to compare, keyed by the label printed for each one
classifiers = dict([
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
    ("SVM", SVC(kernel='rbf', random_state=42)),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("KNN", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
])

for name, classifier in classifiers.items():
    print("Training", name)

    # Fit on the training split and predict the held-out rows
    y_pred = classifier.fit(X_train, y_train).predict(X_test)

    # Hold-out metrics for this model
    accuracy = accuracy_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print("Model - Accuracy:", accuracy)
    print("Model - Confusion Matrix:\n", confusion)
    print("Model - Classification Report:\n", report)

    # 5-fold cross-validation over the full feature/target set
    cv_scores = cross_val_score(classifier, X, y, cv=5)
    print("Cross-Validation Scores:", cv_scores)
    print("Mean CV Accuracy:", cv_scores.mean())
    print("\n")


Training Random Forest
Model - Accuracy: 1.0
Model - Confusion Matrix:
 [[49541     0]
 [    0 17143]]
Model - Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     49541
           1       1.00      1.00      1.00     17143

    accuracy                           1.00     66684
   macro avg       1.00      1.00      1.00     66684
weighted avg       1.00      1.00      1.00     66684

Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean CV Accuracy: 1.0


Training MLP
Model - Accuracy: 1.0
Model - Confusion Matrix:
 [[49541     0]
 [    0 17143]]
Model - Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     49541
           1       1.00      1.00      1.00     17143

    accuracy                           1.00     66684
   macro avg       1.00      1.00      1.00     66684
weighted avg       1.00      1.00      1.00     66684

Cross-Validation Sco

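A 100% accuracy score for the random forest is not useful for real-world applications: the `Arrhythmia` label was produced by fixed rules on the same `Bpm`, `Sleep` and `Intensity` columns that serve as features, so the model only re-learns those rules.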

In [3]:
# Logistic Regression Algorithm

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the labelled dataset and discard the 'Id' and 'Time' columns
data = pd.read_csv(r"..Arrhythmia\Data\heartrate_final.csv").drop(columns=["Id", 'Time'])

# Feature matrix (X) and target vector (y)
X = data[["Bpm", "Sleep", "Intensity"]]
y = data["Arrhythmia"]

# Hold out 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a logistic regression with default settings on the training split
logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(X_train, y_train)

# Predict the held-out rows
y_pred = logreg_model.predict(X_test)

# Hold-out metrics
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Logistic Regression Model - Accuracy: ", accuracy)
print("Logistic Regression Model - Confusion Matrix:\n", confusion)
print("Logistic Regression Model - Classification Report:\n", report)


Logistic Regression Model - Accuracy:  0.8888488992861856
Logistic Regression Model - Confusion Matrix:
 [[48504  1037]
 [ 6375 10768]]
Logistic Regression Model - Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.98      0.93     49541
           1       0.91      0.63      0.74     17143

    accuracy                           0.89     66684
   macro avg       0.90      0.80      0.84     66684
weighted avg       0.89      0.89      0.88     66684



### Gradient Descent With Regularization

In [4]:
def compute_cost_logistic_reg(X, y, w, b, lambda_ = 1):
    """Regularized logistic-regression cost.

    Computes the mean cross-entropy of the predictions sigmoid(X @ w + b)
    against y, plus the L2 penalty (lambda_ / (2 * m)) * sum(w ** 2).
    The bias b is not penalised.

    Parameters
    ----------
    X : array of shape (m, n) -- feature matrix
    y : array of shape (m,)   -- targets (0 or 1)
    w : array of shape (n,)   -- weights
    b : scalar                -- bias
    lambda_ : scalar          -- regularization strength (default 1)

    Returns
    -------
    total_cost : scalar
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    w = np.asarray(w, dtype=float)
    m = X.shape[0]

    z = X @ w + b                                                      #(m,)

    # -log(sigmoid(z)) == logaddexp(0, -z) and -log(1 - sigmoid(z)) == logaddexp(0, z).
    # Working with z directly keeps the cost finite when the sigmoid
    # saturates to exactly 0 or 1, where log(0) would give inf / nan.
    losses = y * np.logaddexp(0.0, -z) + (1.0 - y) * np.logaddexp(0.0, z)
    cost = np.sum(losses) / m                                          #scalar

    reg_cost = (lambda_ / (2 * m)) * np.sum(w ** 2)                    #scalar

    total_cost = cost + reg_cost                                       #scalar
    return total_cost                                                  #scalar

def compute_gradient_logistic_reg(X, y, w, b, lambda_): 
    """Gradient of the regularized logistic-regression cost.

    Parameters
    ----------
    X : array of shape (m, n) -- feature matrix
    y : array of shape (m,)   -- targets
    w : array of shape (n,)   -- weights
    b : scalar                -- bias
    lambda_ : scalar          -- regularization strength

    Returns
    -------
    dj_db : scalar            -- gradient with respect to b
    dj_dw : array of shape (n,) -- gradient with respect to w, including
                                   the (lambda_ / m) * w penalty term
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    w = np.asarray(w, dtype=float)
    m = X.shape[0]

    # Prediction error of every sample at once instead of a per-row loop
    err = sigmoid(X @ w + b) - y                      #(m,)

    dj_dw = (X.T @ err) / m                           #(n,)
    dj_db = np.sum(err) / m                           #scalar

    # Only the weights are regularized, not the bias
    dj_dw = dj_dw + (lambda_ / m) * w

    return dj_db, dj_dw  

In [5]:
# Gradient descent function
def gradient_descent_reg(X, y, w_in, b_in, alpha, num_iters,lambda_): 
    """Run batch gradient descent on the regularized logistic cost.

    Parameters
    ----------
    X, y      : training features and targets, passed through to
                compute_gradient_logistic_reg / compute_cost_logistic_reg
    w_in      : initial weights (deep-copied, never modified)
    b_in      : initial bias
    alpha     : learning rate
    num_iters : number of update steps
    lambda_   : regularization strength, used for the gradient and the cost

    Returns
    -------
    w, b      : parameters after the last update
    J_history : list with the cost after each update (first 100000 steps only)
    """
    # An array to store cost J at each iteration primarily for graphing later
    J_history = []
    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in

    # Progress is printed roughly ten times over the run
    print_every = math.ceil(num_iters / 10)

    for i in range(num_iters):
        # Calculate the gradient and update the parameters
        dj_db, dj_dw = compute_gradient_logistic_reg(X, y, w, b, lambda_)

        # Update Parameters using w, b, alpha and gradient
        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        # Save cost J at each iteration. lambda_ is passed on so the logged
        # cost is the objective being minimised (omitting it would fall back
        # to the cost function's default of 1).
        if i<100000:      # prevent resource exhaustion 
            J_history.append( compute_cost_logistic_reg(X, y, w, b, lambda_) )

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i % print_every == 0:
            print(f"Iteration {i:4d}: Cost {J_history[-1]}   ")

    return w, b, J_history         #return final w,b and J history for graphing

In [None]:
import math
import copy
from scipy.special import expit as sigmoid

# NumPy versions of the training split for the hand-written routines
X_train_np, y_train_np = X_train.values, y_train.values

# Step size, regularization strength and number of update steps
alpha = 0.001
lambda_tmp = 0.1
num_iters = 1000

# Start from all-zero weights (one per feature column) and a zero bias
initial_w = np.zeros(X_train.shape[1])
initial_b = 0

# Run regularized gradient descent on the training split
final_w, final_b, J_history = gradient_descent_reg(
    X_train_np, y_train_np, initial_w, initial_b, alpha, num_iters, lambda_tmp
)

# Show the learned parameters
print("Final weights:", final_w)
print("Final bias:", final_b)

# Classify the test rows: predict 1 when the sigmoid output exceeds 0.5
test_scores = sigmoid(np.dot(X_test.values, final_w) + final_b)
y_pred = (test_scores > 0.5).astype(int)

# Hold-out metrics
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("\nGradient Descent Logistic Regression Model - Accuracy: ", accuracy)
print("Gradient Descent Logistic Regression Model - Confusion Matrix:\n", confusion)
print("Gradient Descent Logistic Regression Model - Classification Report:\n", report)


### Gradient Descent Without Regularization

In [8]:
def compute_gradient_logistic(X, y, w, b): 
    """Gradient of the (unregularized) logistic-regression cost.

    Parameters
    ----------
    X : array of shape (m, n) -- feature matrix
    y : array of shape (m,)   -- targets
    w : array of shape (n,)   -- weights
    b : scalar                -- bias

    Returns
    -------
    dj_db : scalar              -- gradient with respect to b
    dj_dw : array of shape (n,) -- gradient with respect to w
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    w = np.asarray(w, dtype=float)
    m = X.shape[0]

    # Prediction error of every sample at once instead of a per-row loop
    err = sigmoid(X @ w + b) - y                      #(m,)

    dj_dw = (X.T @ err) / m                           #(n,)
    dj_db = np.sum(err) / m                           #scalar

    return dj_db, dj_dw  

def compute_cost_logistic(X, y, w, b):
    """Mean cross-entropy cost of logistic regression (no regularization).

    Parameters
    ----------
    X : array of shape (m, n) -- feature matrix
    y : array of shape (m,)   -- targets (0 or 1)
    w : array of shape (n,)   -- weights
    b : scalar                -- bias

    Returns
    -------
    cost : scalar
    """
    y = np.asarray(y, dtype=float)
    m = len(y)
    z = np.dot(X, w) + b

    # -log(sigmoid(z)) == logaddexp(0, -z) and -log(1 - sigmoid(z)) == logaddexp(0, z).
    # This avoids 0 * log(0) = nan when a prediction saturates to exactly 0 or 1.
    cost = np.sum(y * np.logaddexp(0.0, -z) + (1.0 - y) * np.logaddexp(0.0, z)) / m
    return cost

def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Parameters
    ----------
    z : scalar or array

    Returns
    -------
    Value(s) in [0, 1], same shape as z.
    """
    # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))) == exp(-logaddexp(0, -z)).
    # logaddexp never forms exp(-z) directly, so large negative z does not
    # overflow.
    return np.exp(-np.logaddexp(0.0, -z))

In [9]:
def gradient_descent(X, y, w_in, b_in, alpha, num_iters): 
    """Run batch gradient descent on the unregularized logistic cost.

    Parameters
    ----------
    X, y      : training features and targets, passed through to
                compute_gradient_logistic / compute_cost_logistic
    w_in      : initial weights (deep-copied, never modified)
    b_in      : initial bias
    alpha     : learning rate
    num_iters : number of update steps

    Returns
    -------
    w, b      : parameters after the last update
    J_history : list with the cost after each update (first 100000 steps only)
    """
    w, b = copy.deepcopy(w_in), b_in   # work on a private copy of the weights
    J_history = []                     # cost after every update, for plotting

    # Progress is reported about ten times over the whole run
    report_interval = math.ceil(num_iters / 10)

    for i in range(num_iters):
        # Gradient at the current parameters
        grad_b, grad_w = compute_gradient_logistic(X, y, w, b)

        # Step against the gradient
        w = w - alpha * grad_w
        b = b - alpha * grad_b

        # Record the cost, capped so the history cannot grow without bound
        if i < 100000:
            J_history.append(compute_cost_logistic(X, y, w, b))

        if i % report_interval == 0:
            print(f"Iteration {i:4d}: Cost {J_history[-1]}   ")

    return w, b, J_history

In [None]:
import math
import copy

# Step size and number of update steps
alpha = 0.0001
num_iters = 1000

# Start from all-zero weights (one per feature column) and a zero bias
initial_w = np.zeros(X_train.shape[1])
initial_b = 0

# NumPy versions of the training split for the hand-written routines
X_train_np, y_train_np = X_train.values, y_train.values

# Run unregularized gradient descent on the training split
final_w, final_b, J_history = gradient_descent(
    X_train_np, y_train_np, initial_w, initial_b, alpha, num_iters
)

# Show the learned parameters
print("Final weights:", final_w)
print("Final bias:", final_b)

# Classify the test rows: predict 1 when the sigmoid output exceeds 0.5
test_scores = sigmoid(np.dot(X_test.values, final_w) + final_b)
y_pred = (test_scores > 0.5).astype(int)

# Hold-out metrics
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("\nGradient Descent Logistic Regression Model - Accuracy: ", accuracy)
print("Gradient Descent Logistic Regression Model - Confusion Matrix:\n", confusion)
print("Gradient Descent Logistic Regression Model - Classification Report:\n", report)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Hold out 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline logistic regression with a high iteration cap
logreg = LogisticRegression(max_iter=50000, random_state=42)
logreg.fit(X_train, y_train)

# Hold-out predictions and accuracy of the baseline
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Per-class precision / recall / F1 of the baseline
print(classification_report(y_test, y_pred))

# ROC AUC from the second column of predict_proba
y_prob = logreg.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print(f'ROC AUC Score: {roc_auc:.2f}')

# Search space: inverse regularization strength, penalty type and solver
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)

# 5-fold cross-validated grid search scored by accuracy
grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)

# The search is fitted on the training split only
grid_search.fit(X_train, y_train)

# Winning combination and its mean cross-validated accuracy
print(f'Best Hyperparameters: {grid_search.best_params_}')
print(f'Best Accuracy: {grid_search.best_score_ * 100:.2f}%')

# Score the refitted best estimator on the held-out rows
best_logreg = grid_search.best_estimator_
y_pred_best = best_logreg.predict(X_test)

accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Best Model Accuracy: {accuracy_best * 100:.2f}%')

# Per-class precision / recall / F1 of the best model
print(classification_report(y_test, y_pred_best))

# ROC AUC of the best model, again from the second predict_proba column
y_prob_best = best_logreg.predict_proba(X_test)[:, 1]
roc_auc_best = roc_auc_score(y_test, y_prob_best)
print(f'Best Model ROC AUC Score: {roc_auc_best:.2f}')

### Regularization

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# X and y come from the earlier cells; hold out 20% for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic regression with an L1 (lasso) penalty.
# C is the inverse regularization strength: smaller C means a stronger penalty.
logreg_l1 = LogisticRegression(C=1.0, penalty='l1', solver='liblinear', random_state=42)
logreg_l1.fit(X_train, y_train)

# Predict the held-out rows
y_pred_l1 = logreg_l1.predict(X_test)

# Hold-out accuracy
accuracy_l1 = accuracy_score(y_test, y_pred_l1)
print("Accuracy with L1 regularization:", accuracy_l1)


In [None]:
# Logistic regression with an L2 (ridge) penalty.
# C is the inverse regularization strength: smaller C means a stronger penalty.
logreg_l2 = LogisticRegression(C=1.0, penalty='l2', random_state=42)
logreg_l2.fit(X_train, y_train)

# Predict the held-out rows
y_pred_l2 = logreg_l2.predict(X_test)

# Hold-out accuracy
accuracy_l2 = accuracy_score(y_test, y_pred_l2)
print("Accuracy with L2 regularization:", accuracy_l2)


# Using scikit-learn (Best)

### Using balanced class weight because the classes are imbalanced

In [12]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the labelled dataset and discard the 'Id' and 'Time' columns
data = pd.read_csv(r"..Arrhythmia\Data\heartrate_final.csv").drop(columns=["Id", 'Time'])

# Feature matrix (X) and target vector (y)
X = data[["Bpm", "Sleep", "Intensity"]]
y = data["Arrhythmia"]

# Stratified 80/20 split (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Logistic regression with balanced class weights, fitted on the training split
lr_model = LogisticRegression(class_weight='balanced')
lr_model.fit(X_train, y_train)

# 5-fold cross-validation over the full feature/target set
cv_scores = cross_val_score(lr_model, X, y, cv=5)

# Predict the held-out rows
y_pred = lr_model.predict(X_test)

# Hold-out accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Counts of true vs. predicted classes
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Per-fold accuracies and their mean
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Accuracy Score: 0.9351118709135625
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.92      0.95     49568
           1       0.81      0.97      0.88     17116

    accuracy                           0.94     66684
   macro avg       0.90      0.95      0.92     66684
weighted avg       0.94      0.94      0.94     66684

Confusion Matrix:
[[45732  3836]
 [  491 16625]]
Cross-Validation Scores: [0.97242217 0.96264471 0.880781   0.97024774 0.95150261]
Mean CV Accuracy: 0.9475196448923281


In [12]:
import pickle

# Destination of the serialized model
file_path = r'..Webpage\model\Ha_lr.pickle'

# Serialize lr_model and write the bytes to file_path
with open(file_path, 'wb') as f:
    f.write(pickle.dumps(lr_model))


In [13]:
import pandas as pd

# One-row frame with the three feature columns: 58 bpm, Sleep = 0, Intensity = 0
new_data = pd.DataFrame([{"Bpm": 58, "Sleep": 0, "Intensity": 0}])

# Classify the sample with the fitted logistic regression model
predictions = lr_model.predict(new_data)

# Show the predicted class label
print("Predicted class for the new data:", predictions)


Predicted class for the new data: [1]
