# Imports

In [399]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [400]:
import os
import sys
import logging
import traceback
import pandas as pd
import glob2 as glob
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
from typing import Optional

In [401]:
BASE_DIR = Path.cwd()

In [402]:
# Helper 
from src.Helper.merger import MERGER
from src.Helper.filters import ApplyPreProccessingTask

# Data Pre-Processing
from src.DataPreProcessing.DataCleaning import CLEANER
from src.DataPreProcessing.DataHandleMissingValues import HandleMissingValues
from src.DataPreProcessing.DataEncodingCategoricalVariables import EncodingCategorical
from src.DataPreProcessing.DataFeatureScaling import FeatureScaler
from src.DataPreProcessing.DataFeatureEngineering import FeatureEngineering
from src.DataPreProcessing.BalancingDataset import DatasetBalancer
from src.DataPreProcessing.DataHandlingOutlier import OutlierDetector
from src.DataPreProcessing.DataFeatureSelection import FeatureSelector

# Exploratory Data Analysis(EDA)
from src.EDA.eda import EDA

# Hypothesis Tests
from src.HypothesisTesting.HypothesisTests import HypothesisTests
from src.HypothesisTesting.ValidateTests import ValidateHypothesisTesting

# Intial Set-Up

In [403]:
# Set up logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Clear existing handlers
logger.handlers = []

# File handler
file_handler = logging.FileHandler("logs.txt")
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Console handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

# Disable propagation to avoid duplicate logs from parent loggers
logger.propagate = False

In [404]:
info = f"Searching for all csv files in Raw foler."
logger.info(info)
raw_csv_files = glob.glob("./src/Data/Raw/*.csv")

for index, file in enumerate(raw_csv_files):
    info = f"RAW FILE {str(index + 1).zfill(2)} {file}"
    logger.info(info)

2024-11-18 10:48:28,076 - __main__ - INFO - Searching for all csv files in Raw foler.
2024-11-18 10:48:28,078 - __main__ - INFO - RAW FILE 01 ./src/Data/Raw\heart_01.csv
2024-11-18 10:48:28,078 - __main__ - INFO - RAW FILE 02 ./src/Data/Raw\heart_02.csv
2024-11-18 10:48:28,080 - __main__ - INFO - RAW FILE 03 ./src/Data/Raw\heart_03.csv
2024-11-18 10:48:28,081 - __main__ - INFO - RAW FILE 04 ./src/Data/Raw\heart_04.csv


In [405]:
info = f"Loading all csv files."
logger.info(info)
merger = MERGER(logger=logger)

for index, file in enumerate(raw_csv_files):
    info = f"Loading file {file} to merger."
    logger.info(info)
    response = merger.load_csv_file(file)
    
    if response["success"]:
        info = f"File {file} to loaded successfully in MERGER."
        logger.info(info)
    else:
        info = f"Unable to load File-{file} in MERGER. Skipping..."
        logger.warning(info)

output_file = os.path.join(BASE_DIR, "src/Data/Merged/merged.csv")
info = f"Saving file to {output_file}"
logger.info(info)
response = merger.save_merged_data(output_file=output_file)

file = output_file

2024-11-18 10:48:28,860 - __main__ - INFO - Loading all csv files.
2024-11-18 10:48:28,862 - __main__ - INFO - Loading file ./src/Data/Raw\heart_01.csv to merger.
2024-11-18 10:48:28,874 - __main__ - INFO - [MERGER] Successfully loaded file: ./src/Data/Raw\heart_01.csv
2024-11-18 10:48:28,876 - __main__ - INFO - File ./src/Data/Raw\heart_01.csv to loaded successfully in MERGER.
2024-11-18 10:48:28,877 - __main__ - INFO - Loading file ./src/Data/Raw\heart_02.csv to merger.
2024-11-18 10:48:28,893 - __main__ - INFO - [MERGER] Successfully loaded file: ./src/Data/Raw\heart_02.csv
2024-11-18 10:48:28,894 - __main__ - INFO - File ./src/Data/Raw\heart_02.csv to loaded successfully in MERGER.
2024-11-18 10:48:28,895 - __main__ - INFO - Loading file ./src/Data/Raw\heart_03.csv to merger.
2024-11-18 10:48:28,909 - __main__ - INFO - [MERGER] Successfully loaded file: ./src/Data/Raw\heart_03.csv
2024-11-18 10:48:28,910 - __main__ - INFO - File ./src/Data/Raw\heart_03.csv to loaded successfully in

# Data Pre-Processing

## Cleaning

In [406]:
# Apply CLEANER
output_file = os.path.join(BASE_DIR, "src/Data/Cleaned/cleaned.csv")

ApplyPreProccessingTask(
    task = "Cleaning", 
    task_class = CLEANER, 
    sub_func = "clean_data", 
    sub_func_args = None, 
    input_file = file, 
    output_file = output_file, 
    logger = logger
)

file = output_file

## Handling Missing Values

In [None]:
# Apply HandleMissingValues
output_file = os.path.join(BASE_DIR, "src/Data/Processed/handlemissingvalues.csv")

ApplyPreProccessingTask(
    task = "Missing Values", 
    task_class = HandleMissingValues, 
    sub_func = "handle_missing", 
    sub_func_args = {
        'strategy': {
            'age': 'median',
            'trestbps': 'mean',
            'chol': 'mean',
            'thalach': 'mean',
            'oldpeak': 'mean',
            'cp': 'most_frequent',
            'restecg': 'most_frequent',
            'slope': 'most_frequent',
            'ca': 'most_frequent',
            'thal': 'most_frequent'
        },
        'threshold': 0.5  # Drop if more than 50% values are missing
    },
    input_file = file, 
    output_file = output_file, 
    logger = logger
)

file = output_file

## Encoding Categorical

In [None]:
# Apply EncodingCategorical
output_file = os.path.join(BASE_DIR, "src/Data/Processed/processedencodeddata.csv")

ApplyPreProccessingTask(
    task = "Encoding Categorical", 
    task_class = EncodingCategorical, 
    sub_func = "encode_categorical", 
    sub_func_args = {
        'label_encode': ['cp', 'restecg'],
        'onehot_encode': ['slope', 'ca', 'thal'],
        'drop_first': True
    },
    input_file = file, 
    output_file = output_file, 
    logger = logger
)

file = output_file

## Feature Scaler Handler

In [None]:
# Apply FeatureScaler handler
output_file = os.path.join(BASE_DIR, "src/Data/Processed/scaled_data.csv")

ApplyPreProccessingTask(
    task = "Feature Scaler Handler", 
    task_class = FeatureScaler, 
    sub_func = "fit_transform", 
    sub_func_args = {
        'method' : "normalize"
    },
    input_file = file, 
    output_file = output_file, 
    logger = logger
)

file = output_file

## Feature Engineering Handler

In [None]:
# Apply Feature Engineering handler
output_file = os.path.join(BASE_DIR, "src/Data/Processed/engineered_data.csv")

ApplyPreProccessingTask(
    task = "Feature Engineering Handler", 
    task_class = FeatureEngineering, 
    sub_func = "encode_categorical_features", 
    sub_func_args = {
        'encoding_method': "one_hot"
    },
    input_file = file, 
    output_file = output_file, 
    logger = logger
)

file = output_file

## Dataset Balancer

In [None]:
# Apply Dataset Balancer
output_file = os.path.join(BASE_DIR, "src/Data/Processed/balanced_data.csv")

ApplyPreProccessingTask(
    task = "Dataset Balancer", 
    task_class = DatasetBalancer, 
    sub_func = "balance_data", 
    sub_func_args = {
        'method': "smote"
    },
    input_file = file, 
    output_file = output_file, 
    logger = logger
)

file = output_file

## Outlier Detector & Removal Handler

In [None]:
# Apply Outlier Detector handler
output_file = os.path.join(BASE_DIR, "src/Data/Processed/outliers_removed.csv")

ApplyPreProccessingTask(
    task = "Outlier Detector & Removal Handler", 
    task_class = OutlierDetector, 
    sub_func = "remove_outliers", 
    sub_func_args = None,
    input_file = file, 
    output_file = output_file, 
    logger = logger
)

file = output_file

## Feature Selector Handler

In [None]:
# Apply Outlier Detector handler
output_file = os.path.join(BASE_DIR, "src/Data/Processed/selected_features.csv")

ApplyPreProccessingTask(
    task = "Feature Selector Handler", 
    task_class = FeatureSelector, 
    sub_func = "select_k_best", 
    sub_func_args = {
        'target': "target", 
        'k': 5
    },
    input_file = file, 
    output_file = output_file, 
    logger = logger
)

file = output_file

# Exploratory Data Analysis(EDA)

In [None]:
# Exploratory Data Analysis(EDA)
eda = EDA(logger=logger)
graphs_dir = os.path.join(BASE_DIR, "src/Graphs/")
eda.set_graphs_dir(graphs_dir)
eda.load_data(file)
eda.run_all()

In [None]:
# Load the uploaded dataset
file_path = './src/Data/Merged/merged.csv'
data = pd.read_csv(file_path)

# Display the first few rows and summary info
data.head(), data.info()

# Setting the aesthetic style of the plots
sns.set(style="whitegrid")

# Adjust the plot to avoid issues with mismatched tick labels
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle("Exploratory Data Analysis for Heart Disease Prediction Dataset")

# Plot 1: Distribution of 'age' and 'target' (presence of heart disease)
sns.histplot(data, x='age', hue='target', multiple='stack', bins=20, palette='viridis', ax=axes[0, 0])
axes[0, 0].set_title("Age Distribution by Heart Disease Status")

# Plot 2: Heart disease prevalence by sex
sns.countplot(data=data, x='sex', hue='target', palette='viridis', ax=axes[0, 1])
axes[0, 1].set_title("Heart Disease Status by Sex")

# Plot 3: Heart disease prevalence by chest pain type
sns.countplot(data=data, x='cp', hue='target', palette='viridis', ax=axes[1, 0])
axes[1, 0].set_title("Heart Disease Status by Chest Pain Type")

# Plot 4: Correlation heatmap for numerical features
corr_matrix = data[['age', 'trestbps', 'chol', 'fbs', 'thalach', 'oldpeak', 'ca', 'thal', 'target']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', square=True, ax=axes[1, 1])
axes[1, 1].set_title("Correlation Matrix for Numerical Features")

plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

# Hypothesis Testing

In [None]:
try:
    hypothesis_tests = HypothesisTests(logger=logger)
    response = hypothesis_tests.load_data(file=file)
    response = hypothesis_tests.run_all_tests()
    
    if response["success"]:
        validate_hypothesis_tests = ValidateHypothesisTesting(logger=logger, hypothesis_tests=hypothesis_tests)
        response = validate_hypothesis_tests.run_all_tests()
except Exception as e:
    logger.error(f"An unexpected error occured: Error: {e}")
    logger.error(traceback.print_exc())