In [None]:
from google.colab import drive
import os
import warnings
warnings.filterwarnings('ignore')

def mount_google_drive(drive_path='/content/drive', dataset_rel_path='MyDrive/your_dataset.csv'):
    """
    Mounts Google Drive and verifies the dataset file exists.

    Parameters:
        drive_path (str): Path where Google Drive will be mounted (default: /content/drive)
        dataset_rel_path (str): Location of the dataset relative to drive_path.
            The default is a placeholder; pass the real location
            (e.g. 'MyDrive/archive.zip') instead of editing this function.

    Returns:
        bool: True if mounted successfully and dataset exists, False otherwise
    """
    try:
        print("Mounting Google Drive...")
        # force_remount=True asks Colab to redo the mount even if one already exists.
        drive.mount(drive_path, force_remount=True)

        # Guard clause: the mount call returned but the mount point is missing.
        if not os.path.exists(drive_path):
            print(f"Error: Drive mount path {drive_path} does not exist")
            return False
        print(f"Google Drive mounted successfully at {drive_path}")

        dataset_path = os.path.join(drive_path, dataset_rel_path)
        if not os.path.exists(dataset_path):
            print(f"Error: Dataset not found at {dataset_path}")
            return False

        print(f"Dataset found at {dataset_path}")
        return True

    except Exception as e:
        # Best-effort helper: any failure is reported and signalled via the return value.
        print(f"Error during Google Drive mounting: {str(e)}")
        return False

def list_drive_contents(drive_path='/content/drive/MyDrive', max_files=10):
    """
    Lists contents of a directory in Google Drive to help locate your dataset.

    Entries are printed in sorted order. When the directory holds more entries
    than max_files, the number of entries left out is reported as well.

    Parameters:
        drive_path (str): Path to list contents from
        max_files (int): Maximum number of files/folders to display.
            None shows everything; values below zero are treated as zero.
    """
    try:
        if not os.path.exists(drive_path):
            print(f"Error: Directory {drive_path} does not exist")
            return

        print(f"\nListing contents of {drive_path}:")
        # os.listdir order is arbitrary; sort so the output is reproducible.
        entries = sorted(os.listdir(drive_path))

        # Decide emptiness on the full listing, not on the truncated one,
        # so max_files=0 is not reported as an empty directory.
        if not entries:
            print("Directory is empty")
            return

        limit = len(entries) if max_files is None else max(max_files, 0)
        for name in entries[:limit]:
            print(f" - {name}")

        hidden = len(entries) - limit
        if hidden > 0:
            print(f" ... and {hidden} more item(s) not shown")

    except Exception as e:
        # Best-effort helper: report the problem instead of raising.
        print(f"Error listing directory contents: {str(e)}")

if __name__ == "__main__":
    # The listing is only useful once the Drive mount and dataset check succeed.
    success = mount_google_drive()

    if not success:
        print("Failed to mount Google Drive or locate dataset. Please check your setup.")
    else:
        # Show what is in MyDrive so the dataset can be located
        list_drive_contents()

In [None]:
from google.colab import drive
# Plain mount of Google Drive at /content/drive. Unlike mount_google_drive()
# in the previous cell, this passes no force_remount flag and performs no
# dataset existence check.
drive.mount('/content/drive')

Data Cleaning

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def clean_data(file_path, output_path='cleaned_dataset.csv'):
    """
    Cleans a raw dataset and saves the result as a CSV file.

    Steps, in order: fill missing numeric values with the column median,
    drop duplicate rows, one-hot encode categorical feature columns, cast
    everything to float, and replace infinite values with the mean of the
    finite values in the same column.

    Parameters:
        file_path (str): Path to the raw dataset. Anything pandas.read_csv
            accepts, including a .zip archive that holds a single CSV file.
        output_path (str): Path the cleaned dataset is written to

    Returns:
        pandas.DataFrame: The cleaned dataset, or None if an error occurred
            (the error is printed, not raised)
    """
    try:
        # Load dataset from the path the caller supplied
        df = pd.read_csv(file_path)

        # Check if dataset is empty
        if df.empty:
            raise ValueError("Dataset is empty")

        # Fail fast: check for the target before any transformation can
        # rename or encode it away.
        if 'target' not in df.columns:
            raise ValueError("Target column 'target' not found in dataset")

        # Display initial info (df.info() prints by itself and returns None)
        print("Initial Dataset Info:")
        df.info()
        print("\nInitial Dataset Description:")
        print(df.describe())

        # Handle missing values
        if df.isnull().values.any():
            print("Missing values detected. Filling with median for numerical columns...")
            df = df.fillna(df.median(numeric_only=True))

        # Remove duplicates
        duplicates = df.duplicated().sum()
        if duplicates > 0:
            print(f"Removing {duplicates} duplicate rows...")
            df = df.drop_duplicates()

        # Convert categorical feature columns to numeric using one-hot encoding.
        # The target is excluded so it keeps its name.
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        categorical_cols = categorical_cols.drop('target', errors='ignore')
        if len(categorical_cols) > 0:
            print("Encoding categorical columns...")
            df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

        # Ensure all data is numeric
        df = df.astype(float)

        # Handle infinite values. The infinities are turned into NaN *before*
        # the mean is taken; otherwise the mean itself would be inf/NaN and
        # the replacement would be a no-op.
        if np.isinf(df.to_numpy()).any():
            print("Infinite values detected. Replacing with column mean...")
            df = df.replace([np.inf, -np.inf], np.nan)
            df = df.fillna(df.mean())

        # Save cleaned dataset
        df.to_csv(output_path, index=False)
        print(f"\nData Cleaning Completed! Cleaned dataset saved to {output_path}")
        return df

    except FileNotFoundError:
        print(f"Error: File {file_path} not found")
        return None
    except Exception as e:
        print(f"Error during data cleaning: {str(e)}")
        return None

if __name__ == "__main__":
    # The function now honours file_path, so pass the archive this cell has
    # always loaded rather than a placeholder name.
    file_path = '/content/drive/MyDrive/archive.zip'  # Replace with your dataset file path
    clean_data(file_path)

Data Normalization

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

def normalize_data(file_path, output_path='normalized_dataset.csv'):
    """
    Standardizes the feature columns of a dataset and saves the result.

    Every column except 'target' is scaled with StandardScaler; the target
    column is appended unchanged as the last column.

    Parameters:
        file_path (str): Path to the input dataset (e.g., the cleaned dataset)
        output_path (str): Path the normalized dataset is written to

    Returns:
        tuple: (pandas.DataFrame with scaled features and target, fitted
            StandardScaler), or (None, None) if an error occurred
            (the error is printed, not raised)
    """
    try:
        # Load the dataset the caller asked for
        df = pd.read_csv(file_path)

        # Check if dataset is empty
        if df.empty:
            raise ValueError("Dataset is empty")

        # Check if target column exists
        if 'target' not in df.columns:
            raise ValueError("Target column 'target' not found in dataset")

        # Separate features and target
        X = df.drop('target', axis=1)
        y = df['target']

        # Scale features
        scaler = StandardScaler()
        X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

        # Combine scaled features with target. X_scaled has a fresh 0..n-1
        # index, so the target index is reset to line the rows up.
        df_normalized = pd.concat([X_scaled, y.reset_index(drop=True)], axis=1)

        # Save normalized dataset
        df_normalized.to_csv(output_path, index=False)
        print(f"\nData Normalization Completed! Normalized dataset saved to {output_path}")
        return df_normalized, scaler

    except FileNotFoundError:
        print(f"Error: File {file_path} not found")
        return None, None
    except Exception as e:
        print(f"Error during data normalization: {str(e)}")
        return None, None

# Entry point for this cell: run the normalization step.
if __name__ == "__main__":
    # 'cleaned_dataset.csv' matches the default output_path of clean_data.
    file_path = 'cleaned_dataset.csv'  # Replace with your cleaned dataset file path
    normalize_data(file_path)

Data Visualization

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import zipfile
import warnings
warnings.filterwarnings('ignore')

def extract_zip(zip_path, extract_path='/content/extracted_data'):
    """
    Extracts a zip file to the specified directory and returns the path to the first .csv file
    contained in the archive.

    The search runs over the archive's own members (at any folder depth, in
    archive order), so files left in extract_path by an earlier run are never
    returned. The extension match is case-insensitive and hidden files such as
    macOS '._name.csv' resource forks are skipped.

    Parameters:
        zip_path (str): Path to the zip file
        extract_path (str): Directory to extract files to

    Returns:
        str: Path to the extracted .csv file, or None if not found or if
            extraction failed (the error is printed, not raised)
    """
    try:
        os.makedirs(extract_path, exist_ok=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Extract member by member: extract() returns the real on-disk
            # path of each member, which is what the caller needs.
            extracted_paths = [
                zip_ref.extract(member, extract_path)
                for member in zip_ref.infolist()
            ]
        print(f"Extracted zip file to {extract_path}")

        # Find the first .csv file that came out of this archive
        for path in extracted_paths:
            name = os.path.basename(path)
            if name.startswith('.') or not name.lower().endswith('.csv'):
                continue
            if os.path.isfile(path):
                return path
        print("Error: No .csv file found in the zip archive")
        return None

    except Exception as e:
        # Best-effort helper: callers test for None rather than catching.
        print(f"Error extracting zip file: {str(e)}")
        return None

def _save_and_show(save_dir, filename):
    """Saves the current matplotlib figure into save_dir, displays it and reports the file name."""
    plt.savefig(os.path.join(save_dir, filename))
    plt.show()  # Display plot
    print(f"Displayed and saved {filename}")


def visualize_data(file_path, save_dir='/content/drive/MyDrive/HeartDiseasePlots'):
    """
    Visualizes the heart disease dataset with various plots and displays them interactively.

    Plots produced (each saved as a PNG in save_dir and shown): correlation
    heatmap, pairplot of up to five features, target distribution, box plots
    of age/chol/thalach by target, and an age-vs-cholesterol scatter plot.
    The last two are skipped when the needed columns are absent.

    Parameters:
        file_path (str): Path to the dataset (.csv or .zip)
        save_dir (str): Directory to save plots

    Returns:
        None. Errors are printed, not raised.
    """
    try:
        # Check if file is a zip
        if file_path.endswith('.zip'):
            file_path = extract_zip(file_path)
            if file_path is None:
                return

        # Load dataset
        df = pd.read_csv(file_path)

        # Check if dataset is empty
        if df.empty:
            raise ValueError("Dataset is empty")

        # Check if target column exists
        if 'target' not in df.columns:
            raise ValueError("Target column 'target' not found in dataset")

        # Create directory for saving plots
        os.makedirs(save_dir, exist_ok=True)
        print(f"Saving plots to {save_dir}")

        # 1. Correlation Heatmap
        # Restrict to numeric columns: DataFrame.corr() raises on string
        # columns in recent pandas versions.
        plt.figure(figsize=(12, 8))
        numeric_df = df.select_dtypes(include='number')
        sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
        plt.title('Correlation Heatmap of Heart Disease Features')
        plt.tight_layout()
        _save_and_show(save_dir, 'correlation_heatmap.png')

        # 2. Pairplot for selected features (limit to 5 features + target)
        # Drop the target before slicing so it can never appear twice in the
        # selection, whatever its position in the file.
        feature_cols = [col for col in df.columns if col != 'target'][:5]
        selected_cols = feature_cols + ['target']
        sns.pairplot(df[selected_cols], hue='target', palette='coolwarm')
        plt.suptitle('Pairplot of Selected Features (Colored by Target)', y=1.02)
        _save_and_show(save_dir, 'pairplot.png')

        # 3. Distribution of Target
        plt.figure(figsize=(8, 6))
        sns.histplot(df['target'], kde=True, color='purple')
        plt.title('Distribution of Target (Heart Disease Risk)')
        plt.xlabel('Target Value')
        plt.tight_layout()
        _save_and_show(save_dir, 'target_distribution.png')

        # 4. Box Plot for Key Features by Target
        key_features = ['age', 'chol', 'thalach']  # Adjust based on your dataset
        valid_features = [col for col in key_features if col in df.columns]
        if valid_features:
            plt.figure(figsize=(12, 6))
            for i, feature in enumerate(valid_features, 1):
                plt.subplot(1, len(valid_features), i)
                sns.boxplot(x='target', y=feature, data=df, palette='coolwarm')
                plt.title(f'{feature} by Target')
            plt.tight_layout()
            _save_and_show(save_dir, 'boxplot_features.png')

        # 5. Scatter Plot for Two Key Features
        if 'age' in df.columns and 'chol' in df.columns:
            plt.figure(figsize=(8, 6))
            sns.scatterplot(x='age', y='chol', hue='target', size='target', data=df, palette='coolwarm')
            plt.title('Age vs Cholesterol (Colored by Target)')
            plt.tight_layout()
            _save_and_show(save_dir, 'scatter_age_chol.png')

        print("\nData Visualization Completed!")

    except FileNotFoundError:
        print(f"Error: File {file_path} not found")
    except Exception as e:
        print(f"Error during data visualization: {str(e)}")

# Entry point for this cell: run the visualization step.
if __name__ == "__main__":
    # Specify the path to your zip file or csv in Google Drive.
    # A path ending in '.zip' is handed to extract_zip by visualize_data
    # before the CSV is loaded.
    file_path = '/content/drive/MyDrive/archive.zip'  # Adjust to your file path
    visualize_data(file_path)

Feature Extraction

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def extract_features(file_path, output_path='selected_features_dataset.csv', threshold=0.1):
    """
    Extracts features based on correlation with the target variable.

    A feature is kept when the absolute value of its correlation with the
    'target' column is strictly greater than threshold.

    Parameters:
        file_path (str): Path to the input dataset (e.g., normalized dataset)
        output_path (str): Path to save the dataset with selected features
        threshold (float): Minimum absolute correlation threshold for feature selection

    Returns:
        pandas.DataFrame: Dataset with selected features and target, or None if error
            (the error is printed, not raised)
    """
    try:
        # Load normalized dataset
        print(f"Loading dataset from {file_path}")
        df = pd.read_csv(file_path)

        # Check if dataset is empty
        if df.empty:
            raise ValueError("Dataset is empty")

        # Display basic info for debugging (df.info() prints by itself and
        # returns None, so it must not be wrapped in print)
        print("Dataset Info:")
        df.info()

        # Check if target column exists
        if 'target' not in df.columns:
            raise ValueError("Target column 'target' not found in dataset. Available columns: " + str(df.columns.tolist()))

        # Separate features and target
        X = df.drop('target', axis=1)
        y = df['target']

        # Ensure all features are numeric. exclude='number' covers every
        # integer/float width, so e.g. int32 columns are not flagged.
        non_numeric_cols = X.select_dtypes(exclude='number').columns
        if len(non_numeric_cols) > 0:
            print(f"Warning: Non-numeric columns detected: {non_numeric_cols}. Converting to numeric where possible...")
            for col in non_numeric_cols:
                try:
                    X[col] = pd.to_numeric(X[col], errors='coerce')
                except (TypeError, ValueError) as err:
                    raise ValueError(f"Cannot convert column {col} to numeric") from err
            # Fill any NaNs from conversion with median
            X = X.fillna(X.median(numeric_only=True))

        # Check for infinite values. The infinities are turned into NaN
        # *before* the mean is taken; a mean computed with them still present
        # would itself be inf/NaN and the replacement would achieve nothing.
        if np.isinf(X.to_numpy(dtype=float)).any():
            print("Infinite values detected. Replacing with column mean...")
            X = X.replace([np.inf, -np.inf], np.nan)
            X = X.fillna(X.mean(numeric_only=True))

        # Calculate correlation with target
        print("Calculating correlations...")
        corr = pd.concat([X, y], axis=1).corr()['target'].abs().sort_values(ascending=False)
        corr = corr.drop('target', errors='ignore')  # Remove self-correlation

        # Select features above threshold (NaN correlations never qualify)
        selected_features = corr[corr > threshold].index.tolist()
        if not selected_features:
            raise ValueError(f"No features have correlation above threshold {threshold}. Try lowering the threshold.")

        # Create dataset with selected features
        X_selected = X[selected_features]
        df_selected = pd.concat([X_selected, y.reset_index(drop=True)], axis=1)

        # Save dataset with selected features
        df_selected.to_csv(output_path, index=False)
        print(f"\nSelected Features: {selected_features}")
        print(f"Feature Extraction Completed! Dataset saved to {output_path}")
        return df_selected

    except FileNotFoundError:
        print(f"Error: File {file_path} not found. Please verify the path.")
        return None
    except Exception as e:
        print(f"Error during feature extraction: {str(e)}")
        return None

# Entry point for this cell: run correlation-based feature selection.
if __name__ == "__main__":
    # NOTE(review): this path lies under extract_zip's default extract_path
    # ('/content/extracted_data'), so it presumably requires the archive to
    # have been extracted first (e.g. by the visualization cell) and to
    # contain a file named heart.csv — confirm.
    file_path = '/content/extracted_data/heart.csv'  # Path to your dataset
    extract_features(file_path)

Model Training and Evaluation using Logistic Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

def train_and_evaluate_model(file_path):
    """
    Trains and evaluates a Logistic Regression model for heart disease prediction.

    The data is split 80/20 (stratified on the target). Accuracy, the
    classification report, the confusion matrix, the model coefficients and a
    sample prediction for the first row are printed.

    Parameters:
        file_path (str): Path to the dataset with selected features

    Returns:
        LogisticRegression: Trained model, or None if error
            (the error is printed, not raised)
    """
    try:
        # Load dataset with selected features
        print(f"Loading dataset from {file_path}")
        df = pd.read_csv(file_path)

        # Check if dataset is empty
        if df.empty:
            raise ValueError("Dataset is empty")

        # Check if target column exists
        if 'target' not in df.columns:
            raise ValueError("Target column 'target' not found in dataset. Available columns: " + str(df.columns.tolist()))

        # Separate features and target
        X = df.drop('target', axis=1)
        y = df['target']

        # Ensure target is binary (0 or 1)
        if not y.isin([0, 1]).all():
            raise ValueError("Target column must contain binary values (0 or 1)")

        # Ensure all features are numeric. exclude='number' covers every
        # integer/float width, so e.g. int32 columns are not flagged.
        non_numeric_cols = X.select_dtypes(exclude='number').columns
        if len(non_numeric_cols) > 0:
            print(f"Warning: Non-numeric columns detected: {non_numeric_cols}. Converting to numeric...")
            for col in non_numeric_cols:
                try:
                    X[col] = pd.to_numeric(X[col], errors='coerce')
                except (TypeError, ValueError) as err:
                    raise ValueError(f"Cannot convert column {col} to numeric") from err

        # Impute missing values with the column median whether they came from
        # the conversion above or were already in the file; LogisticRegression
        # cannot be fitted on NaN.
        if X.isna().any().any():
            X = X.fillna(X.median(numeric_only=True))

        # Check for infinite values. The infinities are turned into NaN
        # *before* the mean is taken; a mean computed with them still present
        # would itself be inf/NaN.
        if np.isinf(X.to_numpy(dtype=float)).any():
            print("Infinite values detected. Replacing with column mean...")
            X = X.replace([np.inf, -np.inf], np.nan)
            X = X.fillna(X.mean(numeric_only=True))

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Train Logistic Regression model
        model = LogisticRegression(max_iter=1000, random_state=42)
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Evaluate model
        accuracy = accuracy_score(y_test, y_pred)
        print(f"\nModel Accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

        # Feature importance (coefficients)
        feature_importance = pd.DataFrame({
            'Feature': X.columns,
            'Coefficient': model.coef_[0]
        })
        print("\nFeature Importance (Logistic Regression Coefficients):")
        print(feature_importance.sort_values(by='Coefficient', ascending=False))

        # Example: Predict on a new sample.
        # X.iloc[[0]] keeps the row as a one-row DataFrame, so the model sees
        # the same feature names it was fitted with.
        sample = X.iloc[[0]]
        prediction = model.predict(sample)
        prob = model.predict_proba(sample)[0]
        print(f"\nSample Prediction (0 = No Disease, 1 = Disease): {prediction[0]}")
        print(f"Prediction Probabilities (No Disease, Disease): {prob[0]:.4f}, {prob[1]:.4f}")

        print("\nModel Training and Evaluation Completed!")
        return model

    except FileNotFoundError:
        print(f"Error: File {file_path} not found")
        return None
    except Exception as e:
        print(f"Error during model training/evaluation: {str(e)}")
        return None

# Entry point for this cell: train and evaluate the logistic regression model.
if __name__ == "__main__":
    # NOTE(review): extract_features writes to the relative default
    # 'selected_features_dataset.csv'; this absolute path presumably assumes
    # the working directory is /content — confirm.
    file_path = '/content/selected_features_dataset.csv'  # Path to your selected features dataset
    train_and_evaluate_model(file_path)