# PCA

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

def process_heart_disease_data(input_filename, output_filename_prefix, n_components=6):
    """
    Reduce a labelled dataset with PCA, then split it into training and test sets.

    Despite its name, this function is not specific to heart disease data: it
    works on any CSV that has a 'quality' label column (this notebook runs it on
    the processed wine quality file). Every other column is treated as a feature.

    Three CSV files are written as a side effect:
      * '{output_filename_prefix}.csv'        - all rows (principal components + label)
      * '{output_filename_prefix}_train.csv'  - the 90% training split
      * '{output_filename_prefix}_test.csv'   - the 10% test split

    Parameters:
    input_filename (str): The name of the CSV file containing the dataset.
    output_filename_prefix (str): The prefix for output CSV files containing the processed data.
    n_components: Passed straight to sklearn's PCA. Defaults to 6, the value that
        was previously hard-coded.

    Returns:
    tuple: (X_pca_df, y_df, X_train_pca, X_test_pca, y_train_pca, y_test_pca), where
        X_pca_df holds the principal components (columns 'PC1', 'PC2', ...) and
        y_df holds the 'quality' labels, both for the full dataset.
    """
    # Load the dataset
    dataset = pd.read_csv(input_filename)

    # Separate the label, then replace missing feature values with the column mean
    y = dataset['quality']
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X = imputer.fit_transform(dataset.drop(columns=['quality']))

    # Project the features onto the principal components.
    # NOTE(review): PCA is fitted on every row before the train/test split below,
    # so the test rows influence the components. Features are also not rescaled
    # here - presumably the input file is already standardised; confirm.
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)

    # Label the columns from the actual output width so the names always match
    # the number of components that PCA produced.
    pc_names = [f'PC{i + 1}' for i in range(X_pca.shape[1])]
    X_pca_df = pd.DataFrame(X_pca, columns=pc_names)

    # Reset the index so the labels line up positionally with X_pca_df
    y_df = y.to_frame().reset_index(drop=True)

    # Save the full PCA-transformed dataset (components + label)
    combined_df = pd.concat([X_pca_df, y_df], axis=1)
    combined_df.to_csv(f'{output_filename_prefix}.csv', index=False)

    # Hold out 10% of the rows as a test set (fixed seed for reproducibility)
    X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
        X_pca_df, y_df, test_size=0.1, random_state=42
    )

    # Save the training set and test set as CSV files. Features and labels of
    # each split share the same index, so concat pairs them row by row.
    train_pca_df = pd.concat([X_train_pca, y_train_pca], axis=1)
    test_pca_df = pd.concat([X_test_pca, y_test_pca], axis=1)
    train_pca_df.to_csv(f'{output_filename_prefix}_train.csv', index=False)
    test_pca_df.to_csv(f'{output_filename_prefix}_test.csv', index=False)

    return X_pca_df, y_df, X_train_pca, X_test_pca, y_train_pca, y_test_pca

In [2]:
# Run the PCA pipeline on the processed wine quality data; this also writes
# 'processed_wine_quality_pca.csv' plus its '_train' / '_test' companions.
(X_pca, y, X_train, X_test, y_train, y_test) = process_heart_disease_data(
    input_filename='processed_wine_quality.csv',
    output_filename_prefix='processed_wine_quality_pca',
)

# Feature Selection - Random forest feature importance evaluation

In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def feature_selection_and_split(input_filename, output_filename_prefix, n_estimators_num):
    """
    Keep the most important half of the features, ranked by a random forest, and
    split the result into training and test sets.

    The input CSV must contain a 'quality' label column; every other column is
    treated as a feature. Missing feature values are replaced by the column mean.

    Three CSV files are written as a side effect:
      * '{output_filename_prefix}.csv'        - all rows (selected features + label)
      * '{output_filename_prefix}_train.csv'  - the 90% training split
      * '{output_filename_prefix}_test.csv'   - the 10% test split

    Parameters:
    input_filename (str): The name of the CSV file containing the dataset.
    output_filename_prefix (str): The prefix for output CSV files containing the processed data.
    n_estimators_num (int): Number of trees in the RandomForestClassifier used to
        score feature importance.

    Returns:
    tuple: (X_selected_df, y_df, X_train_selected, X_test_selected, y_train_selected,
        y_test_selected). The selected columns are named 'Feature_1', 'Feature_2', ...
        in decreasing order of importance; the original column names are not kept.
    """
    # Load the dataset
    dataset = pd.read_csv(input_filename)

    # Handle missing values
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X = imputer.fit_transform(dataset.drop(columns=['quality']))
    y = dataset['quality']

    # Train a random forest model to obtain feature importance.
    # NOTE(review): the forest is fitted on every row before the train/test split
    # below, so the test rows influence which features are selected.
    rf = RandomForestClassifier(n_estimators=n_estimators_num, random_state=42)
    rf.fit(X, y)

    # Column positions ordered from most to least important
    ranked = np.argsort(rf.feature_importances_)[::-1]

    # Keep the top half of the features, but never fewer than one: with a single
    # feature the integer division would otherwise select nothing.
    num_features = max(1, len(ranked) // 2)
    X_selected = X[:, ranked[:num_features]]

    # Create DataFrames for the selected features and labels
    feature_names = [f'Feature_{i + 1}' for i in range(num_features)]
    X_selected_df = pd.DataFrame(X_selected, columns=feature_names)
    # Reset the index so the labels line up positionally with X_selected_df
    y_df = y.to_frame().reset_index(drop=True)

    # Save the full reduced dataset (selected features + label)
    combined_df = pd.concat([X_selected_df, y_df], axis=1)
    combined_df.to_csv(f'{output_filename_prefix}.csv', index=False)

    # Hold out 10% of the rows as a test set (fixed seed for reproducibility)
    X_train_selected, X_test_selected, y_train_selected, y_test_selected = train_test_split(
        X_selected_df, y_df, test_size=0.1, random_state=42
    )

    # Save the training and test sets as CSV files. Features and labels of each
    # split share the same index, so concat pairs them row by row.
    train_selected_df = pd.concat([X_train_selected, y_train_selected], axis=1)
    test_selected_df = pd.concat([X_test_selected, y_test_selected], axis=1)
    train_selected_df.to_csv(f'{output_filename_prefix}_train.csv', index=False)
    test_selected_df.to_csv(f'{output_filename_prefix}_test.csv', index=False)

    return X_selected_df, y_df, X_train_selected, X_test_selected, y_train_selected, y_test_selected

In [4]:
# Run random-forest feature selection (250 trees) on the processed wine quality
# data; this also writes 'processed_wine_quality_feature_selected.csv' plus its
# '_train' / '_test' companions.
# NOTE: this rebinds y, X_train, X_test, y_train and y_test, replacing the
# values assigned by the PCA cell above.
(X_selected, y, X_train, X_test, y_train, y_test) = feature_selection_and_split(
    input_filename='processed_wine_quality.csv',
    output_filename_prefix='processed_wine_quality_feature_selected',
    n_estimators_num=250,
)