In [4]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_classif, f_regression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PowerTransformer, StandardScaler
from sklearn.utils.multiclass import type_of_target

# Load the healthcare dataset 
file_path = "health_dataset.csv" 
df = pd.read_csv(file_path)

# Display first few rows
display(df.head())

# Identify non-numeric columns
categorical_columns = df.select_dtypes(include=['object']).columns
print("Categorical Columns:", categorical_columns)

# Convert categorical columns using Label Encoding
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Ensure all columns are numeric
df = df.apply(pd.to_numeric, errors='coerce')  # Converts non-numeric values to NaN
df.fillna(0, inplace=True)  # Replace NaNs with 0

# Separate features and target
target_column = 'Disease Risk Score'
if target_column not in df.columns:
    raise ValueError("Target column not found in dataset")

X = df.drop(columns=[target_column])
y = df[target_column]

# Check for missing values in target and encode if necessary
if y.isnull().sum() > 0:
    y.fillna(y.mode()[0], inplace=True)  # Replace NaN with most frequent value

if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)

# Remove constant features
var_thresh = VarianceThreshold(threshold=0)
X = var_thresh.fit_transform(X)

# Feature Selection using ANOVA F-score
selector = SelectKBest(score_func=f_classif, k=min(10, X.shape[1]))  # Ensure k does not exceed feature count
X_selected = selector.fit_transform(X, y)
selected_features = [col for col, keep in zip(df.drop(columns=[target_column]).columns, selector.get_support()) if keep]
print("Selected Features:", selected_features)

# Normalization using Min-Max Scaling
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X_selected)

# Transformation using Power Transform (Box-Cox or Yeo-Johnson)
power_transformer = PowerTransformer(method='yeo-johnson')  # Use 'box-cox' if no negative values
X_transformed = power_transformer.fit_transform(X_normalized)

# Dimensionality Reduction using PCA
pca_components = min(5, X_transformed.shape[1])  # Ensure PCA components do not exceed feature count
pca = PCA(n_components=pca_components)
X_pca = pca.fit_transform(X_transformed)
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

# Convert processed data back to DataFrame
processed_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])
processed_df['Disease Risk Score'] = y.reset_index(drop=True)

# Save processed dataset
processed_df.to_csv("processed_health_dataset.csv", index=False)
print("Processed dataset saved successfully.")


Unnamed: 0,ID,Age,Height (cm),Weight (kg),Blood Pressure (BP),Cholesterol Level,Diabetes,Physical Activity (hours/week),Smoking Habit,Disease Risk Score,Unnamed: 10
0,1,56,159,77,122,High,1,1.0,0,59.0,
1,2,69,185,77,157,Normal,0,0.53,1,48.0,
2,3,46,163,93,122,Low,1,9.59,0,42.05,
3,4,32,180,79,103,Normal,1,8.47,0,78.47,
4,5,60,197,111,110,Normal,1,3.55,1,63.94,


Categorical Columns: Index(['Cholesterol Level'], dtype='object')
Selected Features: ['ID', 'Age', 'Height (cm)', 'Weight (kg)', 'Blood Pressure (BP)', 'Cholesterol Level', 'Diabetes', 'Physical Activity (hours/week)', 'Smoking Habit']
Explained Variance Ratio: [0.16779905 0.13859449 0.11963605 0.1154552  0.11072075]
Processed dataset saved successfully.


  msw = sswn / float(dfwn)
