In [23]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

def preprocess_data(df, target_column):
    # Interpolate missing values for numeric columns
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_cols] = df[numeric_cols].interpolate(method='linear', limit_direction='forward', axis=0)
    
    # Fill any remaining NaN values after interpolation with mean
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    
    # Convert 'Value' column to numeric with coercion
    df['Value'] = pd.to_numeric(df['Value'], errors='coerce')
    
    # Drop rows with NaN values in 'Value' column
    df.dropna(subset=['Value'], inplace=True)
    
    # Encode categorical variables
    label_encoders = {}
    for column in df.select_dtypes(include=['object']).columns:
        label_encoders[column] = LabelEncoder()
        df[column] = label_encoders[column].fit_transform(df[column])
    
    # Replace infinite values with NaN, then fill with mean
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    
    # Check for NaN or infinite values before scaling
    if df[numeric_cols].isnull().values.any():
        print("NaN values found in numeric columns before scaling:")
        print(df[numeric_cols].isnull().sum())
    
    if np.isinf(df[numeric_cols].values).any():
        print("Infinite values found in numeric columns before scaling.")
    
    # Normalize/Standardize numerical features
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    
    # Replace any remaining NaN or infinite values with 0
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)
    
    # Check for NaN or infinite values after scaling
    if df[numeric_cols].isnull().values.any():
        print("NaN values found in the dataset after scaling:")
        print(df[numeric_cols].isnull().sum())
    
    if np.isinf(df[numeric_cols].values).any():
        print("Infinite values found in the dataset after scaling.")
    
    # Split data into training and testing sets
    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test

# Load the CSV file
file_path = 'F:/HealthMate AI_Project/data/raw/WHO-GHO/WHO.csv' 
df = pd.read_csv(file_path)

# Inspect the dataset
print(df.info())
print(df.describe())

# Define the target column
target_column = 'Value'  # Replace with the actual target column if different

# Preprocess the data
X_train, X_test, y_train, y_test = preprocess_data(df, target_column)

# Save preprocessed data
processed_dir = 'F:/HealthMate AI_Project/data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Save dataframes to CSV files
X_train.to_csv(os.path.join(processed_dir, 'X_train.csv'), index=False)
X_test.to_csv(os.path.join(processed_dir, 'X_test.csv'), index=False)
y_train.to_csv(os.path.join(processed_dir, 'y_train.csv'), index=False)
y_test.to_csv(os.path.join(processed_dir, 'y_test.csv'), index=False)

print("Data saved successfully.")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9333 entries, 0 to 9332
Data columns (total 34 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   IndicatorCode               9333 non-null   object 
 1   Indicator                   9333 non-null   object 
 2   ValueType                   9333 non-null   object 
 3   ParentLocationCode          9333 non-null   object 
 4   ParentLocation              9333 non-null   object 
 5   Location type               9333 non-null   object 
 6   SpatialDimValueCode         9333 non-null   object 
 7   Location                    9333 non-null   object 
 8   Period type                 9333 non-null   object 
 9   Period                      9333 non-null   int64  
 10  IsLatestYear                9333 non-null   bool   
 11  Dim1 type                   9333 non-null   object 
 12  Dim1                        9333 non-null   object 
 13  Dim1ValueCode               9333 

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Data saved successfully.
