# Importing the libraries

# In [9]:
import pandas as pd
import numpy as np
import os

# Cleaning the data

# In [None]:

def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a DataFrame.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv`` with default parsing options.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_csv``.
    """
    raw_frame = pd.read_csv(file_path)
    return raw_frame


# Clean the data by handling missing values and duplicates.
def clean_data(df):
    """Drop duplicate rows and fill missing values column by column.

    Numeric columns have their missing entries replaced by the column mean;
    object-dtype columns have them replaced by the most frequent value
    (the first one returned by ``Series.mode``). Columns whose fill value
    cannot be computed (every entry missing) are left as they are.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified.

    Returns:
        A new ``pandas.DataFrame`` without duplicate rows and with the
        missing values filled as described above.
    """
    # drop_duplicates returns a new frame, so the caller's data is untouched.
    df = df.drop_duplicates()

    # Collect one fill value per column and apply them in a single
    # DataFrame.fillna call. The previous ``df[col].fillna(..., inplace=True)``
    # operated on a temporary Series (chained assignment), which is a silent
    # no-op when pandas Copy-on-Write is active.
    fill_values = {}

    for column in df.select_dtypes(include=[np.number]).columns:
        if df[column].isna().any():
            column_mean = df[column].mean()
            # The mean of an all-missing column is NaN; nothing useful to fill.
            if pd.notna(column_mean):
                fill_values[column] = column_mean

    for column in df.select_dtypes(include=[object]).columns:
        modes = df[column].mode()
        # mode() is empty when every value is missing; ``modes[0]`` would raise.
        if not modes.empty:
            fill_values[column] = modes.iloc[0]

    if fill_values:
        df = df.fillna(value=fill_values)

    return df


# Normalize numerical features
def normalize_data(df):
    """Standardize every numeric column to zero mean and unit variance.

    Each numeric column is replaced by ``(x - mean) / std`` using pandas'
    default ``Series.std``. When the standard deviation is zero or undefined
    (constant column, single row), the column is only centered, so it is not
    turned into NaN by a division by zero.

    Args:
        df: ``pandas.DataFrame`` to normalize. Its numeric columns are
            overwritten in place.

    Returns:
        The same ``df`` object, after modification.
    """
    for column in df.select_dtypes(include=[np.number]).columns:
        values = df[column]
        centered = values - values.mean()
        spread = values.std()
        # Skip scaling when std is 0 or NaN; dividing would yield NaN/inf.
        if pd.notna(spread) and spread != 0:
            centered = centered / spread
        df[column] = centered
    return df

def transform_data(input_file_path, output_file_path):
    """Load a raw CSV, clean it, and write the result as a new CSV.

    The data is read with ``load_data``, passed through ``clean_data``, and
    written with ``DataFrame.to_csv(index=False)``. A confirmation message is
    printed once the file has been written.

    Args:
        input_file_path: Path of the raw CSV file to read.
        output_file_path: Destination for the cleaned CSV. When it is a
            filesystem path, missing parent directories are created.
    """
    # Load raw data
    df = load_data(input_file_path)

    # Clean the data
    df = clean_data(df)

    # Normalization is currently disabled.
    # df = normalize_data(df)

    # to_csv does not create directories; make sure the target folder exists.
    # Only path-like targets are inspected so other to_csv targets still work.
    if isinstance(output_file_path, (str, os.PathLike)):
        output_dir = os.path.dirname(output_file_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Save transformed data
    df.to_csv(output_file_path, index=False)
    print(f"Transformed data saved to {output_file_path}")

def main():
    """Run ``transform_data`` over every known raw dataset.

    Each file name is looked up in the raw-data directory and its cleaned
    counterpart is written to the processed-data directory under the same
    name prefixed with ``transformed_``. Both directories are relative paths.
    """
    raw_data_dir = '../../data/raw_data/'
    processed_data_dir = '../../data/processed_data/'

    # File names of the datasets to process, in processing order.
    dataset_names = (
        'employee_data.csv',
        'course_data.csv',
        'modules_data.csv',
        'enrollment_data.csv',
        'performance_metrics_data.csv',
        'enrollment_with_modules_data.csv',
    )

    for name in dataset_names:
        source_path = os.path.join(raw_data_dir, name)
        target_path = os.path.join(processed_data_dir, f'transformed_{name}')
        transform_data(source_path, target_path)

# Run the full pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
