In [4]:
import pandas as pd
import os
import shutil

# Directory names for the raw input CSVs and for the cleaned output CSVs.
# Both are relative paths, so they resolve against the current working directory.
RAW_FOLDER = 'raw'
PREP_FOLDER = 'cleaned'

# Create the output folder (PREP_FOLDER) up front; exist_ok avoids an error on re-runs
os.makedirs(PREP_FOLDER, exist_ok=True)

def copy_raw_to_prep(filename):
    """Write a de-duplicated copy of a raw CSV file into the prep folder.

    The file is read from RAW_FOLDER, rows that are exact duplicates across
    all columns are dropped, and the result is written under the same file
    name into PREP_FOLDER without the DataFrame index.
    """
    source_path = os.path.join(RAW_FOLDER, filename)
    target_path = os.path.join(PREP_FOLDER, filename)

    # Load the raw table and drop rows that repeat an earlier row in full
    # (narrow this to a column subset if only some columns define identity)
    deduplicated = pd.read_csv(source_path).drop_duplicates()

    # Persist the de-duplicated table next to the other prepared files
    deduplicated.to_csv(target_path, index=False)
    print(f"{filename} copied from raw to prep with duplicates removed.")

def clean_csv(input_file, output_file, clean_function):
    """Clean a CSV file row by row and write the surviving rows to a new CSV.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path the cleaned CSV is written to (index not included).
        clean_function: Callable invoked once per row with that row as a
            pandas Series. It returns the (possibly modified) row to keep,
            or None to drop the row.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_file)

    # Apply the cleaning function to each row and keep only the rows it did
    # not reject. A blanket DataFrame.dropna() is deliberately avoided here:
    # it would also discard kept rows that merely have a missing value in
    # some unrelated column.
    results = (clean_function(row) for _, row in df.iterrows())
    kept_rows = [result for result in results if result is not None]

    if kept_rows:
        # Rows come back as object-dtype Series; infer_objects restores
        # numeric column dtypes so numbers are not written as generic objects.
        cleaned_df = pd.DataFrame(kept_rows).reset_index(drop=True).infer_objects()
    else:
        # Nothing survived: still emit the header so the output keeps its schema.
        cleaned_df = df.iloc[0:0]

    # Write the cleaned DataFrame to the output file
    cleaned_df.to_csv(output_file, index=False)
    print(f"Cleaned data saved to {output_file}.")

def clean_certificates_row(row):
    """Keep a certificates row only when both of its key fields are present.

    Returns the row unchanged when neither 'user_id' nor 'assignment_id' is
    missing, otherwise None so the caller can skip the row.
    """
    # Guard clause: a missing key field disqualifies the row
    if pd.isna(row['user_id']) or pd.isna(row['assignment_id']):
        return None
    return row

def clean_course_assignments_row(row):
    """Clean one course assignments row.

    Args:
        row: A row (pandas Series) that contains at least 'user_id' and
            'course_id'.

    Returns:
        A copy of the row in which missing or falsy 'quiz_score',
        'engagement_score', 'assignment_grade' and 'progress' values are
        replaced by 0, or None when 'user_id' or 'course_id' is missing.
    """
    if pd.isna(row['user_id']) or pd.isna(row['course_id']):
        return None

    # Work on a copy so the caller's row object is never mutated.
    row = row.copy()

    # Set default values for null fields
    for column in ('quiz_score', 'engagement_score', 'assignment_grade', 'progress'):
        value = row.get(column, 0)
        # NaN is truthy, so `value or 0` on its own would leave missing
        # values untouched; test for missing explicitly first.
        row[column] = 0 if pd.isna(value) else (value or 0)
    return row

def clean_performance_ratings_row(row):
    """Clean one performance ratings row.

    Args:
        row: A row (pandas Series) that contains at least 'user_id' and
            'assignment_id'.

    Returns:
        A copy of the row in which a missing or falsy 'rating' is replaced
        by 0, or None when 'user_id' or 'assignment_id' is missing.
    """
    if pd.isna(row['user_id']) or pd.isna(row['assignment_id']):
        return None

    # Work on a copy so the caller's row object is never mutated.
    row = row.copy()

    # Set default value for null rating. NaN is truthy, so `rating or 0`
    # alone would keep it; test for missing explicitly first.
    rating = row.get('rating', 0)
    row['rating'] = 0 if pd.isna(rating) else (rating or 0)
    return row

def clean_performance_summaries_row(row):
    """Clean one performance summaries row.

    Args:
        row: A row (pandas Series) that contains at least 'user_id' and
            'learning_path_id'.

    Returns:
        A copy of the row in which a missing or falsy 'average_rating' is
        replaced by 0, or None when 'user_id' or 'learning_path_id' is
        missing.
    """
    if pd.isna(row['user_id']) or pd.isna(row['learning_path_id']):
        return None

    # Work on a copy so the caller's row object is never mutated.
    row = row.copy()

    # Set default value for null average_rating. NaN is truthy, so
    # `average or 0` alone would keep it; test for missing explicitly first.
    average = row.get('average_rating', 0)
    row['average_rating'] = 0 if pd.isna(average) else (average or 0)
    return row

# Row-level cleaning jobs, run in order: (raw file, cleaned file, row cleaner).
# Each raw file is read from RAW_FOLDER and its cleaned version written to PREP_FOLDER.
for raw_name, cleaned_name, row_cleaner in (
    ('certificates.csv', 'certificates_cleaned.csv', clean_certificates_row),
    ('course_assignments.csv', 'course_assignments_cleaned.csv', clean_course_assignments_row),
    ('performance_ratings.csv', 'performance_ratings_cleaned.csv', clean_performance_ratings_row),
    ('performance_summaries.csv', 'performance_summaries_cleaned.csv', clean_performance_summaries_row),
):
    clean_csv(
        os.path.join(RAW_FOLDER, raw_name),
        os.path.join(PREP_FOLDER, cleaned_name),
        row_cleaner,
    )

# These tables need no row-level cleaning; they are copied with exact duplicate rows removed
for table_file in (
    'users.csv',
    'courses.csv',
    'learning_paths.csv',
    'course_learning_paths.csv',
):
    copy_raw_to_prep(table_file)

print("Data cleaning and copying completed. Files saved to the 'prep' folder.")


Cleaned data saved to cleaned\certificates_cleaned.csv.
Cleaned data saved to cleaned\course_assignments_cleaned.csv.
Cleaned data saved to cleaned\performance_ratings_cleaned.csv.
Cleaned data saved to cleaned\performance_summaries_cleaned.csv.
users.csv copied from raw to prep with duplicates removed.
courses.csv copied from raw to prep with duplicates removed.
learning_paths.csv copied from raw to prep with duplicates removed.
course_learning_paths.csv copied from raw to prep with duplicates removed.
Data cleaning and copying completed. Files saved to the 'prep' folder.
