In [2]:
import csv
import os
import shutil

# Folder names (relative to the current working directory): 'raw' holds the
# input CSVs, 'prep' receives the cleaned and copied output.
PREP_FOLDER = 'prep'
RAW_FOLDER = 'raw'

# Create the output folder up front; exist_ok keeps a re-run from failing
# when the folder is already there.
os.makedirs(name=PREP_FOLDER, exist_ok=True)

def copy_raw_to_prep(filename):
    """Copy ``filename`` unchanged from RAW_FOLDER into PREP_FOLDER.

    The file keeps its name in the destination folder, and a confirmation
    line is printed once the copy has finished.
    """
    # Same file name under both folders: (source path, destination path)
    source_path, target_path = (
        os.path.join(folder, filename) for folder in (RAW_FOLDER, PREP_FOLDER)
    )
    shutil.copy(source_path, target_path)
    print(f"{filename} copied from raw to prep.")

def clean_csv(input_file, output_file, clean_function, *, encoding=None):
    """Filter/transform a CSV file row by row and write the result.

    Args:
        input_file: Path of the CSV file to read. Its first line is used as
            the header.
        output_file: Path of the CSV file to write (overwritten if present).
        clean_function: Callable taking one row dict (column name -> value)
            and returning the row to write, or a falsy value (e.g. ``None``)
            to drop the row.
        encoding: Text encoding used for both files. ``None`` (the default)
            keeps the platform default used by ``open``.

    The output has the same columns, in the same order, as the input header.
    Keys a clean function adds that are not in the header are ignored; keys
    it removes are written as empty fields.
    """
    # newline='' on BOTH files, as the csv module requires: without it on the
    # reader, line breaks embedded in quoted fields get rewritten by
    # universal-newline translation.
    with open(input_file, mode='r', newline='', encoding=encoding) as infile, \
            open(output_file, mode='w', newline='', encoding=encoding) as outfile:
        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames
        if fieldnames is None:
            # Completely empty input: there is no header to copy, so leave the
            # (already truncated) output empty instead of crashing in
            # writeheader().
            return

        # extrasaction='ignore' drops any key that is not a header column,
        # including the None key DictReader uses for surplus fields.
        writer = csv.DictWriter(outfile, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        for row in reader:
            clean_row = clean_function(row)
            if clean_row:  # falsy result (None, empty dict) means "skip this row"
                writer.writerow(clean_row)

def clean_certificates_row(row):
    """Return ``row`` unchanged, or None when a required id is empty.

    A certificates row is kept only if both 'user_id' and 'assignment_id'
    hold a non-empty (truthy) value.
    """
    if not row['user_id'] or not row['assignment_id']:
        return None  # missing id -> caller should skip this row
    return row

def clean_course_assignments_row(row):
    """Validate a course-assignments row and fill in empty score fields.

    Returns None when 'user_id' or 'course_id' is empty. Otherwise the row is
    updated in place so that an empty 'quiz_score', 'engagement_score',
    'assignment_grade' or 'progress' becomes 0, and the same row is returned.
    """
    if not row['user_id'] or not row['course_id']:
        return None

    # Any empty (falsy) value in these columns defaults to 0
    for column in ('quiz_score', 'engagement_score', 'assignment_grade', 'progress'):
        row[column] = row[column] or 0
    return row

def clean_performance_ratings_row(row):
    """Validate a performance-ratings row and default an empty rating to 0.

    Returns None when 'user_id' or 'assignment_id' is empty; otherwise the
    row is updated in place and returned.
    """
    if not (row['user_id'] and row['assignment_id']):
        return None
    row['rating'] = row['rating'] or 0  # empty rating -> 0
    return row

def clean_performance_summaries_row(row):
    """Validate a performance-summaries row and default an empty average to 0.

    Returns None when 'user_id' or 'learning_path_id' is empty; otherwise the
    row is updated in place and returned.
    """
    if not (row['user_id'] and row['learning_path_id']):
        return None
    row['average_rating'] = row['average_rating'] or 0  # empty average -> 0
    return row

# Run each row cleaner over its raw CSV; the result is written to the 'prep'
# folder under the matching *_cleaned.csv name.
for _raw_name, _cleaned_name, _row_cleaner in (
    ('certificates.csv', 'certificates_cleaned.csv', clean_certificates_row),
    ('course_assignments.csv', 'course_assignments_cleaned.csv', clean_course_assignments_row),
    ('performance_ratings.csv', 'performance_ratings_cleaned.csv', clean_performance_ratings_row),
    ('performance_summaries.csv', 'performance_summaries_cleaned.csv', clean_performance_summaries_row),
):
    clean_csv(
        os.path.join(RAW_FOLDER, _raw_name),
        os.path.join(PREP_FOLDER, _cleaned_name),
        _row_cleaner,
    )

# The remaining CSVs need no cleaning and are copied across as-is
for _untouched_name in (
    'users.csv',
    'courses.csv',
    'learning_paths.csv',
    'course_learning_paths.csv',
):
    copy_raw_to_prep(_untouched_name)

print("Data cleaning and copying completed. Files saved to the 'prep' folder.")


users.csv copied from raw to prep.
courses.csv copied from raw to prep.
learning_paths.csv copied from raw to prep.
course_learning_paths.csv copied from raw to prep.
Data cleaning and copying completed. Files saved to the 'prep' folder.
