In [17]:
# Import necessary libraries for data manipulation and file handling
import csv  # For handling CSV file operations
import glob  # For finding all matching file paths
from collections import defaultdict  # For dictionary-like objects with default values
from collections import Counter  # For counting hashable items efficiently

In [18]:
# Define the directory containing the input CSV files and the output file path
data_dir = "Group9/9"  # Path to the directory containing the data files
output_file = "Data/impc_data.csv"  # Path of the file where the cleaned data will be saved

# Collect every file with a .csv extension in the directory.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to keep the merge order, and therefore the row order of the
# output file, identical from run to run.
csv_files = sorted(glob.glob(data_dir + "/*.csv"))

In [19]:
# Placeholder stored whenever a value is absent or blank in a source file
MISSING_VALUE = "--"

# Maps each lowercased key (first column) to the list of values (second column)
# collected for that key across all input files, in the order they were read
combined_data = {}

# Iterate over all CSV files to merge and clean data
for file in csv_files:
    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation recommends for files passed to csv.reader
    with open(file, mode="r", newline="") as f:
        reader = csv.reader(f)
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line; there is no
                # key to record, so skip it rather than fail on row[0]
                continue

            key = row[0].lower()  # Lowercase the key for consistency across files

            # A row holding only a key has no second column; treat it the same
            # way as a blank value instead of failing on row[1]
            raw_value = row[1] if len(row) > 1 else ""
            if raw_value.strip():
                value = raw_value.lower()  # Lowercase the value for consistency
            else:
                value = MISSING_VALUE  # Assign the placeholder for missing values
                print(key + " " + file)  # Log the missing value for reference

            # Append the value to the list for this key, creating the list on first use
            combined_data.setdefault(key, []).append(value)

# Convert p-values from text to float. Entries that were recorded as missing keep
# the placeholder, because float("--") would raise ValueError and abort the cell.
# Any other non-numeric text still raises ValueError so bad data is not hidden.
if "pvalue" in combined_data:
    combined_data["pvalue"] = [
        v if v == MISSING_VALUE else float(v) for v in combined_data["pvalue"]
    ]
    # Uncomment the next lines to blank out values exceeding a threshold (e.g., 1.0);
    # the isinstance check skips the missing-value placeholder.
    # combined_data["pvalue"] = [
    #     None if isinstance(x, float) and x > 1.0 else x for x in combined_data["pvalue"]
    # ]

# Remove duplicate values from the list for each key. dict.fromkeys() keeps the
# first occurrence of every value in its original order, so the result is
# deterministic (a set would return the values in arbitrary order).
unique_combined_data = {
    key: list(dict.fromkeys(values)) for key, values in combined_data.items()
}

In [20]:
# Write the combined dictionary to a CSV file: one column per key, one row per
# position in the value lists.
# NOTE(review): this writes combined_data (duplicates kept), not
# unique_combined_data. De-duplicating each column independently would break
# row alignment, so this looks intentional; confirm.

# Validate before opening the output file: mode="w" truncates it immediately, so
# failing afterwards would destroy a previously written result.
if not combined_data:
    raise ValueError(
        "combined_data is empty; nothing to write to " + output_file
        + " (were any input CSV files found?)"
    )

# The header row: keys become the column names
header = list(combined_data.keys())

# The longest value list determines how many data rows are written
max_values = max(len(values) for values in combined_data.values())

with open(output_file, mode="w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)

    # Transpose the dictionary values to rows; keys with fewer values than the
    # longest list are padded with empty strings
    for i in range(max_values):
        row = [
            combined_data[key][i] if i < len(combined_data[key]) else ""
            for key in combined_data
        ]
        writer.writerow(row)