In [1]:
from pathlib import Path

# Location of the raw pipe-delimited risk-factor extract inspected by this notebook.
file_path = Path("C:/Users/ProjectC4M/Documents/CPCSSN Datasets Care4Mind/New Extraction Feb 2025/extracted_data/C4MRiskFactor.csv")

# Check if the file exists and its size
file_found = file_path.exists()
print(f"File exists: {file_found}")

# Only touch the file when it is really there: stat()/open() on a missing path
# would raise right after we have already reported that it does not exist.
if file_found:
    print(f"File size (bytes): {file_path.stat().st_size}")

    # Open the file and print (at most) the first 10 lines.
    # zip() stops as soon as either side is exhausted, so a file shorter than
    # 10 lines no longer produces spurious empty lines after EOF.
    with open(file_path, "r", encoding="utf-8") as f:
        for _, line in zip(range(10), f):
            # Each line keeps its own trailing newline, so print() leaves a
            # blank line between records in the displayed preview.
            print(line)


File exists: True
File size (bytes): 135240508
"RiskFactor_ID"|"Network_ID"|"Site_ID"|"Patient_ID"|"Encounter_ID"|"Cycle_ID"|"StartDate"|"EndDate"|"Name_orig"|"Name_calc"|"Value_orig"|"Value_calc"|"Status_orig"|"Status_calc"|"Frequency"|"FrequencyType"|"FrequencyUnit"|"Duration"|"DurationType"|"DurationUnit"|"EndDuration"|"EndDurationType"|"EndDurationUnit"|"RiskDetails"|"DateCreated"

"1002000000000002"|"1"|"1000000000000004"|"1002000000016545"|""|"2015-Q2"|""|""|"Quit > 1 year"|"Smoking"|"0.5/day"|"Quit  more than  1 year ,0.5 per day "|"Past"|""|""|""|""|""|""|""|""|""|""|""|"1900-01-01 00:00:00"

"1002000000000004"|"1"|"1000000000000004"|"1002000000029743"|""|"2015-Q2"|""|""|"Non Smoker"|"Smoking"|"0.0/day"|"Non Smoker ,0.0 per day "|"Never"|""|""|""|""|""|""|""|""|""|""|""|"1900-01-01 00:00:00"

"1002000000000005"|"1"|"1000000000000004"|"1002000000003382"|""|"2015-Q2"|""|""|"Smoker"|"Smoking"|"0.0/day"|"Smoker ,0.0 per day "|"Current"|""|""|""|""|""|""|""|""|""|""|""|"1900-01-01 0

In [2]:
# A well-formed physical line carries 25 fields, i.e. 24 pipe separators.
delimiter = "|"
expected_delimiter_count = 24  # 25 fields => 24 delimiters

# Walk the file once and report every physical line whose pipe count differs
# from the expected value.
with open(file_path, "r", encoding="utf-8") as f:
    line_no = 0
    for text in f:
        line_no += 1  # 1-based line numbers, as shown in a text editor
        n_delims = text.count(delimiter)
        if n_delims == expected_delimiter_count:
            continue
        print(f"Line {line_no} has {n_delims} delimiters (expected {expected_delimiter_count}):")
        # Show only the first 300 characters for context.
        print(text[:300])


Line 171819 has 11 delimiters (expected 24):
"4001000000000066"|"4"|"4000000000000001"|"4001000000001220"|""|"2015-Q2"|""|""|"Alcohol  wine Occasional##"|"Alcohol"|"Occasional##"|"Alcohol  wine  occasionally \

Line 171820 has 0 delimiters (expected 24):
 occasionally \

Line 171821 has 13 delimiters (expected 24):
"|"Current"|""|""|""|""|""|""|""|""|""|""|""|"2008-05-11 00:00:00"

Line 171992 has 11 delimiters (expected 24):
"4001000000000313"|"4"|"4000000000000001"|"4001000000006089"|""|"2015-Q2"|""|""|"Alcohol   1-2 alcoholic drinks per day on average##(wine or scotch)"|"Alcohol"|"1-2 alcoholic drinks per day on average##(wine or scotch)"|"Alcohol   1-2 alcoholic drinks  per day  on average\

Line 171993 has 0 delimiters (expected 24):
(wine or scotch) ,1-2 alcoholic drinks  per day  on average\

Line 171994 has 13 delimiters (expected 24):
(wine or scotch)"|"Current"|""|""|""|""|""|""|""|""|""|""|""|"2011-07-28 00:00:00"

Line 172155 has 11 delimiters (expected 24):
"40010000000005

In [3]:
## Create a new CSV containing all the malformed rows

from pathlib import Path
import csv

# Source extract and the log file that will receive the suspicious lines.
input_path = Path("C:/Users/ProjectC4M/Documents/CPCSSN Datasets Care4Mind/New Extraction Feb 2025/extracted_data/C4MRiskFactor.csv")
malformed_output = Path("C:/Users/ProjectC4M/Documents/CPCSSN Datasets Care4Mind/New Extraction Feb 2025/extracted_data/malformed_rows.csv")

delimiter = "|"
expected_delimiter_count = 24  # For 25 fields

# Read the extract and write the log in a single pass.
with input_path.open("r", encoding="utf-8") as infile:
    with malformed_output.open("w", encoding="utf-8", newline="") as outfile:
        writer = csv.writer(outfile, delimiter=delimiter, quoting=csv.QUOTE_MINIMAL)

        # Header of the log: 1-based line number plus the raw line content.
        writer.writerow(["LineNumber", "Content"])

        # Keep every physical line whose pipe count is not the expected one;
        # surrounding whitespace (including the newline) is stripped.
        writer.writerows(
            [line_number, line.strip()]
            for line_number, line in enumerate(infile, start=1)
            if line.count(delimiter) != expected_delimiter_count
        )

print(f"Malformed rows have been extracted to: {malformed_output}")


Malformed rows have been extracted to: C:\Users\ProjectC4M\Documents\CPCSSN Datasets Care4Mind\New Extraction Feb 2025\extracted_data\malformed_rows.csv


# Fix
The idea is to read the original file line by line and accumulate any rows that appear to be broken (i.e., that do not have the expected number of delimiters). We expect 24 pipe characters per well-formed row (25 fields). When a line has too few pipes, we strip any trailing backslash (which appears to be the continuation marker used in this file) and append the following lines, separated by a single space, until the accumulated row has the required number of delimiters. Finally, each repaired (joined) row is written to a new file named with a `_fixed.csv` suffix.

In [4]:
import os

# Define file paths - adjust these as needed
input_path = r"C:\Users\ProjectC4M\Documents\CPCSSN Datasets Care4Mind\New Extraction Feb 2025\extracted_data\C4MRiskFactor.csv"
output_path = r"C:\Users\ProjectC4M\Documents\CPCSSN Datasets Care4Mind\New Extraction Feb 2025\extracted_data\C4MRiskFactor_fixed.csv"

delimiter = "|"
expected_delimiter_count = 24  # 25 fields => 24 pipes


def _rejoin_broken_rows(lines, delimiter, expected_delimiter_count):
    """Glue physical lines back together into logical records.

    Lines are accumulated until the buffer holds at least
    ``expected_delimiter_count`` delimiters. Before each continuation line is
    appended, a trailing backslash on the accumulated text is removed (together
    with the whitespace in front of it) and a single space is inserted.

    Args:
        lines: Iterable of text lines, each optionally ending in a newline.
        delimiter: Field separator to count.
        expected_delimiter_count: Minimum number of delimiters in a full record.

    Yields:
        ``(first_line_number, row, is_complete)`` tuples. ``first_line_number``
        is the 1-based number of the physical line the record started on.
        ``is_complete`` is False only for a trailing fragment that never reached
        the expected delimiter count before the input ended.
    """
    buffer = ""
    first_line_number = 0

    for line_number, raw_line in enumerate(lines, start=1):
        # Remove the trailing newline character
        line = raw_line.rstrip("\n")

        if buffer:
            # Continuing a broken record: drop the continuation backslash (if
            # any) from what we have so far, then append with a single space.
            if buffer.endswith("\\"):
                buffer = buffer[:-1].rstrip()
            buffer += " " + line
        else:
            buffer = line
            first_line_number = line_number

        # Not enough delimiters yet: the record continues on the next line.
        if buffer.count(delimiter) < expected_delimiter_count:
            continue

        yield first_line_number, buffer, True
        buffer = ""

    # Anything left here is necessarily incomplete, because a buffer that
    # reaches the expected count is always flushed inside the loop.
    if buffer:
        yield first_line_number, buffer, False


# Starting line numbers of repaired rows that ended up with MORE delimiters
# than expected (e.g. a genuinely short row that swallowed the following one).
overlong_row_starts = []

# Open the input file for reading and output file for writing.
with open(input_path, "r", encoding="utf-8") as fin, \
     open(output_path, "w", encoding="utf-8", newline="") as fout:

    for first_line_number, row, is_complete in _rejoin_broken_rows(
        fin, delimiter, expected_delimiter_count
    ):
        if not is_complete:
            # The trailing fragment is not written, so say where it came from.
            print("Warning: Incomplete record at end of file.")
            print(f"  Record starts at input line {first_line_number} and was not written.")
            continue

        # Over-long rows are still written (same output as before), but they
        # are recorded so the problem no longer passes silently.
        if row.count(delimiter) > expected_delimiter_count:
            overlong_row_starts.append(first_line_number)

        fout.write(row + "\n")

if overlong_row_starts:
    preview = ", ".join(str(n) for n in overlong_row_starts[:10])
    print(
        f"Warning: {len(overlong_row_starts)} row(s) have more than "
        f"{expected_delimiter_count} delimiters; first input line numbers: {preview}"
    )

print(f"Fixed file saved as: {output_path}")


Fixed file saved as: C:\Users\ProjectC4M\Documents\CPCSSN Datasets Care4Mind\New Extraction Feb 2025\extracted_data\C4MRiskFactor_fixed.csv
