fixed_width_parser

In [None]:
import csv

class FixedWidthParser:
    """Convert a fixed-width text file to CSV using a column spec file.

    The spec file holds one column per line as ``<name> <width>``, separated
    by whitespace. Columns are laid out back to back in the data file in the
    order they appear in the spec.
    """

    def __init__(self, spec_file, input_file, output_file, encoding="utf-8"):
        """Store the file locations; no file is opened until parsing starts.

        Args:
            spec_file: Path of the column spec file.
            input_file: Path of the fixed-width data file to read.
            output_file: Path of the CSV file to write.
            encoding: Text encoding used for all three files.
        """
        self.spec_file = spec_file
        self.input_file = input_file
        self.output_file = output_file
        self.encoding = encoding
        self.column_specs = []

    def parse_spec(self):
        """Read the spec file into ``self.column_specs``.

        ``self.column_specs`` is replaced with a list of ``(name, width)``
        tuples. Lines that do not split into exactly two whitespace-separated
        fields (for example blank lines) are skipped.

        Raises:
            ValueError: If the width field of a two-field line is not an
                integer.
        """
        # Build a fresh list instead of appending to the existing one, so
        # that parsing more than once on the same instance does not
        # duplicate every column.
        specs = []
        with open(self.spec_file, "r", encoding=self.encoding) as file:
            for line in file:
                parts = line.split()
                if len(parts) == 2:
                    column_name, width = parts
                    specs.append((column_name, int(width)))
        self.column_specs = specs

    def parse_fixed_width_file(self):
        """Read the fixed-width data file and write it out as CSV.

        The spec is re-read on every call. The first CSV row is the list of
        column names; each input line then becomes one row whose values are
        the column slices with surrounding whitespace removed. A line shorter
        than the combined column widths yields empty strings for the columns
        it does not reach.
        """
        self.parse_spec()

        # Precompute each column's (start, end) slice once rather than
        # re-deriving the running offset for every input line.
        bounds = []
        start = 0
        for _, width in self.column_specs:
            bounds.append((start, start + width))
            start += width

        with open(self.input_file, "r", encoding=self.encoding) as infile, \
             open(self.output_file, "w", newline="", encoding=self.encoding) as outfile:
            writer = csv.writer(outfile)
            writer.writerow([name for name, _ in self.column_specs])  # Write header

            for line in infile:
                writer.writerow([line[lo:hi].strip() for lo, hi in bounds])

if __name__ == "__main__":
    # Script entry point: convert the sample fixed-width file to CSV using
    # the column layout described in spec.txt.
    parser = FixedWidthParser(
        spec_file="spec.txt",
        input_file="input_data/fixed_width.txt",
        output_file="output_data/output.csv",
    )
    parser.parse_fixed_width_file()
    print("✅ Fixed-width file successfully parsed to CSV!")


data_anonymizer

In [None]:
import csv
import hashlib
import random
import string

class DataAnonymizer:
    """Replace the first three columns of a CSV file with random placeholders.

    Each distinct original value is mapped to one random 10-letter string,
    and the mapping is remembered in ``self.anonymized_map`` so the same
    input always yields the same placeholder for the lifetime of the
    instance. The mapping is random, not derived from the input, so it is
    not reproducible across instances or runs.
    """

    # Indexes of the columns to anonymize; the input is expected to hold
    # first_name, last_name and address in these positions.
    _SENSITIVE_COLUMNS = (0, 1, 2)

    def __init__(self, input_file, output_file, chunk_size=10000):
        """Store the file locations; no file is opened until anonymizing.

        Args:
            input_file: Path of the CSV file to read.
            output_file: Path of the CSV file to write.
            chunk_size: Stored on the instance but not consulted; rows are
                streamed one at a time. Kept so existing callers still work.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.chunk_size = chunk_size
        self.anonymized_map = {}

    def anonymize_text(self, text):
        """Return the placeholder for ``text``, creating one on first use.

        Args:
            text: The original value; used as the key into the mapping.

        Returns:
            A string of 10 ASCII letters. Repeated calls with the same
            ``text`` on the same instance return the same string.
        """
        fake_value = self.anonymized_map.get(text)
        if fake_value is None:
            fake_value = "".join(random.choices(string.ascii_letters, k=10))
            self.anonymized_map[text] = fake_value
        return fake_value

    def anonymize_csv(self):
        """Copy the input CSV to the output, anonymizing the first three columns.

        The header row is copied unchanged. Rows are processed one at a time.
        An empty input file produces an empty output file. Rows with fewer
        than three fields (including blank lines) are written with only the
        fields they have anonymized, instead of aborting the whole run.
        """
        with open(self.input_file, "r", encoding="utf-8") as infile, \
             open(self.output_file, "w", newline="", encoding="utf-8") as outfile:
            reader = csv.reader(infile)
            writer = csv.writer(outfile)

            # A default avoids StopIteration when the input has no rows.
            headers = next(reader, None)
            if headers is None:
                return
            writer.writerow(headers)

            for row in reader:
                for index in self._SENSITIVE_COLUMNS:
                    if index < len(row):
                        row[index] = self.anonymize_text(row[index])
                writer.writerow(row)

if __name__ == "__main__":
    # Script entry point: anonymize the sample CSV and write the result
    # under output_data/.
    anonymizer = DataAnonymizer(
        input_file="input_data/data.csv",
        output_file="output_data/anonymized_data.csv",
    )
    anonymizer.anonymize_csv()
    print("✅ Data successfully anonymized!")
