In [13]:
import pandas as pd
import csv
import sys
import os

# Raise the csv module's per-field size limit as high as the platform allows.
# csv.field_size_limit() raises OverflowError when the value does not fit in
# the underlying C integer type, so start from sys.maxsize and keep shrinking
# the candidate by a factor of ten until a value is accepted.
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
        break
    except OverflowError:
        # Floor division keeps the arithmetic in exact integers instead of
        # round-tripping a very large value through a float.
        maxInt //= 10

def merge_csv_files(base_path=None, output_path=None, files=None):
    """Combine the columns of several delimited text files into one CSV.

    Each file is read with every column as a string. The first file supplies
    the rows and the initial columns; every later file contributes only the
    columns whose names are not already present. The result is written to
    ``merged_all.csv`` inside ``output_path`` with every field quoted.

    Rows are matched by row position (the default integer index), not by any
    key column: row ``i`` of a later file lands next to row ``i`` of the first
    file, rows beyond the first file's length are dropped, and shorter files
    are padded with missing values.
    NOTE(review): if the inputs share an identifier column, a key-based merge
    is probably what is wanted here -- confirm with the data owner.

    Args:
        base_path: Directory containing the input files. Defaults to the
            original hard-coded dataset directory.
        output_path: Directory for ``merged_all.csv``; created if missing.
            Defaults to the original hard-coded output directory.
        files: Iterable of ``(filename, separator)`` pairs, processed in
            order. Defaults to the original five input files.

    Raises:
        Exception: Any error raised while reading, merging or writing is
            printed and then re-raised unchanged.
    """
    if base_path is None:
        base_path = r"C:\Users\Siddhant Nijhawan\Downloads\NEST\Problem Statements and Data Sets"
    if output_path is None:
        output_path = r"C:\Users\Siddhant Nijhawan\Downloads\NEST\Output"
    if files is None:
        files = [
            ('usecase_3_.csv', ','),
            ('drop_withdrawals.txt', '|'),
            ('eligibilities.txt', '|'),
            ('facilities.txt', '|'),
            ('reported_events.txt', '|'),
        ]

    # Create output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # None (rather than an empty DataFrame) marks "no file read yet", so a
    # first file that has a header but no rows is not mistaken for "nothing
    # loaded" and silently replaced by the next file.
    merged_df = None

    try:
        for filename, separator in files:
            print(f"\nProcessing {filename}...")
            file_path = os.path.join(base_path, filename)

            # Read the current file
            current_df = pd.read_csv(
                file_path,
                sep=separator,
                encoding='utf-8',
                on_bad_lines='warn',
                dtype=str  # Read all columns as string to avoid dtype issues
            )

            print(f"Columns from {filename}: {len(current_df.columns)}")

            if merged_df is None:
                # The first file defines the rows of the result.
                merged_df = current_df
            else:
                # Only columns not seen so far are added; same-named columns
                # from later files are ignored.
                new_columns = [
                    col for col in current_df.columns
                    if col not in merged_df.columns
                ]
                if new_columns:
                    # A left join on the index adds all new columns in one
                    # step, aligned to the rows of the first file.
                    merged_df = merged_df.join(current_df[new_columns])

            print(f"Total columns after merging: {len(merged_df.columns)}")
            del current_df  # Free up memory

        if merged_df is None:
            # No input files were given: fall back to an empty frame.
            merged_df = pd.DataFrame()

        # Save the final merged DataFrame
        output_file = os.path.join(output_path, 'merged_all.csv')
        print(f"\nSaving to {output_file}...")

        merged_df.to_csv(
            output_file,
            index=False,
            encoding='utf-8',
            quoting=csv.QUOTE_ALL,
            escapechar='\\'
        )

        print(f"\nProcessing completed! Output saved to: {output_file}")
        print(f"Final dimensions: {merged_df.shape}")

    except Exception as e:
        # Report the failure for the notebook user, then let it propagate.
        print(f"An error occurred: {str(e)}")
        raise

# Entry point: run the merge only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    merge_csv_files()


Processing usecase_3_.csv...
Columns from usecase_3_.csv: 32
Total columns after merging: 32

Processing drop_withdrawals.txt...
Columns from drop_withdrawals.txt: 10
Total columns after merging: 42

Processing eligibilities.txt...
Columns from eligibilities.txt: 14
Total columns after merging: 54

Processing facilities.txt...
Columns from facilities.txt: 8
Total columns after merging: 60

Processing reported_events.txt...
Columns from reported_events.txt: 17
Total columns after merging: 73

Saving to C:\Users\Siddhant Nijhawan\Downloads\NEST\Output\merged_all.csv...

Processing completed! Output saved to: C:\Users\Siddhant Nijhawan\Downloads\NEST\Output\merged_all.csv
Final dimensions: (257577, 73)
