# CSV比較ツール
## キー突合 + 全項目比較

### 1. ライブラリのインポート

In [None]:
import pandas as pd
from datetime import datetime
import os
from pathlib import Path
import gc
from tqdm import tqdm

### 2. 設定

In [None]:
# Folder paths: CSV files with the same name in both folders are compared.
BEFORE_FOLDER = '../data/before'
AFTER_FOLDER = '../data/after'

# Large-file settings
CHUNK_SIZE = 5000  # rows per chunk, passed as `chunksize` to pd.read_csv
MEMORY_LIMIT_MB = 500  # threshold in MB on the combined on-disk size of both files; above it, chunked processing is used

# Load the configuration file
def load_config(config_file='../config.txt'):
    """Parse the plain-text settings file into a configuration dict.

    Blank lines and lines starting with ``#`` are skipped. The remaining
    lines are interpreted as follows:

    * ``KEY = value`` / ``KEY=value`` stores ``config[KEY] = value``. The
      line is split on the first ``=`` only, so the value may itself
      contain ``=``; key and value are stripped of surrounding whitespace.
    * A line whose key is ``FILE_SETTINGS`` opens the per-file section.
    * Inside that section, ``filename:key1,key2:ignore1;ignore2`` registers
      the key columns (comma separated) and the optional ignore columns
      (semicolon separated) for ``filename``.

    Args:
        config_file: Path to the settings file.

    Returns:
        dict: Always contains ``'file_settings'`` (filename -> dict with
        ``'key_columns'`` and ``'ignore_columns'`` lists) plus one string
        entry per ``KEY=value`` line. When the file does not exist, a
        default configuration keyed on ``id`` is returned instead.
    """
    config = {'file_settings': {}}
    try:
        # utf-8-sig reads plain UTF-8 as well, and drops a leading BOM that
        # would otherwise become part of the first key.
        with open(config_file, 'r', encoding='utf-8-sig') as f:
            in_file_settings = False
            for raw_line in f:
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue

                if '=' in line:
                    # Split on the first '=' only so values containing '='
                    # are not truncated.
                    key, _, value = line.partition('=')
                    key = key.strip()
                    value = value.strip()
                    if key == 'FILE_SETTINGS':
                        in_file_settings = True
                    else:
                        config[key] = value
                elif in_file_settings and ':' in line:
                    parts = [part.strip() for part in line.split(':')]
                    filename = parts[0]
                    key_columns = [k.strip() for k in parts[1].split(',') if k.strip()]
                    ignore_columns = []
                    if len(parts) > 2 and parts[2]:
                        ignore_columns = [i.strip() for i in parts[2].split(';') if i.strip()]
                    config['file_settings'][filename] = {
                        'key_columns': key_columns,
                        'ignore_columns': ignore_columns
                    }
        print(f'設定ファイルを読み込みました: {config_file}')
    except FileNotFoundError:
        print(f'設定ファイルが見つかりません: {config_file}')
        config = {'DEFAULT_KEY_COLUMNS': 'id', 'DEFAULT_IGNORE_COLUMNS': '', 'file_settings': {}}
    return config

def get_file_config(filename, config):
    """Return the key/ignore column settings that apply to ``filename``.

    A per-file entry in ``config['file_settings']`` is returned as-is.
    Otherwise the settings are built from the comma-separated
    ``DEFAULT_KEY_COLUMNS`` (default ``'id'``) and ``DEFAULT_IGNORE_COLUMNS``
    (default empty) entries; the ignore value may also be one of the
    "nothing to ignore" markers ``'なし'``, ``'NONE'`` or ``'none'``.

    Returns:
        dict with ``'key_columns'`` and ``'ignore_columns'`` lists.
    """
    per_file = config['file_settings']
    if filename in per_file:
        return per_file[filename]

    raw_keys = config.get('DEFAULT_KEY_COLUMNS', 'id')
    raw_ignore = config.get('DEFAULT_IGNORE_COLUMNS', '')

    keys = []
    for token in raw_keys.split(','):
        if token.strip():
            keys.append(token.strip())

    ignored = []
    if raw_ignore and raw_ignore not in ('', 'なし', 'NONE', 'none'):
        for token in raw_ignore.split(','):
            if token.strip():
                ignored.append(token.strip())

    return {'key_columns': keys, 'ignore_columns': ignored}

# List the CSV files in a folder
def get_csv_files(folder_path):
    """Return the sorted names of the ``*.csv`` files directly in ``folder_path``.

    Directories whose name happens to end in ``.csv`` are excluded, and the
    result is sorted so the order does not depend on the filesystem. A
    folder that does not exist yields an empty list.
    """
    folder = Path(folder_path)
    return sorted(entry.name for entry in folder.glob('*.csv') if entry.is_file())

before_files = get_csv_files(BEFORE_FOLDER)
after_files = get_csv_files(AFTER_FOLDER)
# Only files present in both folders can be compared. Sorting makes the
# processing order reproducible; a bare set intersection has arbitrary order.
common_files = sorted(set(before_files) & set(after_files))

print(f'Beforeフォルダのファイル: {before_files}')
print(f'Afterフォルダのファイル: {after_files}')
print(f'比較対象ファイル: {common_files}')

config = load_config()

# Show the key / ignore columns that will be applied to each file.
if common_files:
    print('\n設定内容:')
    for filename in common_files:
        file_config = get_file_config(filename, config)
        print(f'{filename}: キー={file_config["key_columns"]}, 無視={file_config["ignore_columns"]}')
else:
    print('比較対象ファイルがありません')

### 3. 大容量対応比較処理関数

In [None]:
def get_file_info(file_path):
    """Return the on-disk size and data-row count of a CSV file.

    Args:
        file_path: Path to the CSV file.

    Returns:
        dict: ``'size_mb'`` is the file size in MiB; ``'rows'`` is the
        number of physical lines minus one header line, never negative.
        Quoted fields with embedded newlines are counted as extra lines.
    """
    size_mb = os.path.getsize(file_path) / (1024 * 1024)  # MB

    # Only line breaks matter here, so undecodable bytes (e.g. a cp932 file)
    # are replaced rather than raising UnicodeDecodeError.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        line_count = sum(1 for _ in f)

    # Subtract the header line; an empty file has 0 rows, not -1.
    return {'size_mb': size_mb, 'rows': max(line_count - 1, 0)}

def compare_csv_large(baseline_path, candidate_path, key_columns, ignore_columns=None):
    """Compare two CSV files, choosing the strategy from their combined size.

    The sizes reported by ``get_file_info`` are added together. If the total
    exceeds ``MEMORY_LIMIT_MB`` the chunked comparison is used, otherwise the
    whole-file comparison.

    Args:
        baseline_path: Path of the "before" CSV.
        candidate_path: Path of the "after" CSV.
        key_columns: Column names identifying a record.
        ignore_columns: Column names excluded from comparison (default: none).

    Returns:
        Whatever the selected comparison function returns (a DataFrame of
        difference records).
    """
    ignore_columns = [] if ignore_columns is None else ignore_columns

    print("ファイル情報を取得中...")
    baseline_info = get_file_info(baseline_path)
    candidate_info = get_file_info(candidate_path)

    print(f"Baseline: {baseline_info['rows']:,}行, {baseline_info['size_mb']:.1f}MB")
    print(f"Candidate: {candidate_info['rows']:,}行, {candidate_info['size_mb']:.1f}MB")

    total_size = sum((baseline_info['size_mb'], candidate_info['size_mb']))

    # Small enough: load both files whole.
    if not total_size > MEMORY_LIMIT_MB:
        print("通常処理を実行します...")
        return compare_csv_normal(baseline_path, candidate_path, key_columns, ignore_columns)

    # Too large: stream the files chunk by chunk.
    print(f"大容量ファイル検出（{total_size:.1f}MB > {MEMORY_LIMIT_MB}MB）")
    print("チャンク処理を実行します...")
    return compare_csv_chunked(baseline_path, candidate_path, key_columns, ignore_columns)

def compare_csv_normal(baseline_path, candidate_path, key_columns, ignore_columns):
    """Load both CSV files completely and compare them with ``process_comparison``."""
    baseline_frame, candidate_frame = (
        pd.read_csv(csv_path) for csv_path in (baseline_path, candidate_path)
    )
    return process_comparison(baseline_frame, candidate_frame, key_columns, ignore_columns)

def compare_csv_chunked(baseline_path, candidate_path, key_columns, ignore_columns):
    """Compare two CSV files without loading either one completely.

    First the key sets of both files are built with ``create_key_index``.
    Keys found on only one side become DELETED / ADDED records; keys found
    on both sides are handed to ``compare_common_keys_chunked`` for the
    column-level comparison.

    Returns:
        pandas.DataFrame with one row per difference record (columns
        ``key``, ``diff_type``, ``column``, ``baseline_value``,
        ``candidate_value``).
    """
    def key_only_record(key, diff_type):
        # ADDED / DELETED rows carry no column or value information.
        return {
            'key': key,
            'diff_type': diff_type,
            'column': None,
            'baseline_value': None,
            'candidate_value': None
        }

    # Index both files on their key columns only.
    print("キーインデックスを作成中...")
    baseline_keys = create_key_index(baseline_path, key_columns)
    candidate_keys = create_key_index(candidate_path, key_columns)

    print(f"Baselineキー数: {len(baseline_keys):,}")
    print(f"Candidateキー数: {len(candidate_keys):,}")

    # Classify the keys.
    deleted_keys = baseline_keys.difference(candidate_keys)
    added_keys = candidate_keys.difference(baseline_keys)
    common_keys = baseline_keys.intersection(candidate_keys)

    print(f"削除: {len(deleted_keys):,}, 追加: {len(added_keys):,}, 共通: {len(common_keys):,}")

    diff_records = [key_only_record(key, 'DELETED') for key in deleted_keys]
    diff_records += [key_only_record(key, 'ADDED') for key in added_keys]

    # Column-level comparison of the shared keys, chunk by chunk.
    if common_keys:
        print("詳細比較を実行中...")
        diff_records += compare_common_keys_chunked(
            baseline_path, candidate_path, common_keys, key_columns, ignore_columns
        )

    return pd.DataFrame(diff_records)

def create_key_index(file_path, key_columns):
    """Collect the set of record keys found in a CSV file.

    Only the key columns are read, ``CHUNK_SIZE`` rows at a time. Each key is
    the string form of the key column; with several key columns, their
    string forms are joined with ``'_'``.

    Returns:
        set of str keys.
    """
    index = set()

    reader = pd.read_csv(file_path, chunksize=CHUNK_SIZE, usecols=key_columns)
    for frame in reader:
        if len(key_columns) <= 1:
            as_text = frame[key_columns[0]].astype(str)
        else:
            as_text = frame[key_columns].astype(str).agg('_'.join, axis=1)

        index.update(as_text.tolist())
        # Force a collection after each chunk before reading the next one.
        gc.collect()

    return index

def compare_common_keys_chunked(baseline_path, candidate_path, common_keys, key_columns, ignore_columns):
    """Column-level comparison of the keys present in both files, chunk by chunk.

    The baseline file is streamed ``CHUNK_SIZE`` rows at a time. For each
    chunk, the rows whose key is in ``common_keys`` are matched against the
    candidate rows fetched by ``get_candidate_data_by_keys`` and compared
    with ``compare_chunk_details``.

    Note that the candidate file is re-scanned once per baseline chunk that
    contains at least one common key.

    Args:
        baseline_path: Path of the "before" CSV.
        candidate_path: Path of the "after" CSV.
        common_keys: Keys (as built by ``create_key_index``) present in both files.
        key_columns: Column names identifying a record.
        ignore_columns: Column names excluded from comparison.

    Returns:
        list of MODIFIED difference records (dicts).
    """
    modified_records = []

    # Stream the baseline file in chunks.
    baseline_chunks = pd.read_csv(baseline_path, chunksize=CHUNK_SIZE)

    for baseline_chunk in tqdm(baseline_chunks, desc="比較処理"):
        # Build the same string key that create_key_index produces.
        if len(key_columns) > 1:
            baseline_chunk['_key'] = baseline_chunk[key_columns].astype(str).agg('_'.join, axis=1)
        else:
            baseline_chunk['_key'] = baseline_chunk[key_columns[0]].astype(str)

        # Keep only rows whose key also exists in the candidate file.
        baseline_common = baseline_chunk[baseline_chunk['_key'].isin(common_keys)]

        if baseline_common.empty:
            continue

        # Fetch the matching candidate rows.
        target_keys = baseline_common['_key'].tolist()
        candidate_data = get_candidate_data_by_keys(candidate_path, target_keys, key_columns)

        # Column-level comparison for this chunk.
        chunk_diffs = compare_chunk_details(baseline_common, candidate_data, key_columns, ignore_columns)
        modified_records.extend(chunk_diffs)

        # No running set of processed keys is kept: it was never read and
        # would grow to hold every common key in memory.
        gc.collect()

    return modified_records

def get_candidate_data_by_keys(candidate_path, target_keys, key_columns):
    """Scan the candidate CSV and return the rows whose key is in ``target_keys``.

    The file is read ``CHUNK_SIZE`` rows at a time. Each chunk gets a
    ``_key`` column (string form of the key column, or the key columns joined
    with ``'_'``) which is kept in the result.

    Returns:
        pandas.DataFrame of the matching rows with a fresh index, or an
        empty DataFrame when nothing matches.
    """
    matches = []

    for frame in pd.read_csv(candidate_path, chunksize=CHUNK_SIZE):
        if len(key_columns) > 1:
            frame['_key'] = frame[key_columns].astype(str).agg('_'.join, axis=1)
        else:
            frame['_key'] = frame[key_columns[0]].astype(str)

        hits = frame[frame['_key'].isin(target_keys)]
        if len(hits) > 0:
            matches.append(hits)

    if not matches:
        return pd.DataFrame()
    return pd.concat(matches, ignore_index=True)

def compare_chunk_details(baseline_chunk, candidate_chunk, key_columns, ignore_columns):
    """Compare matching rows of two chunks column by column.

    Both frames must carry a ``_key`` column. Rows are matched on ``_key``;
    every baseline column that is neither a key column, an ignored column
    nor ``_key`` itself is compared. Two missing values (NaN/None) count as
    equal.

    If a key occurs more than once in a chunk, only its first row is
    compared. Without this, ``.loc[key]`` returns a DataFrame and the value
    comparison raises ``ValueError``.

    Args:
        baseline_chunk: Baseline rows including ``_key``.
        candidate_chunk: Candidate rows including ``_key`` (may be empty).
        key_columns: Column names identifying a record.
        ignore_columns: Column names excluded from comparison.

    Returns:
        list of MODIFIED difference records (dicts with ``key``,
        ``diff_type``, ``column``, ``baseline_value``, ``candidate_value``).
    """
    if len(candidate_chunk) == 0:
        return []

    baseline_indexed = baseline_chunk.set_index('_key')
    candidate_indexed = candidate_chunk.set_index('_key')

    # Keep the first row per key so .loc[key] always yields a single row.
    baseline_indexed = baseline_indexed[~baseline_indexed.index.duplicated(keep='first')]
    candidate_indexed = candidate_indexed[~candidate_indexed.index.duplicated(keep='first')]

    compare_columns = [col for col in baseline_chunk.columns
                      if col not in key_columns and col not in ignore_columns and col != '_key']

    diff_records = []

    for key in baseline_indexed.index:
        if key not in candidate_indexed.index:
            continue

        baseline_row = baseline_indexed.loc[key]
        candidate_row = candidate_indexed.loc[key]

        for col in compare_columns:
            baseline_val = baseline_row[col]
            candidate_val = candidate_row[col]

            # NaN != NaN, so two missing values must be skipped explicitly.
            if pd.isna(baseline_val) and pd.isna(candidate_val):
                continue

            if baseline_val != candidate_val:
                diff_records.append({
                    'key': key,
                    'diff_type': 'MODIFIED',
                    'column': col,
                    'baseline_value': baseline_val,
                    'candidate_value': candidate_val
                })

    return diff_records

def process_comparison(baseline_df, candidate_df, key_columns, ignore_columns):
    """Compare two fully loaded DataFrames and return their differences.

    Records are matched on the key. With one key column its values are used
    directly; with several, a combined string key is built by joining the
    string forms of the key columns with ``'_'``.

    * Keys only in the baseline produce DELETED records.
    * Keys only in the candidate produce ADDED records.
    * For keys on both sides, every baseline column that is neither a key
      column nor ignored is compared; each differing value produces a
      MODIFIED record. Two missing values (NaN/None) count as equal.

    If a key occurs more than once in a frame, only its first row is used.
    Without this, ``.loc[key]`` returns a DataFrame and the value comparison
    raises ``ValueError``.

    Args:
        baseline_df: "Before" data.
        candidate_df: "After" data.
        key_columns: Column names identifying a record.
        ignore_columns: Column names excluded from comparison.

    Returns:
        pandas.DataFrame of difference records (columns ``key``,
        ``diff_type``, ``column``, ``baseline_value``, ``candidate_value``);
        empty when there are no differences.
    """
    if len(key_columns) > 1:
        # Work on copies so the caller's frames do not gain the helper column.
        baseline_df = baseline_df.copy()
        candidate_df = candidate_df.copy()
        combined_key = '_'.join(key_columns)
        baseline_df[combined_key] = baseline_df[key_columns].astype(str).agg('_'.join, axis=1)
        candidate_df[combined_key] = candidate_df[key_columns].astype(str).agg('_'.join, axis=1)
        key_column = combined_key
    else:
        key_column = key_columns[0]

    baseline_indexed = baseline_df.set_index(key_column)
    candidate_indexed = candidate_df.set_index(key_column)

    # Keep the first row per key so .loc[key] always yields a single row.
    baseline_indexed = baseline_indexed[~baseline_indexed.index.duplicated(keep='first')]
    candidate_indexed = candidate_indexed[~candidate_indexed.index.duplicated(keep='first')]

    compare_columns = [col for col in baseline_df.columns
                      if col not in key_columns and col not in ignore_columns and col != key_column]

    diff_records = []

    # Classify keys into deleted / added / common.
    missing_keys = baseline_indexed.index.difference(candidate_indexed.index)
    extra_keys = candidate_indexed.index.difference(baseline_indexed.index)
    common_keys = baseline_indexed.index.intersection(candidate_indexed.index)

    for key in missing_keys:
        diff_records.append({
            'key': key, 'diff_type': 'DELETED', 'column': None,
            'baseline_value': None, 'candidate_value': None
        })

    for key in extra_keys:
        diff_records.append({
            'key': key, 'diff_type': 'ADDED', 'column': None,
            'baseline_value': None, 'candidate_value': None
        })

    for key in common_keys:
        baseline_row = baseline_indexed.loc[key]
        candidate_row = candidate_indexed.loc[key]

        for col in compare_columns:
            baseline_val = baseline_row[col]
            candidate_val = candidate_row[col]

            # NaN != NaN, so two missing values must be skipped explicitly.
            if pd.isna(baseline_val) and pd.isna(candidate_val):
                continue

            if baseline_val != candidate_val:
                diff_records.append({
                    'key': key, 'diff_type': 'MODIFIED', 'column': col,
                    'baseline_value': baseline_val, 'candidate_value': candidate_val
                })

    return pd.DataFrame(diff_records)

### 4. ファイル比較実行

In [None]:
def compare_files_large(filename):
    """Compare one file from the before/after folders and write the reports.

    The comparison uses the key / ignore columns configured for ``filename``.
    When differences are found, a summary is printed, the first 10 records
    are displayed, and two files are written to ``../output``: a CSV with all
    difference records and a text file with statistics. The output folder is
    created if it does not exist yet.

    Args:
        filename: Name of a CSV present in both ``BEFORE_FOLDER`` and
            ``AFTER_FOLDER``.

    Returns:
        float: Seconds spent in the comparison itself (report writing is
        not included).
    """
    before_path = os.path.join(BEFORE_FOLDER, filename)
    after_path = os.path.join(AFTER_FOLDER, filename)

    print(f'\n=== {filename} の比較（大容量対応） ===')

    file_config = get_file_config(filename, config)

    start_time = datetime.now()
    diff_df = compare_csv_large(before_path, after_path, file_config['key_columns'], file_config['ignore_columns'])
    end_time = datetime.now()

    processing_time = (end_time - start_time).total_seconds()
    print(f'処理時間: {processing_time:.1f}秒')
    print(f'検出された差分: {len(diff_df):,} 件')

    if len(diff_df) > 0:
        # Print a per-type summary.
        print('\n差分サマリー:')
        summary = diff_df['diff_type'].value_counts()
        for diff_type, count in summary.items():
            print(f'  {diff_type}: {count:,}件')

        # Show the first 10 records.
        # NOTE(review): `display` is not defined in this file; presumably
        # the IPython/Jupyter builtin. Confirm before running as a script.
        print('\n差分サンプル（最初の10件）:')
        display(diff_df.head(10))

        # Write the difference report.
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_dir = '../output'
        # to_csv() and open() do not create missing parent folders.
        os.makedirs(output_dir, exist_ok=True)
        output_filename = f'{filename.replace(".csv", "")}_diff_{timestamp}.csv'
        output_path = f'{output_dir}/{output_filename}'

        print(f'\n差分レポートを出力中... ({len(diff_df):,}件)')
        diff_df.to_csv(output_path, index=False, encoding='utf-8-sig')

        # Write the statistics report.
        stats_filename = f'{filename.replace(".csv", "")}_stats_{timestamp}.txt'
        stats_path = f'{output_dir}/{stats_filename}'

        with open(stats_path, 'w', encoding='utf-8') as f:
            f.write(f'=== {filename} 比較統計レポート ===\n\n')
            f.write(f'処理時間: {processing_time:.1f}秒\n')
            f.write(f'総差分件数: {len(diff_df):,}件\n\n')
            f.write('差分タイプ別件数:\n')
            for diff_type, count in summary.items():
                f.write(f'  {diff_type}: {count:,}件\n')

            # Top 10 columns by number of modified values.
            if 'MODIFIED' in summary:
                modified_df = diff_df[diff_df['diff_type'] == 'MODIFIED']
                column_counts = modified_df['column'].value_counts()
                f.write('\n変更項目別件数:\n')
                for col, count in column_counts.head(10).items():
                    f.write(f'  {col}: {count:,}件\n')

        print(f'差分レポート: {output_path}')
        print(f'統計レポート: {stats_path}')
    else:
        print('差分はありません')

    # Release the result frame before the next file is processed.
    del diff_df
    gc.collect()

    return processing_time

# Run the comparison for every file present in both folders
total_start = datetime.now()
processing_times = {}  # filename -> seconds returned by compare_files_large

for filename in common_files:
    processing_time = compare_files_large(filename)
    processing_times[filename] = processing_time

total_end = datetime.now()
total_time = (total_end - total_start).total_seconds()

# Print the overall wall-clock time followed by the per-file times.
print(f'\n=== 全体処理完了 ===')
print(f'総処理時間: {total_time:.1f}秒')
for filename, time in processing_times.items():
    print(f'{filename}: {time:.1f}秒')