# CSV比較ツール
## キー突合 + 全項目比較

### 1. ライブラリのインポート

In [6]:
import pandas as pd
from datetime import datetime
import os
from pathlib import Path

### 2. 設定

In [7]:
# Folder paths. Each folder is scanned for *.csv files; files whose names
# appear in both folders are compared, with BEFORE_FOLDER supplying the
# baseline side and AFTER_FOLDER the candidate side.
BEFORE_FOLDER = '../data/before'
AFTER_FOLDER = '../data/after'

# 設定ファイル読み込み
def load_config(config_file='../config.txt'):
    """Load comparison settings from a plain-text configuration file.

    Blank lines and lines starting with ``#`` are skipped. The remaining
    lines are interpreted as follows:

    * ``KEY = value`` or ``KEY=value`` -- stored as ``config[KEY] = value``
      (both sides are stripped; everything after the first separator is the
      value, so values may themselves contain ``=``).
    * a line whose key is ``FILE_SETTINGS`` -- not stored; it switches the
      parser into the per-file section.
    * ``filename:key1,key2:ignore1;ignore2`` -- only honoured after the
      ``FILE_SETTINGS`` line. Key columns are comma-separated, ignored
      columns are semicolon-separated and optional.

    Args:
        config_file: Path of the configuration file to read.

    Returns:
        A dict holding every ``KEY`` read from the file plus a
        ``'file_settings'`` dict mapping file name to
        ``{'key_columns': [...], 'ignore_columns': [...]}``. When the file
        does not exist, a default configuration (key column ``id``, nothing
        ignored, no per-file settings) is returned instead.
    """
    config = {'file_settings': {}}
    try:
        # utf-8-sig also reads BOM-less UTF-8, but drops a leading BOM so the
        # first line is still recognised as a comment or key.
        with open(config_file, 'r', encoding='utf-8-sig') as f:
            in_file_settings = False
            for raw_line in f:
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue

                if '=' in line:
                    # Prefer the spaced separator when present (legacy
                    # behaviour); split only once so '=' inside the value
                    # is preserved instead of being truncated.
                    separator = ' = ' if ' = ' in line else '='
                    key, value = line.split(separator, 1)
                    key = key.strip()
                    value = value.strip()
                    if key == 'FILE_SETTINGS':
                        in_file_settings = True
                    else:
                        config[key] = value
                elif in_file_settings and ':' in line:
                    parts = line.split(':')
                    # Strip the name so "sample.csv : id" matches the real
                    # file name during lookup.
                    filename = parts[0].strip()
                    key_columns = [k.strip() for k in parts[1].split(',') if k.strip()]
                    ignore_columns = []
                    if len(parts) > 2 and parts[2]:
                        ignore_columns = [i.strip() for i in parts[2].split(';') if i.strip()]
                    config['file_settings'][filename] = {
                        'key_columns': key_columns,
                        'ignore_columns': ignore_columns
                    }
        print(f'設定ファイルを読み込みました: {config_file}')
    except FileNotFoundError:
        print(f'設定ファイルが見つかりません: {config_file}')
        print('デフォルト設定を使用します')
        config = {'DEFAULT_KEY_COLUMNS': 'id', 'DEFAULT_IGNORE_COLUMNS': '', 'file_settings': {}}
    return config

def get_file_config(filename, config):
    """Return the key/ignore column settings that apply to ``filename``.

    A per-file entry in ``config['file_settings']`` wins and is returned
    as-is. Otherwise the settings are built from ``DEFAULT_KEY_COLUMNS``
    (comma-separated, default ``'id'``) and ``DEFAULT_IGNORE_COLUMNS``
    (comma-separated, default empty).

    Returns:
        A dict with the keys ``'key_columns'`` and ``'ignore_columns'``,
        each a list of column names.
    """
    per_file = config['file_settings']
    if filename in per_file:
        return per_file[filename]

    key_spec = config.get('DEFAULT_KEY_COLUMNS', 'id')
    ignore_spec = config.get('DEFAULT_IGNORE_COLUMNS', '')

    keys = [name.strip() for name in key_spec.split(',') if name.strip()]

    # These literal values all mean "ignore nothing".
    if not ignore_spec or ignore_spec in ('', 'なし', 'NONE', 'none'):
        ignored = []
    else:
        ignored = [name.strip() for name in ignore_spec.split(',') if name.strip()]

    return {'key_columns': keys, 'ignore_columns': ignored}

# ファイル一覧取得
def get_csv_files(folder_path):
    """Return the names of the ``*.csv`` files directly inside ``folder_path``.

    Only regular files are returned (a directory that happens to be named
    ``something.csv`` is skipped), and the names are sorted so the result
    does not depend on the order in which the filesystem lists entries.

    Args:
        folder_path: Folder to scan (not recursive).

    Returns:
        A sorted list of file names (without the folder part).
    """
    folder = Path(folder_path)
    return sorted(entry.name for entry in folder.glob('*.csv') if entry.is_file())

# List the CSV files on each side; only names present in both are compared.
before_files = get_csv_files(BEFORE_FOLDER)
after_files = get_csv_files(AFTER_FOLDER)
# Sort the intersection: iteration order of a set of strings can differ from
# one interpreter run to the next, which would make the processing and
# report order unstable.
common_files = sorted(set(before_files) & set(after_files))

print(f'Beforeフォルダのファイル: {before_files}')
print(f'Afterフォルダのファイル: {after_files}')
print(f'比較対象ファイル: {common_files}')

# Load the settings
config = load_config()

# Show the effective key / ignore columns for every file to be compared.
if common_files:
    print('\n設定内容:')
    for filename in common_files:
        file_config = get_file_config(filename, config)
        print(f'{filename}: キー={file_config["key_columns"]}, 無視={file_config["ignore_columns"]}')
else:
    print('比較対象ファイルがありません')

Beforeフォルダのファイル: ['sample.csv', 'tccontract_20251217.csv']
Afterフォルダのファイル: ['sample.csv', 'tccontract_20251217.csv']
比較対象ファイル: ['sample.csv', 'tccontract_20251217.csv']
設定ファイルを読み込みました: ../config.txt

設定内容:
sample.csv: キー=['id'], 無視=['updated_at']
tccontract_20251217.csv: キー=['vin'], 無視=['updated_at']


### 3. 比較処理関数

In [8]:
def compare_csv(baseline_df, candidate_df, key_columns, ignore_columns=None):
    """Match rows of two DataFrames by key and report every difference.

    Rows are matched on ``key_columns``. When more than one key column is
    given, a synthetic key is built by joining the stringified key values
    with ``'_'`` (NOTE: values that themselves contain ``'_'`` can therefore
    produce the same synthetic key for different rows).

    Only columns present in ``baseline_df`` are compared; columns that exist
    solely in ``candidate_df`` are not inspected. Two missing values (NaN)
    in the same cell are treated as equal.

    Args:
        baseline_df: The "before" data.
        candidate_df: The "after" data.
        key_columns: List of column names identifying a row.
        ignore_columns: Column names excluded from the comparison
            (defaults to none).

    Returns:
        A DataFrame with the columns ``key``, ``diff_type``, ``column``,
        ``baseline_value`` and ``candidate_value``. ``diff_type`` is
        ``'DELETED'`` (key only in baseline), ``'ADDED'`` (key only in
        candidate) or ``'MODIFIED'`` (one record per differing cell). The
        columns are present even when there are no differences.

    Raises:
        ValueError: If a key that exists on both sides occurs more than once
            on either side, because the rows cannot be paired unambiguously.
        KeyError: If a key column is missing, or a compared baseline column
            is missing from ``candidate_df``.
    """
    if ignore_columns is None:
        ignore_columns = []

    # For multiple keys, build one combined key column on copies so the
    # caller's frames are left untouched.
    if len(key_columns) > 1:
        baseline_df = baseline_df.copy()
        candidate_df = candidate_df.copy()
        combined_key = '_'.join(key_columns)
        baseline_df[combined_key] = baseline_df[key_columns].astype(str).agg('_'.join, axis=1)
        candidate_df[combined_key] = candidate_df[key_columns].astype(str).agg('_'.join, axis=1)
        key_column = combined_key
    else:
        key_column = key_columns[0]

    baseline_indexed = baseline_df.set_index(key_column)
    candidate_indexed = candidate_df.set_index(key_column)

    excluded = set(key_columns) | set(ignore_columns) | {key_column}
    compare_columns = [col for col in baseline_df.columns if col not in excluded]

    result_columns = ['key', 'diff_type', 'column', 'baseline_value', 'candidate_value']
    diff_records = []

    def row_level_record(key, diff_type):
        # Whole-row differences carry no column / value information.
        return {
            'key': key,
            'diff_type': diff_type,
            'column': None,
            'baseline_value': None,
            'candidate_value': None
        }

    # Rows that were deleted (key only in the baseline)
    missing_keys = baseline_indexed.index.difference(candidate_indexed.index)
    diff_records.extend(row_level_record(key, 'DELETED') for key in missing_keys)

    # Rows that were added (key only in the candidate)
    extra_keys = candidate_indexed.index.difference(baseline_indexed.index)
    diff_records.extend(row_level_record(key, 'ADDED') for key in extra_keys)

    common_keys = baseline_indexed.index.intersection(candidate_indexed.index)

    # A shared key that is duplicated cannot be paired with a single row on
    # the other side; fail with a clear message instead of an obscure
    # "truth value is ambiguous" error from the cell comparison below.
    for side, indexed in (('Before', baseline_indexed), ('After', candidate_indexed)):
        index = indexed.index
        if index.is_unique:
            continue
        duplicated = index[index.duplicated()].unique().intersection(common_keys)
        if len(duplicated) > 0:
            raise ValueError(
                f'キー列 {key_column} に重複した値があるため比較できません '
                f'({side}): {list(duplicated[:10])}'
            )

    # Cells that were modified
    for key in common_keys:
        for col in compare_columns:
            # .at reads the cell with its own column dtype. Extracting the
            # whole row first would upcast mixed int/float rows to float,
            # which hides changes in large integers.
            baseline_val = baseline_indexed.at[key, col]
            candidate_val = candidate_indexed.at[key, col]

            if pd.isna(baseline_val) and pd.isna(candidate_val):
                continue

            if baseline_val != candidate_val:
                diff_records.append({
                    'key': key,
                    'diff_type': 'MODIFIED',
                    'column': col,
                    'baseline_value': baseline_val,
                    'candidate_value': candidate_val
                })

    return pd.DataFrame(diff_records, columns=result_columns)

### 4. ID別詳細差分表示関数

In [9]:
def display_diff_by_id(diff_df, baseline_df, candidate_df, key_columns, ignore_columns=None):
    """Print a per-key, human-readable view of the differences in ``diff_df``.

    For every distinct key in ``diff_df`` (in sorted order) a header with the
    key is printed, followed by the previous and current row values. Key
    columns and ``ignore_columns`` are left out of the value lists. For
    modified rows the names of the changed columns are listed as well. The
    kind of difference is taken from the first ``diff_df`` record of the key.

    Args:
        diff_df: Difference records with ``key``, ``diff_type`` and
            ``column`` columns.
        baseline_df: The "before" data the differences refer to.
        candidate_df: The "after" data the differences refer to.
        key_columns: List of column names identifying a row.
        ignore_columns: Column names hidden from the output (defaults to none).

    Returns:
        None. Everything is written to standard output.
    """
    ignore_columns = [] if ignore_columns is None else ignore_columns

    if len(diff_df) == 0:
        print('差分はありません')
        return

    if len(key_columns) > 1:
        # Rebuild the combined key on copies so rows can be looked up by it.
        key_column = '_'.join(key_columns)
        key_display = '+'.join(key_columns)
        baseline_df = baseline_df.copy()
        candidate_df = candidate_df.copy()
        baseline_df[key_column] = baseline_df[key_columns].astype(str).agg('_'.join, axis=1)
        candidate_df[key_column] = candidate_df[key_columns].astype(str).agg('_'.join, axis=1)
    else:
        key_column = key_columns[0]
        key_display = key_column

    before_by_key = baseline_df.set_index(key_column)
    after_by_key = candidate_df.set_index(key_column)

    def is_shown(name):
        # Hide ignored columns, the key columns and the combined key column.
        return not (name in ignore_columns or name in key_columns or name == key_column)

    def render(frame, key):
        # Comma-separated values of the visible columns of the row for `key`.
        row = frame.loc[key]
        return ', '.join([str(row[name]) for name in row.index if is_shown(name)])

    separator = '=' * 40
    for key in sorted(diff_df['key'].unique()):
        entries = diff_df[diff_df['key'] == key]
        kind = entries['diff_type'].iloc[0]

        print(f'\n{key_display}: {key}')
        print(separator)

        if kind == 'DELETED':
            before_text = render(before_by_key, key)
            print(f'(前回の状態) {before_text}')
            print('(今回の状態) ---')
        elif kind == 'ADDED':
            after_text = render(after_by_key, key)
            print('(前回の状態) ---')
            print(f'(今回の状態) {after_text}')
        elif kind == 'MODIFIED':
            before_text = render(before_by_key, key)
            after_text = render(after_by_key, key)
            print(f'(前回の状態) {before_text}')
            print(f'(今回の状態) {after_text}')

            changed = entries['column'].tolist()
            print(f'変更された項目: {", ".join(changed)}')

### 5. ファイル比較実行

In [10]:
def _render_row_values(indexed_df, key, hidden_columns):
    """Return the visible values stored under ``key`` as one comma-joined string."""
    row = indexed_df.loc[key]
    return ', '.join(str(row[col]) for col in row.index if col not in hidden_columns)


def _write_detail_report(detail_path, filename, diff_df, baseline_df, candidate_df,
                         key_columns, ignore_columns):
    """Write the per-key detail report for ``diff_df`` to ``detail_path`` as UTF-8 text."""
    if len(key_columns) > 1:
        # Rebuild the combined key on copies so rows can be looked up by it.
        key_column = '_'.join(key_columns)
        key_display = '+'.join(key_columns)
        baseline_df = baseline_df.copy()
        candidate_df = candidate_df.copy()
        baseline_df[key_column] = baseline_df[key_columns].astype(str).agg('_'.join, axis=1)
        candidate_df[key_column] = candidate_df[key_columns].astype(str).agg('_'.join, axis=1)
    else:
        key_column = key_columns[0]
        key_display = key_column

    baseline_indexed = baseline_df.set_index(key_column)
    candidate_indexed = candidate_df.set_index(key_column)
    hidden_columns = set(ignore_columns) | set(key_columns) | {key_column}

    with open(detail_path, 'w', encoding='utf-8') as f:
        f.write(f'=== {filename} ID別詳細差分レポート ===\n\n')

        for key in sorted(diff_df['key'].unique()):
            key_diffs = diff_df[diff_df['key'] == key]
            # The first record of a key decides how the whole key is shown.
            diff_type = key_diffs['diff_type'].iloc[0]

            f.write(f'{key_display}: {key}\n')
            f.write('=' * 40 + '\n')

            if diff_type == 'DELETED':
                baseline_text = _render_row_values(baseline_indexed, key, hidden_columns)
                f.write(f'(前回の状態) {baseline_text}\n')
                f.write('(今回の状態) ---\n\n')
            elif diff_type == 'ADDED':
                candidate_text = _render_row_values(candidate_indexed, key, hidden_columns)
                f.write('(前回の状態) ---\n')
                f.write(f'(今回の状態) {candidate_text}\n\n')
            elif diff_type == 'MODIFIED':
                baseline_text = _render_row_values(baseline_indexed, key, hidden_columns)
                candidate_text = _render_row_values(candidate_indexed, key, hidden_columns)
                f.write(f'(前回の状態) {baseline_text}\n')
                f.write(f'(今回の状態) {candidate_text}\n')

                changed_columns = key_diffs['column'].tolist()
                f.write(f'変更された項目: {", ".join(changed_columns)}\n\n')


def compare_files(filename):
    """Compare one CSV file between the before and after folders and report the result.

    Reads ``filename`` from ``BEFORE_FOLDER`` and ``AFTER_FOLDER``, compares
    the two with ``compare_csv`` using the settings that
    ``get_file_config`` returns for the file (taken from the module-level
    ``config``), and prints a summary. When differences exist they are shown
    via ``display`` and ``display_diff_by_id`` and written to ``../output``
    as a CSV of the difference records plus a text detail report; both file
    names carry a timestamp. The output folder is created if it is missing.

    NOTE(review): ``display`` is not defined or imported in this file --
    presumably the IPython/Jupyter built-in, so this is expected to run
    inside a notebook session; confirm before running it as a plain script.

    Args:
        filename: Name of the CSV file (without folder) present in both folders.

    Returns:
        A tuple ``(diff_df, baseline_df, candidate_df)``.
    """
    before_path = os.path.join(BEFORE_FOLDER, filename)
    after_path = os.path.join(AFTER_FOLDER, filename)

    print(f'\n=== {filename} の比較 ===')

    baseline_df = pd.read_csv(before_path)
    candidate_df = pd.read_csv(after_path)

    print(f'Before: {len(baseline_df)} 行')
    print(f'After: {len(candidate_df)} 行')

    file_config = get_file_config(filename, config)
    key_columns = file_config['key_columns']
    ignore_columns = file_config['ignore_columns']
    diff_df = compare_csv(baseline_df, candidate_df, key_columns, ignore_columns)

    print(f'検出された差分: {len(diff_df)} 件')

    if len(diff_df) > 0:
        display(diff_df)

        display_diff_by_id(diff_df, baseline_df, candidate_df, key_columns, ignore_columns)

        # Make sure the report folder exists; otherwise the first run on a
        # fresh checkout fails when writing the CSV.
        output_dir = '../output'
        os.makedirs(output_dir, exist_ok=True)

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        base_name = filename.replace(".csv", "")

        output_path = f'{output_dir}/{base_name}_diff_{timestamp}.csv'
        # utf-8-sig writes a BOM at the start of the CSV.
        diff_df.to_csv(output_path, index=False, encoding='utf-8-sig')

        detail_path = f'{output_dir}/{base_name}_detail_{timestamp}.txt'
        _write_detail_report(detail_path, filename, diff_df, baseline_df, candidate_df,
                             key_columns, ignore_columns)

        print(f'差分レポートを出力しました: {output_path}')
        print(f'詳細レポートを出力しました: {detail_path}')
    else:
        print('差分はありません')

    return diff_df, baseline_df, candidate_df

# Run the comparison for every file present in both folders and keep the
# results per file name.
all_diffs = {}
for filename in common_files:
    diff_df, baseline_df, candidate_df = compare_files(filename)
    all_diffs[filename] = dict(
        diff_df=diff_df,
        baseline_df=baseline_df,
        candidate_df=candidate_df,
    )


=== sample.csv の比較 ===
Before: 4 行
After: 4 行
検出された差分: 5 件


Unnamed: 0,key,diff_type,column,baseline_value,candidate_value
0,4,DELETED,,,
1,5,ADDED,,,
2,1,MODIFIED,age,30,31
3,1,MODIFIED,salary,5000000,5200000
4,3,MODIFIED,department,人事,経理



id: 1
(前回の状態) 田中太郎, 30, 営業, 5000000
(今回の状態) 田中太郎, 31, 営業, 5200000
変更された項目: age, salary

id: 3
(前回の状態) 鈴木次郎, 35, 人事, 5500000
(今回の状態) 鈴木次郎, 35, 経理, 5500000
変更された項目: department

id: 4
(前回の状態) 高橋美咲, 28, 営業, 4800000
(今回の状態) ---

id: 5
(前回の状態) ---
(今回の状態) 山田一郎, 40, 開発, 6000000
差分レポートを出力しました: ../output/sample_diff_20251222_082340.csv
詳細レポートを出力しました: ../output/sample_detail_20251222_082340.txt

=== tccontract_20251217.csv の比較 ===
Before: 17406 行
After: 17406 行
検出された差分: 4 件


Unnamed: 0,key,diff_type,column,baseline_value,candidate_value
0,PHY0620-0014890,DELETED,,,
1,PHY0620-0013594,ADDED,,,
2,PHY0620-0014884,MODIFIED,diffflag,111111111.0,111111110
3,PHY0620-0012225,MODIFIED,contractstartdatetime,,test



vin: PHY0620-0012225
(前回の状態) 2025-12-17, nan, nan, nan, nan, nan, nan, nan, nan, nan, 111111111
(今回の状態) 2025-12-17, nan, nan, nan, test, nan, nan, nan, nan, nan, 111111111
変更された項目: contractstartdatetime

vin: PHY0620-0013594
(前回の状態) ---
(今回の状態) 2025-12-17, nan, nan, nan, nan, nan, nan, nan, nan, nan, 111111111

vin: PHY0620-0014884
(前回の状態) 2025-12-17, nan, nan, nan, nan, nan, nan, nan, nan, nan, 111111111
(今回の状態) 2025-12-17, nan, nan, nan, nan, nan, nan, nan, nan, nan, 111111110
変更された項目: diffflag

vin: PHY0620-0014890
(前回の状態) 2025-12-17, nan, nan, nan, nan, nan, nan, nan, nan, nan, 111111111
(今回の状態) ---
差分レポートを出力しました: ../output/tccontract_20251217_diff_20251222_082343.csv
詳細レポートを出力しました: ../output/tccontract_20251217_detail_20251222_082343.txt
