In [12]:
# Report which ODBC drivers pyodbc can see on this machine.
try:
    import pyodbc
except ImportError as exc:
    # exit() is a helper intended for the interactive interpreter; inside a
    # notebook it acts on the kernel instead of simply aborting this cell.
    # Raising stops the cell here and still ends a plain script with a
    # non-zero status.
    raise ImportError(
        "pyodbc module not found. Install it with 'pip install pyodbc'"
    ) from exc

# List all available ODBC drivers
drivers = pyodbc.drivers()

if drivers:
    print("ODBC drivers installed on your system:")
    for driver in drivers:
        print(driver)
else:
    print("No ODBC drivers found on your system.")


ODBC drivers installed on your system:
SQL Server


In [19]:
# Export every table listed in the mapping's "v7" column to "<table>_v7.csv".
import glob
import os

import pandas as pd
from sqlalchemy import create_engine

# Replace the connection string with your actual SQL Server details
# NOTE(review): the user name and password are embedded in this URL; consider
# loading them from the environment before sharing this notebook.
connection_string = (
    'mssql+pyodbc://pos:pos@vnsulataxs7db01.clienttest.btmgcs.com/xcenter?driver=SQL+Server'
)

# Create an SQLAlchemy engine
engine = create_engine(connection_string)

# Read the mapping CSV file
mapping_df = pd.read_csv(r'C:\Users\trieu.pham\Downloads\Mapping.csv')

# Extract the list of tables from the "v7" column. Blank cells are read by
# pandas as NaN and would otherwise be formatted into the SQL text, so drop them.
tables = mapping_df['v7'].dropna().tolist()

# First choice: filter by store and business date; fallback: filter by creation date.
template = "SELECT * FROM {table} WHERE rtl_loc_id = '51' AND {table}.business_date > '2023-05-14' ORDER BY trans_seq;"
template2 = "SELECT * FROM {table} WHERE create_date > '2023-05-14';"

# Location to export csv files
output_folder = r'C:\Users\trieu.pham\Downloads\Data Migration'
# Make sure the folder exists before anything is written into it
os.makedirs(output_folder, exist_ok=True)

# Clear all csv files in the output folder
files = glob.glob(output_folder + '/*.csv')
for f in files:
    os.remove(f)

try:
    for table in tables:
        # Try to check using the first template
        try:
            sql_query = template.format(table=table)
            df = pd.read_sql_query(sql_query, engine)
        except Exception as first_error:
            print(f"Failed to execute the query with the first template for table '{table}': {first_error}")
            print("Trying the second template...")

            # Try to check using the second template
            try:
                sql_query = template2.format(table=table)
                df = pd.read_sql_query(sql_query, engine)
            except Exception as second_error:
                print(f"Failed to execute the query with the second template for table '{table}': {second_error}")
                # Neither query worked: nothing to export for this table
                continue

        # Export the DataFrame to a CSV file in the output_folder
        csv_filename = f"{output_folder}/{table}_v7.csv"
        df.to_csv(csv_filename, index=False)
        print(f"Exported {table} to {csv_filename}")
finally:
    # Close the database connection, even if an export above raised
    engine.dispose()

print("All operations completed.")


Exported trn_trans to C:\Users\trieu.pham\Downloads\Data Migration/trn_trans_v7.csv
Exported trn_post_void_trans to C:\Users\trieu.pham\Downloads\Data Migration/trn_post_void_trans_v7.csv
Exported trn_trans_version to C:\Users\trieu.pham\Downloads\Data Migration/trn_trans_version_v7.csv
Exported trn_poslog_data to C:\Users\trieu.pham\Downloads\Data Migration/trn_poslog_data_v7.csv
Exported trn_receipt_lookup to C:\Users\trieu.pham\Downloads\Data Migration/trn_receipt_lookup_v7.csv
Exported trn_receipt_data to C:\Users\trieu.pham\Downloads\Data Migration/trn_receipt_data_v7.csv
Exported trn_trans_link to C:\Users\trieu.pham\Downloads\Data Migration/trn_trans_link_v7.csv
Exported trn_trans_notes to C:\Users\trieu.pham\Downloads\Data Migration/trn_trans_notes_v7.csv
Exported trn_trans_properties to C:\Users\trieu.pham\Downloads\Data Migration/trn_trans_properties_v7.csv
Exported trn_no_sale_trans to C:\Users\trieu.pham\Downloads\Data Migration/trn_no_sale_trans_v7.csv
Exported trl_rtrans 

In [21]:
# Export every table listed in the mapping's "v23" column to "<table>_v23.csv".
import os

import pandas as pd
from sqlalchemy import create_engine

# Replace the connection string with your actual SQL Server details
# NOTE(review): the user name and password are embedded in this URL; consider
# loading them from the environment before sharing this notebook.
connection_string = (
    'mssql+pyodbc://dtv:dtv@sulata-xu-02.cust.sgn.btmglobal.org/xcenter?driver=SQL+Server'
)

# Create an SQLAlchemy engine
engine = create_engine(connection_string)

# Read the mapping CSV file
mapping_df = pd.read_csv(r'C:\Users\trieu.pham\Downloads\Mapping.csv')

# Extract the list of tables from the "v23" column. Blank cells are read by
# pandas as NaN and would otherwise be formatted into the SQL text, so drop them.
tables = mapping_df['v23'].dropna().tolist()

# First choice: filter by store, business date and creating user;
# fallback: filter by creating user only.
template = "SELECT * FROM {table} WHERE rtl_loc_id = '51' AND {table}.business_date > '2023-05-14' AND create_user_id = 'DC_User' ORDER BY trans_seq;"
template2 = "SELECT * FROM {table} WHERE create_user_id = 'DC_User';"

# Location to export csv files
# Existing csv files are NOT cleared here: the v7 export writes to this same folder.
output_folder = r'C:\Users\trieu.pham\Downloads\Data Migration'
# Make sure the folder exists before anything is written into it
os.makedirs(output_folder, exist_ok=True)

try:
    for table in tables:
        # Try to check using the first template
        try:
            sql_query = template.format(table=table)
            df = pd.read_sql_query(sql_query, engine)
        except Exception as first_error:
            print(f"Failed to execute the query with the first template for table '{table}': {first_error}")
            print("Trying the second template...")

            # Try to check using the second template
            try:
                sql_query = template2.format(table=table)
                df = pd.read_sql_query(sql_query, engine)
            except Exception as second_error:
                print(f"Failed to execute the query with the second template for table '{table}': {second_error}")
                # Neither query worked: nothing to export for this table
                continue

        # Export the DataFrame to a CSV file in the output_folder
        csv_filename = f"{output_folder}/{table}_v23.csv"
        df.to_csv(csv_filename, index=False)
        print(f"Exported {table} to {csv_filename}")
finally:
    # Close the database connection, even if an export above raised
    engine.dispose()

print("All operations completed.")


Exported trn_trans to C:\Users\trieu.pham\Downloads\Data Migration/trn_trans_v23.csv
Exported trn_post_void_trans to C:\Users\trieu.pham\Downloads\Data Migration/trn_post_void_trans_v23.csv
Exported trn_trans_version to C:\Users\trieu.pham\Downloads\Data Migration/trn_trans_version_v23.csv
Exported trn_poslog_data to C:\Users\trieu.pham\Downloads\Data Migration/trn_poslog_data_v23.csv
Exported trn_receipt_lookup to C:\Users\trieu.pham\Downloads\Data Migration/trn_receipt_lookup_v23.csv
Exported trn_receipt_data to C:\Users\trieu.pham\Downloads\Data Migration/trn_receipt_data_v23.csv
Exported trn_trans_link to C:\Users\trieu.pham\Downloads\Data Migration/trn_trans_link_v23.csv
Exported trn_trans_notes to C:\Users\trieu.pham\Downloads\Data Migration/trn_trans_notes_v23.csv
Exported trn_trans_p to C:\Users\trieu.pham\Downloads\Data Migration/trn_trans_p_v23.csv
Exported trn_no_sale_trans to C:\Users\trieu.pham\Downloads\Data Migration/trn_no_sale_trans_v23.csv
Exported trl_rtrans to C:\Us

In [22]:
import pandas as pd
import os

# Mapping between v7 table names and their v23 counterparts
mapping_df = pd.read_csv(r'C:\Users\trieu.pham\Downloads\Mapping.csv')

# Where the exported CSV files are read from (both versions share one folder)
v7_folder = r'C:\Users\trieu.pham\Downloads\Data Migration'
v23_folder = r'C:\Users\trieu.pham\Downloads\Data Migration'

# Where differences, failed comparisons and "no difference" markers are written
output_folder = r'C:\Users\trieu.pham\Downloads\Data Migration\comparison_results'
failed_folder = r'C:\Users\trieu.pham\Downloads\Data Migration\failed_queries'
nodiff_folder = r'C:\Users\trieu.pham\Downloads\Data Migration\no_diff'

result_folders = (output_folder, failed_folder, nodiff_folder)

# Create whichever result folders are missing
for folder in result_folders:
    if not os.path.exists(folder):
        os.makedirs(folder)

# Empty every result folder so this run starts from a clean slate
for folder in result_folders:
    for file in os.listdir(folder):
        os.remove(os.path.join(folder, file))

# Audit columns that are left out of the comparison
exclude_columns = ['create_date', 'create_user_id', 'update_date', 'update_user_id']

# Function to compare two CSV files based on primary keys
def compare_csv_files(file1, file2, exclude_cols, primary_keys):
    """Compare two CSV exports after joining their rows on shared key columns.

    Parameters
    ----------
    file1, file2 : str
        Paths of the v7 and v23 CSV files, in that order.
    exclude_cols : list of str
        Columns dropped from both files before comparing. A name that also
        appears in ``primary_keys`` is kept.
    primary_keys : list of str
        Candidate join columns. Only the ones present in BOTH files are used.

    Returns
    -------
    pandas.DataFrame
        The result of ``DataFrame.compare`` (two-level columns, ``self`` for
        file1 and ``other`` for file2) restricted to rows with at least one
        differing value, plus one column per join key. Empty when nothing
        differs.

    Raises
    ------
    ValueError
        If none of ``primary_keys`` is a column of both files.

    Notes
    -----
    The join is an inner merge, so a row whose key appears in only one file
    is not reported. Key uniqueness is not checked; repeated key values are
    paired in every combination by the merge.
    """
    suffix_v7, suffix_v23 = '_v7', '_v23'

    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # Never drop a column that is needed as a join key.
    exclude_cols = [col for col in exclude_cols if col not in primary_keys]

    df1_compare = df1.drop(columns=exclude_cols, errors='ignore')
    df2_compare = df2.drop(columns=exclude_cols, errors='ignore')

    # Choose the join keys BEFORE aligning the columns. After alignment every
    # column exists in both frames (filled with NaN where it was missing), so a
    # key found in only one file would wrongly count as common and the merge
    # would match no rows at all.
    merge_keys = [
        key for key in primary_keys
        if key in df1_compare.columns and key in df2_compare.columns
    ]
    if not merge_keys:
        raise ValueError("No common primary keys found for comparison.")

    # Align the columns of both DataFrames (same labels, same sorted order)
    combined_columns = df1_compare.columns.union(df2_compare.columns)
    df1_compare = df1_compare.reindex(columns=combined_columns).sort_index(axis=1)
    df2_compare = df2_compare.reindex(columns=combined_columns).sort_index(axis=1)

    # Merge DataFrames on primary keys to ensure aligned comparison
    merged_df = pd.merge(
        df1_compare, df2_compare, on=merge_keys, suffixes=(suffix_v7, suffix_v23)
    )

    # Separate the columns to compare
    v7_columns = [col for col in merged_df.columns if col.endswith(suffix_v7)]
    v23_columns = [col for col in merged_df.columns if col.endswith(suffix_v23)]

    # Work on copies so relabelling never touches merged_df
    df_v7 = merged_df[v7_columns].copy()
    df_v23 = merged_df[v23_columns].copy()

    # Strip only the trailing merge suffix; a plain replace would also remove
    # the same text from the middle of a column name.
    df_v7.columns = [col[:-len(suffix_v7)] for col in v7_columns]
    df_v23.columns = [col[:-len(suffix_v23)] for col in v23_columns]

    # Compare the DataFrames
    comparison_result = df_v7.compare(df_v23)

    # Attach the key values of the differing rows; the assignment aligns on the
    # row index that compare() preserved from merged_df.
    if not comparison_result.empty:
        for key in merge_keys:
            comparison_result[key] = merged_df[key]

    return comparison_result

# Loop through the mapping and compare the respective CSV files
# For each mapping row, compare "<v7 table>_v7.csv" with "<v23 table>_v23.csv"
# and record the outcome in one of the three result folders.
for index, row in mapping_df.iterrows():
    v7_table = row['v7']
    v23_table = row['v23']

    # A blank cell in the mapping is read as NaN (a float); calling
    # .startswith on it would raise and abort the whole run, so skip the row.
    if pd.isna(v7_table) or pd.isna(v23_table):
        print(f"Skipping incomplete mapping row {index}: v7={v7_table!r}, v23={v23_table!r}")
        continue

    # Determine the primary keys based on table name
    if v7_table.startswith('xom_'):
        primary_keys = ['order_id']
    else:
        primary_keys = ['trans_seq', 'rtrans_lineitm_seq']

    v7_file = f"{v7_folder}/{v7_table}_v7.csv"
    v23_file = f"{v23_folder}/{v23_table}_v23.csv"

    try:
        comparison_result = compare_csv_files(v7_file, v23_file, exclude_columns, primary_keys)

        if not comparison_result.empty:
            # Save the comparison result to a CSV file
            comparison_result_file = f"{output_folder}/comparison_{v7_table}_vs_{v23_table}.csv"
            comparison_result.to_csv(comparison_result_file, index=False)
            print(f"Differences found for {v7_table} vs {v23_table} and saved to {comparison_result_file}")
        else:
            print(f"No differences found for {v7_table} vs {v23_table}")
            # Leave a marker file so tables without differences are still listed
            no_diff_file = f"{nodiff_folder}/no_diff_{v7_table}_vs_{v23_table}.csv"
            with open(no_diff_file, 'w') as f:
                f.write('No differences found')

    except Exception as e:
        # Best effort: one failing pair must not stop the remaining comparisons
        print(f"Failed to compare {v7_table} with {v23_table}: {e}")
        # Save the error details to the failed folder
        error_details = {
            'v7_table': [v7_table],
            'v23_table': [v23_table],
            'error': [str(e)]
        }
        failed_comparison_df = pd.DataFrame(error_details)
        failed_comparison_file = f"{failed_folder}/failed_comparison_{v7_table}_vs_{v23_table}.csv"
        failed_comparison_df.to_csv(failed_comparison_file, index=False)

print("Comparison completed.")


Differences found for trn_trans vs trn_trans and saved to C:\Users\trieu.pham\Downloads\Data Migration\comparison_results/comparison_trn_trans_vs_trn_trans.csv
No differences found for trn_post_void_trans vs trn_post_void_trans
No differences found for trn_trans_version vs trn_trans_version
Differences found for trn_poslog_data vs trn_poslog_data and saved to C:\Users\trieu.pham\Downloads\Data Migration\comparison_results/comparison_trn_poslog_data_vs_trn_poslog_data.csv
Differences found for trn_receipt_lookup vs trn_receipt_lookup and saved to C:\Users\trieu.pham\Downloads\Data Migration\comparison_results/comparison_trn_receipt_lookup_vs_trn_receipt_lookup.csv
Differences found for trn_receipt_data vs trn_receipt_data and saved to C:\Users\trieu.pham\Downloads\Data Migration\comparison_results/comparison_trn_receipt_data_vs_trn_receipt_data.csv
No differences found for trn_trans_link vs trn_trans_link
No differences found for trn_trans_notes vs trn_trans_notes
Differences found for 

In [23]:
import pandas as pd

def compare_csv_files(file1, file2, output_file, primary_keys, duplicates_file):
    """Compare two CSV files keyed on ``primary_keys`` and write the differences.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two CSV files to compare.
    output_file : str
        CSV path that receives the ``DataFrame.compare`` result. It is written
        even when no difference is found.
    primary_keys : list of str
        Columns that identify a row; they become the index of both frames.
    duplicates_file : str
        CSV path that receives every row whose key occurs more than once in
        either file. Written only when such rows exist.

    Returns
    -------
    None

    Notes
    -----
    After the duplicates are exported, only the first row of each key is kept.
    Both frames are then reindexed to the union of their keys and columns, so
    a row or column present in only one file is compared against missing
    values.
    """
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # Identify and export duplicates in the primary key columns
    duplicates_df1 = df1[df1.duplicated(subset=primary_keys, keep=False)]
    duplicates_df2 = df2[df2.duplicated(subset=primary_keys, keep=False)]

    if not duplicates_df1.empty or not duplicates_df2.empty:
        duplicates = pd.concat([duplicates_df1, duplicates_df2])
        duplicates.to_csv(duplicates_file, index=False)
        print(f"Duplicates found and saved to {duplicates_file}")

    # Keep the first row per key and use the primary keys as the index
    df1 = df1.drop_duplicates(subset=primary_keys).set_index(primary_keys)
    df2 = df2.drop_duplicates(subset=primary_keys).set_index(primary_keys)

    # Align the columns of both DataFrames
    combined_columns = df1.columns.union(df2.columns)
    df1 = df1.reindex(columns=combined_columns)
    df2 = df2.reindex(columns=combined_columns)

    # Ensure both DataFrames have the same index
    combined_index = df1.index.union(df2.index)
    df1 = df1.reindex(combined_index)
    df2 = df2.reindex(combined_index)

    # Compare the DataFrames
    comparison_result = df1.compare(df2)

    # Add the primary keys as regular columns for tracking. They must come from
    # the result's own index: compare() keeps only the differing rows, so the
    # full combined index usually has a different length and cannot be assigned.
    if not comparison_result.empty:
        for key in primary_keys:
            comparison_result[key] = comparison_result.index.get_level_values(key)

    # Save the comparison result to a CSV file
    comparison_result.to_csv(output_file)
    print(f"Differences saved to {output_file}")

# Example usage
# Key columns that identify a row in the xom_address_mod exports
primary_keys = ['order_id', 'address_seq']

# Input files (v7 and v23 exports of the same table)
file1 = r'C:\Users\trieu.pham\Dropbox\Projects\data_csv\Data Migration\xom_address_mod_v7.csv'
file2 = r'C:\Users\trieu.pham\Dropbox\Projects\data_csv\Data Migration\xom_address_mod_v23.csv'

# Output files for the differences and for rows with repeated keys
output_file = r'C:\Users\trieu.pham\Dropbox\Projects\data_csv\Data Migration\output_comparison.csv'
duplicates_file = r'C:\Users\trieu.pham\Dropbox\Projects\data_csv\Data Migration\duplicates.csv'

compare_csv_files(
    file1=file1,
    file2=file2,
    output_file=output_file,
    primary_keys=primary_keys,
    duplicates_file=duplicates_file,
)


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\trieu.pham\\Dropbox\\Projects\\data_csv\\Data Migration\\xom_address_mod_v7.csv'