README
>>>>>Generate mock data files based on the schema files
>>>>>Generate master files (Party, Account) with a record count greater than that of the mock data files
>>>>>Replace the Party and Account number values in the mock data with values from the master files to enforce referential integrity (RI) if needed


In [None]:
#REPLACE PARTY,ACCOUNT WITH MASTER FOR JOINING#

import pandas as pd
from tkinter import Tk
from tkinter.filedialog import askopenfilename

# Function to browse and select a file
def browse_file(prompt):
    """Show a file-open dialog and return the path the user picked.

    Args:
        prompt: Text used as the dialog's title.

    Returns:
        Whatever ``askopenfilename`` returns: the selected path, or an
        empty (falsy) value when the dialog is cancelled.
    """
    root = Tk()
    root.withdraw()  # We don't want a full GUI, so keep the root window from appearing
    try:
        return askopenfilename(title=prompt, parent=root)
    finally:
        # A fresh root is created on every call; destroy it so hidden Tk
        # windows do not pile up when several files are selected in a row.
        root.destroy()

# Get user inputs for source file, target file, and column name
source = browse_file("Select the source CSV file")
target = browse_file("Select the target CSV file")

key_column = input("Enter the column name to check for duplicates (e.g., 'ACCT_NUM'): ").strip().lower()


def _read_csv_with_string_key(path, key):
    """Load a CSV with lower-cased headers, reading the key column as text.

    The header is inspected first because the dtype mapping passed to
    ``read_csv`` must use the column name exactly as it appears in the file
    (e.g. 'ACCT_NUM'), while ``key`` is already lower-cased. Without this the
    string dtype is not applied and numeric-looking keys lose leading zeros.

    Raises:
        KeyError: if no header matches ``key`` case-insensitively.
    """
    header = pd.read_csv(path, nrows=0).columns
    string_dtypes = {name: str for name in header if str(name).lower() == key}
    if not string_dtypes:
        raise KeyError(f"Column '{key}' not found in '{path}'")
    frame = pd.read_csv(path, dtype=string_dtypes)
    frame.columns = frame.columns.str.lower()
    return frame


# Load the two CSV files with the key column as a string and lowercase headers
df_source = _read_csv_with_string_key(source, key_column)
df_target = _read_csv_with_string_key(target, key_column)

# Remove duplicates based on the key column.
# The target is copied explicitly because it is modified below; assigning into
# the raw drop_duplicates() result can trigger SettingWithCopyWarning.
df_unique_source = df_source.drop_duplicates(subset=[key_column])
df_unique_target = df_target.drop_duplicates(subset=[key_column]).copy()

# Replace the column values only if the target has no more unique keys than
# the source; otherwise there are not enough source keys to hand out.
if len(df_unique_target) > len(df_unique_source):
    raise ValueError("Target count is greater than source count. Replacement not possible.")

# Positional assignment: the first N unique source keys overwrite the N target keys.
df_unique_target[key_column] = df_unique_source[key_column].values[:len(df_unique_target)]


# Save the updated DataFrame back to a CSV file (written to the working directory)
output_file = 'updated_' + target.split('/')[-1]
df_unique_target.to_csv(output_file, index=False)

print(f"Column values replaced successfully! The updated file is saved as '{output_file}'")


In [90]:
##VALIDATOR##
import pandas as pd
from tkinter import Tk
from tkinter.filedialog import askopenfilename

# Function to browse and select a file
def browse_file(prompt):
    """Show a file-open dialog and return the path the user picked.

    Args:
        prompt: Text used as the dialog's title.

    Returns:
        Whatever ``askopenfilename`` returns: the selected path, or an
        empty (falsy) value when the dialog is cancelled.
    """
    root = Tk()
    root.withdraw()  # We don't want a full GUI, so keep the root window from appearing
    try:
        return askopenfilename(title=prompt, parent=root)
    finally:
        # A fresh root is created on every call; destroy it so hidden Tk
        # windows do not pile up when several files are selected in a row.
        root.destroy()

# Get user inputs for the three files and column names.
# The account file and its key column are required by the second merge below,
# so they are requested here as well (they had been commented out, which left
# account_key_column and df_account undefined on a fresh kernel).
party_file = browse_file("Select the party CSV file")
party_account_file = browse_file("Select the party account CSV file")
account_file = browse_file("Select the account CSV file")

party_key_column = input("Enter the column name to join party and party account files on (e.g., 'party_num'): ").strip().lower()
account_key_column = input("Enter the column name to join party account and account files on (e.g., 'acct_num'): ").strip().lower()


def _read_csv_lowercase(path, string_columns):
    """Load a CSV with lower-cased headers, reading the given columns as text.

    ``string_columns`` holds lower-cased names; they are matched against the
    file's real headers case-insensitively because the dtype mapping passed to
    ``read_csv`` must use the names exactly as they appear in the file.
    Names that are absent from the file are skipped here; a missing join key
    surfaces later when the merge is attempted.
    """
    wanted = set(string_columns)
    header = pd.read_csv(path, nrows=0).columns
    string_dtypes = {name: str for name in header if str(name).lower() in wanted}
    frame = pd.read_csv(path, dtype=string_dtypes)
    frame.columns = frame.columns.str.lower()
    return frame


# Load the three CSV files with the key columns as strings and lowercase headers
df_party = _read_csv_lowercase(party_file, [party_key_column])
df_party_account = _read_csv_lowercase(party_account_file, [party_key_column, account_key_column])
df_account = _read_csv_lowercase(account_file, [account_key_column])

# Perform inner joins on the key columns: party -> party account -> account.
# Rows whose key has no match on the other side are dropped by the inner join.
df_merged1 = df_party.merge(df_party_account, on=party_key_column)
df_merged2 = df_merged1.merge(df_account, on=account_key_column)

# Show the output
print("Merged DataFrame:")
print(df_merged2)



Merged DataFrame:
             party_num          party_name address_line    acct_num  \
0   459687844127169650          James Cole      unknown  6276363016   
1   242366242910176282         Lee Collins      unknown  3386081460   
2   355033838990046324        Randy Garcia      unknown  8843539658   
3   753504545835546895      Matthew Norris      unknown  8918063362   
4   205372666969557920      Kelsey Roberts      unknown  4566468088   
5   845669279428142160        Joseph Davis      unknown  8227832189   
6   345050757437515908      Bonnie Johnson      unknown  5684371681   
7   205036380121880285      Jennifer Moody      unknown  1780784924   
8   619806098608840618        Patrick Howe      unknown  3178535577   
9   354506827549611183    Kimberly Osborne      unknown  4488710451   
10  363881710539249517  Katherine Martinez      unknown  8465899971   
11  794815621605814551    Kristopher Ayala      unknown  6148855403   
12  576739727161994012   Kimberly Williams      unknown  67

In [None]:
##GENERATE TEST DATA###
import pandas as pd
import numpy as np
import glob
import random
import string
from faker import Faker

faker = Faker()

# Function to generate random string of a given length
def random_string(length):
    """Return a string of `length` lowercase ASCII letters drawn at random (with repetition)."""
    letters = random.choices(string.ascii_lowercase, k=length)
    return ''.join(letters)

# Function to generate random integer with a specific number of digits
def random_integer(min_digits, max_digits):
    """Return a random integer having between `min_digits` and `max_digits` digits.

    The value is drawn with numpy's global random state as a 64-bit integer
    from the range [10**(min_digits - 1), 10**max_digits - 1].
    """
    lowest = 10 ** (min_digits - 1)
    # randint's upper bound is exclusive, so 10**max_digits keeps the largest
    # max_digits-digit number (10**max_digits - 1) inside the range.
    exclusive_upper = 10 ** max_digits
    return np.random.randint(lowest, exclusive_upper, dtype=np.int64)

# Function to generate unique IDs
def generate_unique_ids(num_rows):
    """Return the sequential IDs 1, 2, ..., num_rows as a list."""
    id_sequence = range(1, num_rows + 1)
    return [*id_sequence]

# Function to generate data based on schema
def _parse_dtype_spec(dtype_info):
    """Split a 'type:min:max:val1|val2' spec into (type, min, max, values).

    Fields that are missing or left blank come back as None, so a spec such as
    'str:::A|B' (permissible values without length bounds) is accepted instead
    of failing on int('').
    """
    parts = str(dtype_info).split(':')
    parts += [''] * (4 - len(parts))
    dtype = parts[0].strip()
    min_text = parts[1].strip()
    max_text = parts[2].strip()
    values_text = parts[3]
    min_length = int(min_text) if min_text else None
    max_length = int(max_text) if max_text else None
    permissible_values = values_text.split('|') if values_text else None
    return dtype, min_length, max_length, permissible_values


def generate_data(schema, num_rows):
    """Build a DataFrame of random test data described by `schema`.

    Args:
        schema: DataFrame with one row per output column. Read positionally:
            first cell is the column name, second cell is the type spec
            ('type', 'type:min:max' or 'type:min:max:val1|val2'), and an
            optional third cell equal to 'primary_key' marks a key column.
        num_rows: Number of rows to generate.

    Returns:
        A DataFrame with `num_rows` rows and one column per schema row.

    Generation rules, in priority order: primary keys get sequential IDs;
    a permissible-values list is sampled from; otherwise the type name picks
    the generator ('int', 'float', 'name', 'date', 'datetime', 'bool_1_0',
    'str'). Any other type name yields the constant 'unknown'.
    """
    data = {}
    # Plain tuples give positional access regardless of the schema's column labels.
    for row in schema.itertuples(index=False, name=None):
        col, dtype_info = row[0], row[1]
        is_primary_key = len(row) > 2 and row[2] == 'primary_key'
        dtype, min_length, max_length, permissible_values = _parse_dtype_spec(dtype_info)

        if is_primary_key:
            data[col] = generate_unique_ids(num_rows)
        elif permissible_values:
            data[col] = np.random.choice(permissible_values, num_rows)
        elif dtype == 'int':
            # For ints, min/max are digit counts; a bound of 0 counts as "not given".
            if min_length and max_length:
                data[col] = [random_integer(min_length, max_length) for _ in range(num_rows)]
            else:
                data[col] = np.random.randint(1, 100, num_rows)
        elif dtype == 'float':
            data[col] = np.random.rand(num_rows) * 100
        elif dtype == 'name':
            data[col] = [faker.name() for _ in range(num_rows)]
        elif dtype == 'date':
            data[col] = [faker.date() for _ in range(num_rows)]
        elif dtype == 'datetime':
            data[col] = [faker.date_time().isoformat() for _ in range(num_rows)]
        elif dtype == 'bool_1_0':
            data[col] = np.random.choice([1, 0], num_rows)
        elif dtype == 'str':
            # For strings, min/max are character counts.
            if min_length and max_length:
                lengths = np.random.randint(min_length, max_length + 1, num_rows)
                data[col] = [random_string(length) for length in lengths]
            elif min_length:
                # Only a minimum given: use it as a fixed length.
                data[col] = [random_string(min_length) for _ in range(num_rows)]
            else:
                data[col] = np.random.choice(['A', 'B', 'C'], num_rows)
        else:
            data[col] = np.random.choice(['unknown'], num_rows)
    return pd.DataFrame(data)

# Load all transposed schema files (one row per column: name, type spec, optional key marker).
# Sorted so the files are processed in a stable order from run to run.
schema_files = sorted(glob.glob('schema_*.csv'))  # Adjust the pattern if needed
if not schema_files:
    print("No schema files matching 'schema_*.csv' were found; nothing to generate.")
# Generate and save test data for each schema file
num_rows = 50
for schema_file in schema_files:
    schema = pd.read_csv(schema_file, header=None)
    test_data = generate_data(schema, num_rows)
    # Swap only the leading 'schema_' prefix; without the count argument a name
    # such as 'schema_party_schema_v2.csv' would have both occurrences replaced.
    output_file = schema_file.replace('schema_', 'test_data_', 1)  # Create corresponding output file name
    test_data.to_csv(output_file, index=False)
    print(f"Test data generated and saved to '{output_file}'")


In [None]:
##GENERATE MASTER DATA###
import pandas as pd
import numpy as np

# Function to generate unique 18-digit party numbers
def generate_unique_party_numbers(num_rows):
    """Return `num_rows` distinct 18-digit numbers as strings.

    Candidates are drawn with numpy's global random state until enough
    distinct values have been collected. The returned list follows the
    set's iteration order, so its ordering carries no meaning.
    """
    lower, upper = 10**17, 10**18  # upper bound is exclusive
    seen = set()
    while len(seen) < num_rows:
        candidate = np.random.randint(lower, upper, dtype=np.int64)
        seen.add(str(candidate))
    return list(seen)

# Function to generate unique 10-digit account numbers
def generate_unique_account_numbers(num_rows):
    """Return `num_rows` distinct 10-digit numbers as strings.

    Candidates are drawn with numpy's global random state until enough
    distinct values have been collected. The returned list follows the
    set's iteration order, so its ordering carries no meaning.
    """
    lower, upper = 10**9, 10**10  # upper bound is exclusive
    seen = set()
    while len(seen) < num_rows:
        candidate = np.random.randint(lower, upper, dtype=np.int64)
        seen.add(str(candidate))
    return list(seen)

# Function to define roles
def define_roles(num_rows):
    """Draw `num_rows` role labels at random from 'Primary' and 'Secondary'.

    Returns a numpy array; sampling is with replacement via numpy's global
    random state.
    """
    available_roles = ['Primary', 'Secondary']
    sampled = np.random.choice(available_roles, num_rows)
    return sampled

# Number of rows for each file
num_rows_account = 50
num_rows_party = num_rows_account * 2

# Generate data
party_numbers = generate_unique_party_numbers(num_rows_party)
account_numbers = generate_unique_account_numbers(num_rows_account)
roles = define_roles(num_rows_account)

# Create DataFrames
df_party = pd.DataFrame({'PARTY_NUM': party_numbers})
df_account = pd.DataFrame({'ACCT_NUM': account_numbers})
df_role = pd.DataFrame({'Role': roles})

# Save DataFrames to CSV files
df_party.to_csv('party_numbers.csv', index=False)
df_account.to_csv('account_numbers.csv', index=False)
df_role.to_csv('roles.csv', index=False)

# Combine data into a single customer/account/role table.
combined_data = []
used_combinations = set()

# Pass 1: pair parties, accounts and roles positionally. zip stops at the
# shortest input, so this no longer raises IndexError if the party count is
# ever configured below the account count. Each (account, role) pair is
# recorded at most once in this pass.
for customer, account, role in zip(party_numbers, account_numbers, roles):
    if (account, role) not in used_combinations:
        combined_data.append({'Customer': customer, 'Account': account, 'Role': role})
        used_combinations.add((account, role))

# Pass 2: give every party number not yet assigned a random account and role.
# NOTE: this pass does not consult used_combinations, so an (account, role)
# pair may end up shared by several customers.
assigned_customers = {entry['Customer'] for entry in combined_data}
remaining_customers = set(party_numbers) - assigned_customers
for customer in remaining_customers:
    account = np.random.choice(account_numbers)
    role = np.random.choice(roles)
    combined_data.append({'Customer': customer, 'Account': account, 'Role': role})

# Pass 3: ensure all account numbers are used at least once. Only finds work
# when there are fewer parties than accounts and pass 2 did not happen to
# cover every account.
used_accounts = {entry['Account'] for entry in combined_data}
remaining_accounts = set(account_numbers) - used_accounts
for account in remaining_accounts:
    customer = np.random.choice(party_numbers)
    role = np.random.choice(roles)
    combined_data.append({'Customer': customer, 'Account': account, 'Role': role})

# Every party number has been assigned by passes 1 and 2, so no further pass
# over the party numbers is needed.

df_combined = pd.DataFrame(combined_data)

# Save the combined DataFrame to a CSV file
df_combined.to_csv('customer_account_role.csv', index=False)

print("Files generated successfully!")



