Lookup Generator

In [None]:
 import pandas as pd
 import numpy as np
 import glob
 import random
 import string
 from faker import Faker

 faker=Faker()

 # Function to generate random string of a given length
 def random_string(length):
     """Return a string of ``length`` randomly chosen lowercase ASCII letters.

     Letters are drawn with replacement via ``random.choices``, so a length
     of 0 produces the empty string.
     """
     alphabet = string.ascii_lowercase
     return ''.join(random.choices(alphabet, k=length))
 # Function to generate random integer with a specific number of digits
 def random_integer(min_digits, max_digits):
     """Return a random integer having between ``min_digits`` and ``max_digits`` digits.

     The value is drawn with ``np.random.randint`` using ``dtype=np.int64``
     from the inclusive range ``[10**(min_digits - 1), 10**max_digits - 1]``.
     """
     lower = 10 ** (min_digits - 1)
     # randint excludes its upper bound, so 10**max_digits is one past the
     # largest value that still has max_digits digits.
     upper_exclusive = 10 ** max_digits
     return np.random.randint(lower, upper_exclusive, dtype=np.int64)
 # Function to generate data based on schema
 def generate_data(schema, num_rows):
     """Build a DataFrame of random test data described by ``schema``.

     Each row of ``schema`` describes one output column. The first field is
     the column name and the second is a type spec of the form
     ``type[:min[:max]]``, for example ``int:2:4`` or ``str:5``.

     Supported types:
       * ``int``      - ``int:min:max`` gives integers with ``min``..``max``
                        digits, ``int:n`` gives exactly ``n`` digits, plain
                        ``int`` gives integers from 1 to 99.
       * ``float``    - floats in [0, 100).
       * ``name``     - ``faker.name()`` values.
       * ``date``     - ``faker.date()`` values.
       * ``datetime`` - ``faker.date_time().isoformat()`` values.
       * ``LOB``      - the constant ``'MORTGAGE'``.
       * ``str``      - ``str:min:max`` gives random lowercase strings whose
                        length is in ``min``..``max``, ``str:n`` gives strings
                        of exactly ``n`` characters, plain ``str`` picks from
                        a fixed pair of city names.
       * anything else - the constant ``'unknown'``.

     Relies on the module-level ``faker`` instance and on the
     ``random_integer`` / ``random_string`` helpers.

     Args:
         schema: DataFrame whose first two columns hold the column name and
             the type spec (column labels are irrelevant).
         num_rows: number of rows to generate.

     Returns:
         pandas.DataFrame with one column per schema row and ``num_rows`` rows.

     Raises:
         ValueError: if a length in the type spec is not an integer, or (from
             numpy) if a ``min:max`` range is inverted.
     """
     data = {}
     for _, row in schema.iterrows():
         # Positional access works whatever the schema's column labels are;
         # integer keys on a label-indexed Series are deprecated in pandas.
         col = row.iloc[0]
         # Strip each part so specs such as " int : 2 : 4" (common when the
         # CSV has a space after the comma) are still recognised.
         dtype_parts = [part.strip() for part in str(row.iloc[1]).split(':')]
         dtype = dtype_parts[0]
         min_length = int(dtype_parts[1]) if len(dtype_parts) > 1 else None
         max_length = int(dtype_parts[2]) if len(dtype_parts) > 2 else None
         if dtype == 'int':
             # Zero digits is not a meaningful width, so a 0 here still falls
             # back to the default range below.
             if min_length and max_length:
                 data[col] = [random_integer(min_length, max_length) for _ in range(num_rows)]
             elif min_length and max_length is None:
                 # "int:n" means exactly n digits, mirroring "str:n".
                 data[col] = [random_integer(min_length, min_length) for _ in range(num_rows)]
             else:
                 data[col] = np.random.randint(1, 100, num_rows)
         elif dtype == 'float':
             data[col] = np.random.rand(num_rows) * 100
         elif dtype == 'name':
             data[col] = [faker.name() for _ in range(num_rows)]
         elif dtype == 'date':
             data[col] = [faker.date() for _ in range(num_rows)]
         elif dtype == 'datetime':
             data[col] = [faker.date_time().isoformat() for _ in range(num_rows)]
         elif dtype == 'LOB':
             data[col] = np.random.choice(['MORTGAGE'], num_rows)
         elif dtype == 'str':
             # Compare against None rather than truthiness: a minimum length
             # of 0 is a valid request ("str:0:5"), not a missing one.
             if min_length is not None and max_length is not None:
                 lengths = np.random.randint(min_length, max_length + 1, num_rows)
                 data[col] = [random_string(length) for length in lengths]
             elif min_length is not None:
                 data[col] = [random_string(min_length) for _ in range(num_rows)]
             else:
                 data[col] = np.random.choice(['Madrid', 'Liverpool'], num_rows)
         else:
             data[col] = np.random.choice(['unknown'], num_rows)
     return pd.DataFrame(data)
 # Load all transposed schema files
 schema_files = glob.glob('lookup_schema.csv')  # Adjust the pattern if needed
 # Generate and save test data for each schema file
 num_rows = 10
 for schema_file in schema_files:
     # header=None: the first line of the schema file is data, not a header.
     transposed_schema = pd.read_csv(schema_file, header=None)
     test_data = generate_data(transposed_schema, num_rows)
     # Output names are fixed, so if the pattern matches several schema
     # files each one overwrites the previous output.
     output_csv_file = 'lookup.csv'
     output_json_file = 'lookup.json'
     test_data.to_csv(output_csv_file, index=False)
     # index=False is only accepted for some orients; with the default
     # 'columns' orient, pandas releases before 2.1 raise ValueError.
     # orient='records' writes a list of row objects, which never contains
     # the index, and is valid on every pandas version.
     test_data.to_json(output_json_file, orient='records')
     print(f"Test data generated and saved to '{output_csv_file}'")

JSON CONVERTER

In [52]:
import os
import csv
import json

# Function to convert a single CSV file to JSON
def convert_csv_to_json(file_path):
    """Read a CSV file and return its rows as JSON-serialisable data.

    The first line of the file is used as the header. Despite the name, no
    JSON text is produced here: the result is a list with one dict per data
    row, mapping each header name to that row's value as a string.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        list of dicts, one per data row, in file order.
    """
    # newline='' is required by the csv module: without it the text layer
    # rewrites "\r\n" inside quoted fields to "\n" before csv sees it.
    with open(file_path, 'r', newline='') as csv_file:
        return list(csv.DictReader(csv_file))

# Directory scanned for CSV inputs
directory = 'output'  # Replace with the actual directory containing the CSV files

# Collects one {'filename', 'content'} record per CSV file found
all_files_data = []

# Walk the directory listing, skipping every entry that is not a .csv file
for filename in os.listdir(directory):
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(directory, filename)
    file_data = convert_csv_to_json(file_path)
    all_files_data.append(dict(filename=filename, content=file_data))

# Serialise the collected records as indented JSON text
json_data = json.dumps(all_files_data, indent=4)

# Write the JSON text to the combined output file
with open('all_csv_files_data.json', 'w') as json_file:
    json_file.write(json_data)

print("All CSV files have been converted to JSON and saved to 'all_csv_files_data.json'.")


All CSV files have been converted to JSON and saved to 'all_csv_files_data.json'.


WITH LOOKUP

In [None]:
 import pandas as pd
 import numpy as np
 import glob
 import random
 import string

 from faker import Faker

 faker=Faker()

 # Function to generate random string of a given length
 def random_string(length):
     """Build a random string made of ``length`` lowercase ASCII letters.

     Characters are sampled with replacement; ``length`` of 0 yields ``''``.
     """
     letters = random.choices(string.ascii_lowercase, k=length)
     return ''.join(letters)
 # Function to generate random integer with a specific number of digits 
 def random_integer(min_digits, max_digits):
     """Draw a random integer whose digit count lies in ``min_digits``..``max_digits``.

     The smallest candidate is ``10**(min_digits - 1)`` and the largest is
     ``10**max_digits - 1``; the draw uses ``np.random.randint`` with
     ``dtype=np.int64``.
     """
     smallest = 10 ** (min_digits - 1)
     largest = 10 ** max_digits - 1
     # randint's ``high`` is exclusive, hence the + 1 to keep ``largest`` reachable.
     return np.random.randint(low=smallest, high=largest + 1, dtype=np.int64)
 # Function to generate data based on schema
 def generate_data(schema, num_rows, lookup_data=None):
     """Build a DataFrame of random test data, reusing lookup values where available.

     Each row of ``schema`` describes one output column. The first field is
     the column name and the second is a type spec of the form
     ``type[:min[:max]]``, for example ``int:2:4`` or ``str:5``.

     If ``lookup_data`` has a column with the same name, the values for that
     column are sampled (with replacement) from the lookup column and the type
     spec is not used to generate values. Each column is sampled on its own,
     so values from different lookup columns are not kept row-aligned.

     Otherwise the type decides the values:
       * ``int``      - ``int:min:max`` gives integers with ``min``..``max``
                        digits, ``int:n`` gives exactly ``n`` digits, plain
                        ``int`` gives integers from 1 to 99.
       * ``float``    - floats in [0, 100).
       * ``name``     - ``faker.name()`` values.
       * ``date``     - ``faker.date()`` values.
       * ``datetime`` - ``faker.date_time().isoformat()`` values.
       * ``bool``     - ``'Y'`` or ``'N'``.
       * ``str``      - ``str:min:max`` gives random lowercase strings whose
                        length is in ``min``..``max``, ``str:n`` gives strings
                        of exactly ``n`` characters, plain ``str`` picks from
                        ``'AUTO'``, ``'DEPOSITS'``, ``'MORTGAGE'``.
       * anything else - the constant ``'unknown'``.

     Relies on the module-level ``faker`` instance and on the
     ``random_integer`` / ``random_string`` helpers.

     Args:
         schema: DataFrame whose first two columns hold the column name and
             the type spec (column labels are irrelevant).
         num_rows: number of rows to generate.
         lookup_data: optional DataFrame of reference values keyed by column
             name.

     Returns:
         pandas.DataFrame with one column per schema row and ``num_rows`` rows.

     Raises:
         ValueError: if a length in the type spec is not an integer, or (from
             numpy) if a ``min:max`` range is inverted or a lookup column is
             empty while ``num_rows`` is positive.
     """
     data = {}
     for _, row in schema.iterrows():
         # Positional access works whatever the schema's column labels are;
         # integer keys on a label-indexed Series are deprecated in pandas.
         col = row.iloc[0]
         # Strip each part so specs such as " int : 2 : 4" (common when the
         # CSV has a space after the comma) are still recognised.
         dtype_parts = [part.strip() for part in str(row.iloc[1]).split(':')]
         dtype = dtype_parts[0]
         min_length = int(dtype_parts[1]) if len(dtype_parts) > 1 else None
         max_length = int(dtype_parts[2]) if len(dtype_parts) > 2 else None
         if lookup_data is not None and col in lookup_data.columns:
             # Lookup columns win over the declared type so generated rows
             # only contain values that exist in the lookup table.
             data[col] = np.random.choice(lookup_data[col].values, num_rows)
         elif dtype == 'int':
             # Zero digits is not a meaningful width, so a 0 here still falls
             # back to the default range below.
             if min_length and max_length:
                 data[col] = [random_integer(min_length, max_length) for _ in range(num_rows)]
             elif min_length and max_length is None:
                 # "int:n" means exactly n digits, mirroring "str:n".
                 data[col] = [random_integer(min_length, min_length) for _ in range(num_rows)]
             else:
                 data[col] = np.random.randint(1, 100, num_rows)
         elif dtype == 'float':
             data[col] = np.random.rand(num_rows) * 100
         elif dtype == 'name':
             data[col] = [faker.name() for _ in range(num_rows)]
         elif dtype == 'date':
             data[col] = [faker.date() for _ in range(num_rows)]
         elif dtype == 'datetime':
             data[col] = [faker.date_time().isoformat() for _ in range(num_rows)]
         elif dtype == 'bool':
             data[col] = np.random.choice(['Y', 'N'], num_rows)
         elif dtype == 'str':
             # Compare against None rather than truthiness: a minimum length
             # of 0 is a valid request ("str:0:5"), not a missing one.
             if min_length is not None and max_length is not None:
                 lengths = np.random.randint(min_length, max_length + 1, num_rows)
                 data[col] = [random_string(length) for length in lengths]
             elif min_length is not None:
                 data[col] = [random_string(min_length) for _ in range(num_rows)]
             else:
                 data[col] = np.random.choice(['AUTO', 'DEPOSITS', 'MORTGAGE'], num_rows)
         else:
             data[col] = np.random.choice(['unknown'], num_rows)
     return pd.DataFrame(data)
 # Load all transposed schema files
 schema_files = glob.glob('schema_*.csv')  # Adjust the pattern if needed
 # Load the lookup file
 lookup_data = pd.read_csv('lookup.csv')  # Adjust the file name if needed
 # Generate and save test data for each schema file
 num_rows = 100
 for schema_file in schema_files:
     # header=None: the first line of the schema file is data, not a header.
     transposed_schema = pd.read_csv(schema_file, header=None)
     test_data = generate_data(transposed_schema, num_rows, lookup_data=lookup_data)
     # Swap only the leading 'schema_' prefix: an unbounded replace would also
     # rewrite later occurrences (e.g. 'schema_a_schema_b.csv').
     output_file = schema_file.replace('schema_', 'test_data_', 1)  # Create corresponding output file name
     test_data.to_csv(output_file, index=False)
     print(f"Test data generated and saved to '{output_file}'")