In [1]:
import csv
import os
import random

from faker import Faker
from tqdm import tqdm
# Create a Faker instance
fake = Faker()

# Number of fake rows to generate
num_entries = 100

# Dataset folder, kept relative to the working directory so the notebook is
# not tied to one machine's drive layout. The splitting cell reads
# "./Dataset/fake_tabular_data.txt", i.e. the same location.
dataset_folder = os.path.join(".", "Dataset")

# Create the Dataset folder if it doesn't exist
os.makedirs(dataset_folder, exist_ok=True)

# Use cases a row can belong to; the first column of every row is one of these
use_cases = ["sales", "marketing", "finance", "operations", "hr"]

# Column names per use case. Every schema has four columns, the first of
# which is always "use_case".
headers = {
    "sales": ["use_case", "product", "price", "quantity"],
    "marketing": ["use_case", "campaign", "budget", "roi"],
    "finance": ["use_case", "account_type", "balance", "interest_rate"],
    "operations": ["use_case", "location", "production_capacity", "utilization_rate"],
    "hr": ["use_case", "employee_name", "job_title", "salary"]
}
fake_tabular_data = []

# Generate fake tabular data: one row per iteration, schema chosen at random
for _ in tqdm(range(num_entries)):
    use_case = random.choice(use_cases)

    # Build the row according to the schema of the chosen use case
    if use_case == "sales":
        product = f"{fake.word()} {fake.word()}"
        price = fake.pydecimal(left_digits=3, right_digits=2, positive=True)
        quantity = fake.random_int(1, 100)
        data_entry = [use_case, product, price, quantity]
    elif use_case == "marketing":
        campaign = fake.catch_phrase()
        budget = fake.random_int(10000, 100000)
        roi = fake.pydecimal(left_digits=2, right_digits=2, positive=True)
        data_entry = [use_case, campaign, budget, roi]
    elif use_case == "finance":
        account_type = fake.random_element(elements=("checking", "savings", "credit card"))
        balance = fake.pydecimal(left_digits=5, right_digits=2, positive=True)
        interest_rate = fake.pydecimal(left_digits=2, right_digits=2, positive=True)
        data_entry = [use_case, account_type, balance, interest_rate]
    elif use_case == "operations":
        location = fake.city()
        production_capacity = fake.random_int(1000, 10000)
        utilization_rate = fake.pydecimal(left_digits=2, right_digits=2, positive=True)
        data_entry = [use_case, location, production_capacity, utilization_rate]
    else:  # hr
        employee_name = fake.name()
        job_title = fake.job()
        salary = fake.random_int(30000, 100000)
        data_entry = [use_case, employee_name, job_title, salary]

    fake_tabular_data.append(data_entry)

# Save the rows as comma-separated text in the Dataset folder. The file keeps
# its ".txt" name because the splitting cell opens it under that name.
fake_tabular_file = os.path.join(dataset_folder, 'fake_tabular_data.txt')
# newline='' is what the csv module expects for files it writes to.
with open(fake_tabular_file, 'w', encoding='utf-8', newline='') as file:
    # csv.writer quotes any field that contains a comma or a quote (generated
    # job titles and catch phrases can), so every row keeps exactly four
    # columns. A plain ','.join() would silently shift the columns instead.
    writer = csv.writer(file)

    # Header row. NOTE(review): only the "sales" column names are written, so
    # columns 2-4 are mislabelled for rows of every other use case; only the
    # first column ("use_case") is meaningful for all rows.
    all_headers = headers["sales"]
    writer.writerow(all_headers)

    # Data rows; non-string values (ints, Decimals) are converted with str()
    writer.writerows(fake_tabular_data)


100%|██████████| 100/100 [00:00<00:00, 20841.26it/s]


In [5]:
import os
from tqdm import tqdm

def split_text_to_csv(input_text_file, output_dir="./Another_dataset"):
    """Split a mixed comma-separated text file into one CSV per category.

    Each line is classified by its first comma-separated field (the
    ``use_case`` column), compared case-insensitively. Only that field is
    inspected, so words elsewhere in a row (e.g. a city such as
    "Christchurch", which contains "hr", or a job title such as
    "Sales executive") cannot send the row to the wrong category.

    A non-blank line whose first field is not a known category is kept with
    the category of the most recent recognised line; lines that come before
    any recognised line (such as a header row) are dropped. Blank lines are
    always dropped.

    Args:
        input_text_file: Path to the UTF-8 text file to split.
        output_dir: Root folder for the results. For every category a
            sub-folder ``<output_dir>/<category>/`` is created (if missing)
            and ``<category>.csv`` is written inside it, even when the
            category received no lines.

    Raises:
        OSError: If the input file cannot be read or an output file or
            folder cannot be created.
    """
    # First-column value -> output category. The plural "operations" is
    # accepted as well and goes to the "operation" folder, whose singular
    # name is kept so existing output paths do not change.
    category_by_use_case = {
        'sales': 'sales',
        'hr': 'hr',
        'marketing': 'marketing',
        'operation': 'operation',
        'operations': 'operation',
        'finance': 'finance',
    }

    # Lines collected for each output category
    categories = {'sales': [], 'hr': [], 'marketing': [], 'operation': [], 'finance': []}

    # Read everything up front so the input file is closed before any output
    # is written. UTF-8 is stated explicitly instead of relying on the
    # platform default encoding.
    with open(input_text_file, 'r', encoding='utf-8') as text_file:
        lines = text_file.readlines()

    current_category = None
    for line in tqdm(lines, desc="Processing lines"):
        # Only the first field decides the category; tolerate surrounding
        # whitespace and optional CSV quoting around it.
        first_field = line.split(',', 1)[0].strip().strip('"').lower()
        # Unknown first field: keep the category of the previous line.
        current_category = category_by_use_case.get(first_field, current_category)

        if current_category and line.strip():
            categories[current_category].append(line)

    # Create each category's folder and write its lines unchanged
    for category, category_lines in categories.items():
        folder_path = os.path.join(output_dir, category)
        os.makedirs(folder_path, exist_ok=True)

        output_csv_file = os.path.join(folder_path, f"{category}.csv")
        with open(output_csv_file, 'w', encoding='utf-8') as csv_file:
            csv_file.writelines(category_lines)

if __name__ == "__main__":
    # Text dataset to split; the relative path is resolved against the
    # current working directory (presumably the file written by the
    # data-generation cell -- confirm the working directory matches).
    input_text_file = "./Dataset/fake_tabular_data.txt"
    split_text_to_csv(input_text_file)


Processing lines: 100%|██████████| 101/101 [00:00<?, ?it/s]
