<a href="https://colab.research.google.com/github/ShabnaIlmi/Data-Science-Group-Project/blob/End_User_Risk_Prediction/EndUser_Generate_Synthetic_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mounting the google drive
# Google Drive is attached at /content/drive; the file paths used in this
# notebook are all located under /content/drive/MyDrive.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime, so this cell is expected to fail elsewhere — confirm before reuse.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import pandas as pd
import numpy as np
import datetime

# 1. Load the Excel file
excel_file = '/content/drive/MyDrive/DSGP_User/Stock report Combined.xlsx'

try:
    df = pd.read_excel(excel_file, sheet_name='TCC')
except FileNotFoundError:
    print(f"Error: File '{excel_file}' not found. Make sure you have uploaded it to Colab.")
    # Re-raise rather than calling exit(): exit() is a helper for interactive
    # interpreter sessions, whereas re-raising stops this cell with a
    # traceback and keeps the original error visible.
    raise
except ValueError as e:
    print(f"Error loading sheet 'TCC': {e}")
    # Same reasoning as above: surface the failure instead of quitting.
    raise
else:
    # Only reached when read_excel returned without raising.
    print("Excel file loaded successfully.")

# 2. Display company name counts BEFORE generating synthetic data
# These counts are the baseline to compare with the "After" counts printed
# once the synthetic rows have been appended.
print("\n--- Company Name Counts (Before) ---")
print(df['Customer name'].value_counts())

print("\n--- Product Code Counts (Before) ---")
print(df['Product code'].value_counts())


# 2.1. Define a function to generate a single row of synthetic data

# The value pools and the date window never change between calls, so they are
# built once here instead of being rebuilt for every generated row.
_SYNTHETIC_WAREHOUSES = ['B4', 'WH-75']  # example add B5 and B6 to warehouse

# NOTE(review): the "Before" counts printed by this notebook show coded
# product identifiers in the loaded sheet, while this pool uses plain chemical
# names — confirm that the mix is intended.
_SYNTHETIC_PRODUCT_CODES = [
    'Ammonium Nitrate', 'Trinitrotoluene (TNT)', 'Dynamite', 'Potassium Nitrate', 'Sodium Nitrate',
    'Sulfuric Acid', 'Ammonia', 'Hydrochloric Acid', 'Nitric Acid', 'Hydrogen Peroxide', 'Acetone',
    'Benzene', 'Ethanol', 'Ammonium Sulfate', 'Potassium Chloride', 'Potassium Permanganate',
]

# NOTE(review): a few names end with a trailing space (e.g. 'Hirdaramani Group ').
# They are kept exactly as written because they may be meant to match the
# spelling in the source sheet — verify, otherwise they form separate groups
# in value_counts().
_SYNTHETIC_CUSTOMER_NAMES = [
    'Galagedara CHEMICAL', 'PIYARA WASHING ENTERPRISES', 'RIVINTA DRY AND WASHING', 'SN PRODUCTS',
    'NEW BRILLIANT WASHING', 'KAVISHKA FIBRE', 'S & S CHEMICALS (PVT) LTD', 'OSPREY CLOTHING (PVT) LTD',
    'DYNAWASH LIMITED', 'ECO WASHING (PVT) LTD', 'LEXUS', 'ROSS DAIRIES PVT LTD', 'EAGLE COIR',
    'Hirdaramani Group ', 'Hayleys Advantis Ltd', 'Ceylon Oxygen Ltd', 'Union Chemicals Lanka', 'HydroPure Systems', 'Ansell Lanka',
    'CleanFlow Lanka', 'WaterCare Solutions', 'Brandix Essentials', 'FreshAgro Solutions', 'Lanka Walltile PLC ',
    'Laugfs Holdings', 'Chemicals Corporation Ltd', 'Tokyo Cement Company', 'Dipped Products PLC', 'Piramal Glass Ceylon', 'Sanitech Chemicals',
    'BPL Chemicals', 'SUMITH Cement Corporation', 'kAMAL Rubber Corporation', 'Kohinoor Chemicals', 'Prabha Chemicals', 'Sinopec', 'Unilever',
    'Hemas Pharmaceuticals ', 'George Steuart Health', 'Lanka Tiles PLC ', 'Richard Pieris & Company PLC', 'Lanka Phos Limited', 'Ceylon Paints Limited ', 'Ceylon Cold Stores (Elephant House)',
]  # Add more to this list

_SYNTHETIC_START_DATE = datetime.date(2024, 11, 6)
_SYNTHETIC_END_DATE = datetime.date(2025, 2, 9)


def generate_synthetic_data_row(rng=None):
    """Generates a single row of synthetic data for the stock report.

    Parameters
    ----------
    rng : optional
        Source of randomness exposing ``choice`` and ``randint`` with the
        legacy NumPy signatures, e.g. ``np.random.RandomState(seed)``.
        When omitted, the global ``np.random`` stream is used, so calls made
        without an argument behave exactly as before and still honour
        ``np.random.seed``.

    Returns
    -------
    dict
        One record with the keys 'Warehouse', 'Product code', 'Invoice No',
        'Transaction Date', 'UOM', 'Issued Qty ' and 'Customer name'.
        The 'Issued Qty ' key deliberately keeps its trailing space.

    Notes
    -----
    ``randint`` excludes its upper bound, so:
    * the middle invoice segment is 10 or 11 and the suffix is 100-998;
    * the transaction date ranges from 2024-11-06 up to, but not including,
      2025-02-09;
    * the issued quantity ranges from 100 to 3499.
    """
    if rng is None:
        rng = np.random

    # The draws below are made in a fixed order so that a seeded stream
    # always yields the same row.
    warehouse = rng.choice(_SYNTHETIC_WAREHOUSES)
    product_code = rng.choice(_SYNTHETIC_PRODUCT_CODES)  # Keep this consistent as per your data
    invoice_no = f"SI-N/{rng.randint(10, 12)}/25/{rng.randint(100, 999)}"  # Example pattern

    # Pick a whole-day offset inside the window and add it to the start date.
    days_between_dates = (_SYNTHETIC_END_DATE - _SYNTHETIC_START_DATE).days
    random_number_of_days = rng.randint(0, days_between_dates)
    transaction_date = _SYNTHETIC_START_DATE + datetime.timedelta(days=random_number_of_days)

    uom = 'KG'  # Consistent unit of measure
    issued_qty = rng.randint(100, 3500)  # Random quantity from 100 to 3499
    customer_name = rng.choice(_SYNTHETIC_CUSTOMER_NAMES)

    return {
        'Warehouse': warehouse,
        'Product code': product_code,
        'Invoice No': invoice_no,
        'Transaction Date': transaction_date,
        'UOM': uom,
        'Issued Qty ': issued_qty,
        'Customer name': customer_name,
    }

# 3. Generate the synthetic rows and append them to the DataFrame
num_rows_to_generate = 2000

# Collect every record first so the synthetic frame is built in one call.
synthetic_data = [generate_synthetic_data_row() for _ in range(num_rows_to_generate)]

df_synthetic = pd.DataFrame(synthetic_data)
# ignore_index=True renumbers the combined rows from 0.
df = pd.concat([df, df_synthetic], ignore_index=True)

print(f"{num_rows_to_generate} rows of synthetic data generated and appended.")

# 3.1. Display company name counts AFTER generating synthetic data
print("\n--- Company Name Counts (After) ---")
print(df['Customer name'].value_counts())

print("\n--- Product Code Counts (After) ---")
print(df['Product code'].value_counts())

# 4. Save the combined DataFrame to a new Excel file
# The target differs from the input workbook, so the source file is not overwritten.
output_file = '/content/drive/MyDrive/DSGP_User/Pre Processed data/Stock-report_generated.xlsx'
df.to_excel(output_file, sheet_name='TCC', index=False)

print(f"Updated data saved to '{output_file}'")


Excel file loaded successfully.

--- Company Name Counts (Before) ---
Customer name
THARINDU WASHING PVT LTD     4
Piramal Glass Ceylon         3
Ansell Lanka                 3
OCEANPICK PVT LTD            3
DYNAWASH LIMITED             3
                            ..
Tokyo Cement Company         1
Sanitech Chemicals           1
ChlorinePlus Pvt Ltd         1
WaterCare Solutions          1
Chemicals Corporation Ltd    1
Name: count, Length: 61, dtype: int64

--- Product Code Counts (Before) ---
Product code
H2O2-50%-IG-TCC-BANGLADESH    40
LC-IG-ISGEC-INDIA             21
H2O2-50%-IG-ICL-BANGLADESH    14
LC-IG-PCIPL-INDIA             12
Name: count, dtype: int64
2000 rows of synthetic data generated and appended.

--- Company Name Counts (After) ---
Customer name
Galagedara CHEMICAL         64
Union Chemicals Lanka       55
SN PRODUCTS                 55
LEXUS                       54
kAMAL Rubber Corporation    53
                            ..
MEDITEXTILE PVT LTD          1
Sanitech