<a href="https://colab.research.google.com/github/UdithWeerasinghe/IntelliScript_phase02_BIG/blob/main/A6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os
from openpyxl import load_workbook, Workbook
from datetime import datetime
import pandas as pd
import re

# Input and output folder paths.
# Both point below /content/drive/MyDrive, so they presumably require Google
# Drive to be mounted in the Colab runtime before this cell runs — TODO confirm.
# input_folder_path: directory scanned for source .xlsx workbooks.
# output_folder_path: directory that receives one organized workbook per sheet.
input_folder_path = '/content/drive/MyDrive/XLRaw'
output_folder_path = '/content/drive/MyDrive/A7'

# Ensure the output folder exists (exist_ok=True: no error if it is already there)
os.makedirs(output_folder_path, exist_ok=True)

# Check if a value is time-related
def is_time_related(value):
    """Return True when *value* looks like a date or time label.

    A value is time-related when it is a ``datetime`` instance, or a string
    in one of the layouts ``yyyy``, ``mm/yyyy``, ``yyyy/mm`` or
    ``mm/dd/yyyy`` / ``dd/mm/yyyy`` that also parses as a real calendar
    value (for example ``13/2020`` is rejected because month 13 is invalid).

    Missing values (``None``, NaN, NaT) and every other type, including
    plain integers such as ``2020``, are not considered time-related.

    Args:
        value: A scalar cell value.

    Returns:
        bool: Whether the value is time-related.
    """
    if pd.isna(value):
        return False
    if isinstance(value, datetime):
        return True
    if not isinstance(value, str):
        return False

    # Each layout is paired with the strptime format(s) that validate it.
    # Validating every layout with '%m/%Y' made yyyy/mm and the
    # three-component dates impossible to recognise.
    date_patterns = (
        (r'^\d{4}$', ('%Y',)),  # yyyy
        (r'^\d{1,2}/\d{4}$', ('%m/%Y',)),  # mm/yyyy
        (r'^\d{4}/\d{1,2}$', ('%Y/%m',)),  # yyyy/mm
        # The day/month order is ambiguous, so either reading is accepted.
        (r'^\d{1,2}/\d{1,2}/\d{4}$', ('%m/%d/%Y', '%d/%m/%Y')),
    )
    for pattern, formats in date_patterns:
        if not re.match(pattern, value):
            continue
        for fmt in formats:
            try:
                datetime.strptime(value, fmt)
            except ValueError:
                # Right shape but not a real date under this format.
                continue
            return True
    return False

# Check if a value is a parameter name (string, not numeric or time-related)
def is_parameter_name(value):
    """Return True when *value* is a text label rather than data.

    A label is a string that is neither time-related (per
    ``is_time_related``) nor made up solely of digits once a single ``.``
    has been removed.
    """
    if not isinstance(value, str):
        return False
    if is_time_related(value):
        return False
    # Drop at most one decimal point, then see whether only digits remain.
    looks_numeric = value.replace('.', '', 1).isdigit()
    return not looks_numeric

# Extract time data
def extract_time_data(sheet):
    """Collect the distinct time-related cell values of a worksheet.

    Args:
        sheet: Worksheet-like object exposing ``iter_rows(values_only=True)``.

    Returns:
        list: The unique values accepted by ``is_time_related``. ``datetime``
        values come first in ascending order, followed by date strings in
        ascending (lexicographic) order.
    """
    time_data = {
        cell
        for row in sheet.iter_rows(values_only=True)
        for cell in row
        if is_time_related(cell)
    }
    # datetime and str cannot be compared with each other, so a sheet that
    # contains both would make a plain sorted() raise TypeError. Grouping by
    # type first keeps the order unchanged when only one type is present.
    return sorted(time_data, key=lambda item: (isinstance(item, str), item))


# Extract parameters and their numeric values
def extract_parameters(sheet):
    """Map each parameter label in a worksheet to the values that follow it.

    The sheet is scanned twice: row by row (values to the right of a label)
    and column by column (values below a label). When the same label
    qualifies more than once, its values are concatenated in scan order.

    Args:
        sheet: Worksheet-like object exposing ``iter_rows(values_only=True)``
            and ``iter_cols(values_only=True)``.

    Returns:
        dict: Label -> list of trailing cell values (``None`` entries for
        empty cells are kept).
    """
    parameters = {}
    # Row-wise extraction
    _collect_labelled_values(sheet.iter_rows(values_only=True), parameters)
    # Column-wise extraction
    _collect_labelled_values(sheet.iter_cols(values_only=True), parameters)
    return parameters


def _collect_labelled_values(lines, parameters):
    """Add to *parameters* every label in *lines* followed only by numbers."""
    for line in lines:
        for position, cell in enumerate(line):
            if not is_parameter_name(cell):
                continue
            values = line[position + 1:]
            # Empty cells (None) are tolerated; any other non-numeric value
            # disqualifies the label.
            # NOTE(review): a label with nothing after it, or only empty
            # cells, also passes this check and yields an empty or all-None
            # list.
            if all(isinstance(v, (int, float)) for v in values if v is not None):
                parameters.setdefault(cell, []).extend(values)


# Save organized data to an Excel file
def save_organized_data(time_data, parameters, output_file):
    """Write the time values and parameter series to a new workbook.

    Column 1 gets the header "Date" followed by *time_data*. Each entry of
    *parameters* fills the next column to the right, with its name in row 1
    and its values from row 2 downward. The workbook is saved to
    *output_file* and a confirmation line is printed.
    """
    book = Workbook()
    ws = book.active
    ws.title = "Organized Data"

    # First column: header, then one time value per row.
    ws.cell(row=1, column=1, value="Date")
    for row_no, stamp in enumerate(time_data, start=2):
        ws.cell(row=row_no, column=1, value=stamp)

    # Remaining columns: one per parameter, starting at column 2.
    for column_no, (label, series) in enumerate(parameters.items(), start=2):
        ws.cell(row=1, column=column_no, value=label)
        for row_no, entry in enumerate(series, start=2):
            ws.cell(row=row_no, column=column_no, value=entry)

    book.save(output_file)
    print(f"Saved: {output_file}")

# Process each Excel sheet
def process_excel_file(input_path, output_folder):
    """Organize every sheet of one workbook into its own output file.

    For each sheet, the time values and parameter series are extracted and,
    when either is non-empty, saved to
    ``<output_folder>/<file>_<sheet>_organized.xlsx``. Sheets that yield
    nothing are reported and skipped.
    """
    # data_only=True is passed so cell values, not formulas, are read.
    workbook = load_workbook(input_path, data_only=True)
    file_name, _extension = os.path.splitext(os.path.basename(input_path))

    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        print(f"Processing: {file_name} - {sheet_name}")

        time_data = extract_time_data(sheet)
        parameters = extract_parameters(sheet)

        if not (time_data or parameters):
            print(f"No data found in: {file_name} - {sheet_name}")
            continue

        target_name = f"{file_name}_{sheet_name}_organized.xlsx"
        save_organized_data(
            time_data, parameters, os.path.join(output_folder, target_name)
        )

# Process all Excel files in the folder
# os.listdir returns entries in arbitrary order; sorting makes runs reproducible.
for file_name in sorted(os.listdir(input_folder_path)):
    # Excel leaves "~$<name>.xlsx" lock files next to open workbooks. They
    # are not valid workbooks, so they are skipped.
    if file_name.startswith('~$'):
        continue
    # Match the extension case-insensitively so "REPORT.XLSX" is not missed.
    if file_name.lower().endswith('.xlsx'):
        input_path = os.path.join(input_folder_path, file_name)
        process_excel_file(input_path, output_folder_path)


Processing: table2.04_20241202_e - HS codes Import Classification 
Saved: /content/drive/MyDrive/A7/table2.04_20241202_e_HS codes Import Classification _organized.xlsx
Processing: table2.04_20241202_e - 2.04 In USD 2007-2024
Saved: /content/drive/MyDrive/A7/table2.04_20241202_e_2.04 In USD 2007-2024_organized.xlsx
Processing: table2.04_20241202_e - 2.04 In Rupees 2007-2024
Saved: /content/drive/MyDrive/A7/table2.04_20241202_e_2.04 In Rupees 2007-2024_organized.xlsx
Processing: table2.04_20241202_e - 2.04 SITC In USD 2014-2024
Saved: /content/drive/MyDrive/A7/table2.04_20241202_e_2.04 SITC In USD 2014-2024_organized.xlsx
Processing: table2.04_20241202_e - 2.04 SITC In Rupee 2014-2024
Saved: /content/drive/MyDrive/A7/table2.04_20241202_e_2.04 SITC In Rupee 2014-2024_organized.xlsx
Processing: table2.04_20241202_e - 2.04 In USD 2006-2010
Saved: /content/drive/MyDrive/A7/table2.04_20241202_e_2.04 In USD 2006-2010_organized.xlsx
Processing: table2.04_20241202_e - 2.04 In Rupees 2006-2010
Sa