<a href="https://colab.research.google.com/github/UdithWeerasinghe/IntelliScript_phase02_BIG/blob/main/A8ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install the packages this notebook relies on; %%capture silences pip's output.
# NOTE(review): transformers, xlrd and numpy are not imported by any cell visible
# in this notebook — confirm they are still required before keeping them here.
!pip install pandas openpyxl transformers xlrd numpy

In [None]:
import os
from openpyxl import load_workbook, Workbook
from datetime import datetime
import re
import pandas as pd

In [None]:
# Mount Google Drive
# The input/output folders configured further down are located under
# /content/drive, so this cell has to run before any cell that touches them.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Input and output folder paths
# input_folder_path is scanned for .xlsx workbooks by the final cell;
# output_folder_path receives one "<file>_<sheet>_organized.xlsx" per sheet.
input_folder_path = '/content/drive/MyDrive/XLRaw'
output_folder_path = '/content/drive/MyDrive/A10'

# Ensure the output folder exists
# (exist_ok=True keeps this cell safe to re-run when the folder is already there)
os.makedirs(output_folder_path, exist_ok=True)

In [None]:
# Month mapping for textual dates
# Keys are lower-case English month names plus their three-letter abbreviations;
# values are the calendar month numbers 1-12. "may" is its own abbreviation, so
# it contributes a single key.
_MONTH_NAMES = (
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
)
MONTHS = {
    key: number
    for number, name in enumerate(_MONTH_NAMES, start=1)
    for key in (name, name[:3])
}

In [None]:

# Check if a value is a valid date
def is_time_related(value):
    """Return True when ``value`` looks like a date or a month label.

    Args:
        value: A single cell value (any type).

    Returns:
        bool: True for ``datetime`` instances and for strings that, once any
        parenthesised footnote such as ``"(b)"`` is removed, either match one
        of the numeric date layouts below or contain a month name as a whole
        word. False for missing values (``pd.isna``) and every other type.

    Month names are matched per word rather than by substring, so labels such
    as "Primary", "Summary" or "Decrease" (which merely contain "mar"/"dec")
    are no longer mistaken for dates.
    """
    if pd.isna(value):
        return False
    if isinstance(value, datetime):
        return True
    if not isinstance(value, str):
        return False

    text = re.sub(r'\([^)]*\)', '', value).strip()  # Remove extra characters like (b)

    # Check for known date patterns
    date_patterns = (
        r'^\d{4}$',  # yyyy
        r'^\d{1,2}/\d{4}$',  # mm/yyyy
        r'^\d{4}/\d{1,2}$',  # yyyy/mm
        r'^\d{1,2}/\d{1,2}/\d{4}$',  # mm/dd/yyyy or dd/mm/yyyy
        r'^[a-zA-Z]{3,}-\d{2}$',  # e.g., Jan-19
    )
    if any(re.match(pattern, text) for pattern in date_patterns):
        return True

    # Check for month names as whole alphabetic words ("Jan 2019", "Jan-Mar",
    # "March"). "sept" is accepted explicitly because the previous substring
    # test recognised it through "sep" and it is a common abbreviation.
    words = re.findall(r'[a-z]+', text.lower())
    return any(word in MONTHS or word == "sept" for word in words)

# Normalize textual date to datetime
def normalize_date(value, inferred_year=None):
    """Convert a month label into the first day of that month.

    Handled layouts (after removing parenthesised footnotes such as "(b)"):
      * ``"<month>-<digits>"`` e.g. "Jan-19" or "Dec-2018". A two-digit suffix
        is prefixed with "20" (so "Jan-99" becomes 2099); any other length is
        used as the year as-is.
      * a bare month name or abbreviation, e.g. "March", combined with
        ``inferred_year`` when one is supplied.

    Args:
        value: Cell value to normalise.
        inferred_year: Year to use when the text itself carries none.

    Returns:
        ``datetime(year, month, 1)`` when a month and a usable year are found.
        Otherwise the input is handed back: ``datetime`` and non-string values
        unchanged, strings with the parenthesised parts removed and
        surrounding whitespace stripped.
    """
    if isinstance(value, datetime):
        return value
    if not isinstance(value, str):
        return value

    value = re.sub(r'\([^)]*\)', '', value).strip()  # Remove extra characters like (b)

    # Strip each piece so "Jan - 19" is treated like "Jan-19".
    parts = [part.strip() for part in value.split("-")]
    month = MONTHS.get(parts[0].lower())
    if month is None:
        return value

    try:
        if len(parts) == 2 and parts[1].isdigit():
            suffix = parts[1]
            year = int("20" + suffix) if len(suffix) == 2 else int(suffix)
        else:
            year = inferred_year
        if year:
            return datetime(year, month, 1)
    except (TypeError, ValueError, OverflowError):
        # Best effort: an unusable year (out of datetime's range, a digit-like
        # character int() rejects, or a non-integer inferred_year) leaves the
        # text un-normalised instead of aborting the whole sheet.
        pass
    return value

# Process rows or columns for structured and inferred date data
def process_time_structure(cells, inferred_year=None):
    """Normalise one row or column of cells, carrying the year forward.

    Cells are read in order until the first missing value (``pd.isna``), at
    which point processing stops and whatever was gathered so far is returned.

    Args:
        cells: Iterable of cell values from a single row or column.
        inferred_year: Year in effect before the first cell is read.

    Returns:
        list: One entry per cell read. Date-like cells are passed through
        ``normalize_date``; whenever that yields a ``datetime`` its year
        becomes the year in effect for the following cells. Other cells are
        also passed through ``normalize_date`` once a year is in effect, and
        are kept untouched before that.
    """
    year_in_effect = inferred_year
    collected = []
    for raw in cells:
        if pd.isna(raw):
            return collected  # Stop processing at the first empty cell

        looks_like_date = is_time_related(raw)
        if not looks_like_date and not year_in_effect:
            collected.append(raw)  # Nothing to infer from yet: keep as-is
            continue

        converted = normalize_date(raw, year_in_effect)
        if looks_like_date and isinstance(converted, datetime):
            year_in_effect = converted.year
        collected.append(converted)
    return collected


# Extract time-related data
def extract_time_data(sheet):
    """Collect the distinct date values found on a worksheet.

    The sheet is scanned twice — once row by row and once column by column —
    with each row/column handed to ``process_time_structure`` starting from no
    inferred year. Because both passes visit the same cells, every value would
    otherwise be reported twice; the results are therefore de-duplicated.

    Args:
        sheet: Worksheet-like object exposing ``iter_rows(values_only=True)``
            and ``iter_cols(values_only=True)``.

    Returns:
        list: Unique ``datetime`` values in ascending order, followed by the
        unique string values in ascending order. Entries of any other type
        are dropped.

    NOTE(review): every string returned by ``process_time_structure`` is kept,
    whether or not it is date-like — confirm whether plain labels should be
    filtered out here as well.
    """
    collected = []

    # Row-wise processing
    for row in sheet.iter_rows(values_only=True):
        collected.extend(process_time_structure(row))

    # Column-wise processing
    for col in sheet.iter_cols(values_only=True):
        collected.extend(process_time_structure(col))

    # Keep only datetime / string entries, one copy of each, sorted per type
    # (the two types cannot be compared with each other, hence two lists).
    datetime_values = sorted({d for d in collected if isinstance(d, datetime)})
    string_values = sorted({d for d in collected if isinstance(d, str)})

    return datetime_values + string_values


# Extract parameters and their numeric values
def _has_numeric_values(values):
    """True when ``values`` holds at least one non-empty cell and every
    non-empty cell is an int or float."""
    present = [v for v in values if v is not None]
    return bool(present) and all(isinstance(v, (int, float)) for v in present)


def _collect_labelled_series(cells, parameters):
    """Record in ``parameters`` each non-date string in ``cells`` that is
    followed by a purely numeric tail."""
    for i, cell in enumerate(cells):
        if isinstance(cell, str) and not is_time_related(cell):
            values = cells[i + 1:]
            if _has_numeric_values(values):
                parameters[cell] = values


def extract_parameters(sheet):
    """Map parameter labels to the numeric series that follows them.

    A label is any string cell that ``is_time_related`` rejects. Its series is
    the remainder of the row (row pass) or column (column pass) after the
    label, and is accepted only when it contains at least one number and
    nothing but numbers and empty cells.

    Requiring at least one number matters: an empty or all-blank remainder
    used to be accepted, which turned title rows into parameters and let the
    column pass replace a real series with an empty one for labels sitting in
    the sheet's last row.

    Args:
        sheet: Worksheet-like object exposing ``iter_rows(values_only=True)``
            and ``iter_cols(values_only=True)``.

    Returns:
        dict: label -> sequence of the cells after it (empty cells included).
        When a label qualifies more than once, the last occurrence wins, with
        the column pass running after the row pass.
    """
    parameters = {}

    # Row-wise extraction
    for row in sheet.iter_rows(values_only=True):
        _collect_labelled_series(row, parameters)

    # Column-wise extraction
    for col in sheet.iter_cols(values_only=True):
        _collect_labelled_series(col, parameters)

    return parameters

# Save organized data to an Excel file
def save_organized_data(time_data, parameters, output_file):
    """Write the extracted dates and parameter series to a new workbook.

    Layout of the single "Organized Data" sheet:
      * column 1: header "Date", then one entry of ``time_data`` per row
        (``datetime`` values are written as "YYYY-MM-DD" text, anything else
        unchanged);
      * columns 2..n: one column per item of ``parameters`` — the key in
        row 1 and its values from row 2 downwards.

    Args:
        time_data: Iterable of datetime/str entries for the date column.
        parameters: Mapping of header text to an iterable of cell values.
        output_file: Path the workbook is saved to.
    """
    book = Workbook()
    ws = book.active
    ws.title = "Organized Data"

    # Date column
    ws.cell(row=1, column=1, value="Date")
    for row_number, entry in enumerate(time_data, start=2):
        if isinstance(entry, datetime):
            entry = entry.strftime('%Y-%m-%d')
        ws.cell(row=row_number, column=1, value=entry)

    # One column per parameter, starting right after the date column
    for column_number, (header, series) in enumerate(parameters.items(), start=2):
        ws.cell(row=1, column=column_number, value=header)
        for row_number, item in enumerate(series, start=2):
            ws.cell(row=row_number, column=column_number, value=item)

    book.save(output_file)
    print(f"Saved: {output_file}")

# Process each Excel sheet
def process_excel_file(input_path, output_folder):
    """Organize every sheet of one workbook into its own output file.

    The workbook is opened with ``data_only=True``. For each sheet the dates
    and parameters are extracted; when either is non-empty the result is saved
    as ``<workbook name>_<sheet name>_organized.xlsx`` inside
    ``output_folder``, otherwise a "No data found" message is printed.

    Args:
        input_path: Path of the workbook to read.
        output_folder: Folder that receives the per-sheet output files.
    """
    source_book = load_workbook(input_path, data_only=True)
    stem, _extension = os.path.splitext(os.path.basename(input_path))

    for title in source_book.sheetnames:
        worksheet = source_book[title]
        print(f"Processing: {stem} - {title}")

        dates = extract_time_data(worksheet)
        series_by_name = extract_parameters(worksheet)

        if not (dates or series_by_name):
            print(f"No data found in: {stem} - {title}")
            continue

        target = os.path.join(output_folder, f"{stem}_{title}_organized.xlsx")
        save_organized_data(dates, series_by_name, target)




In [None]:
# Process all Excel files in the folder
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# processing order (and the log below) reproducible between runs.
for file_name in sorted(os.listdir(input_folder_path)):
    # Skip "~$..." entries: Excel leaves such owner/lock files next to an open
    # workbook and they are not readable workbooks themselves.
    if file_name.startswith('~$'):
        continue
    # Compare the extension case-insensitively so "REPORT.XLSX" is picked up too.
    if not file_name.lower().endswith('.xlsx'):
        continue
    input_path = os.path.join(input_folder_path, file_name)
    process_excel_file(input_path, output_folder_path)

Processing: table2.04_20241202_e - HS codes Import Classification 
Saved: /content/drive/MyDrive/A10/table2.04_20241202_e_HS codes Import Classification _organized.xlsx
Processing: table2.04_20241202_e - 2.04 In USD 2007-2024
Saved: /content/drive/MyDrive/A10/table2.04_20241202_e_2.04 In USD 2007-2024_organized.xlsx
Processing: table2.04_20241202_e - 2.04 In Rupees 2007-2024
Saved: /content/drive/MyDrive/A10/table2.04_20241202_e_2.04 In Rupees 2007-2024_organized.xlsx
Processing: table2.04_20241202_e - 2.04 SITC In USD 2014-2024
Saved: /content/drive/MyDrive/A10/table2.04_20241202_e_2.04 SITC In USD 2014-2024_organized.xlsx
Processing: table2.04_20241202_e - 2.04 SITC In Rupee 2014-2024
Saved: /content/drive/MyDrive/A10/table2.04_20241202_e_2.04 SITC In Rupee 2014-2024_organized.xlsx
Processing: table2.04_20241202_e - 2.04 In USD 2006-2010
Saved: /content/drive/MyDrive/A10/table2.04_20241202_e_2.04 In USD 2006-2010_organized.xlsx
Processing: table2.04_20241202_e - 2.04 In Rupees 2006-2