In [None]:
import pandas as pd

In [19]:
import json
import os
import re

import pandas as pd

def extract_data(folder_path):
    """Load every CSV/Excel file directly inside `folder_path` into a dict of DataFrames.

    Parameters
    ----------
    folder_path : str or path-like
        Directory to scan (not recursive). Files whose extension is not
        .csv, .xlsx or .xls (case-insensitive) are ignored.

    Returns
    -------
    dict[str, pandas.DataFrame]
        * CSV file   -> one entry keyed by the file's base name.
        * Excel file -> one entry per sheet, keyed "<base name>_<sheet name>".
        If a key is already taken, "_2", "_3", ... is appended until the key
        is unique, so no previously loaded frame is ever overwritten.

    Notes
    -----
    Files are processed in sorted name order so that key order and collision
    suffixes are reproducible between runs (os.listdir order is arbitrary).
    A file that cannot be read is reported with print() and skipped; the
    remaining files are still loaded. A missing folder still raises the
    OSError coming from os.listdir.
    """
    supported_exts = {'.csv', '.xlsx', '.xls'}
    extracted_data = {}
    suffix_counter = {}  # last numeric suffix handed out per colliding key

    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        base_name, ext = os.path.splitext(file_name)
        ext = ext.lower()

        if ext not in supported_exts:
            continue

        try:
            if ext == '.csv':
                # Single DataFrame for CSV
                df_dict = {base_name: pd.read_csv(file_path)}
            else:
                # sheet_name=None reads every sheet into {sheet_name: DataFrame}
                sheets = pd.read_excel(file_path, sheet_name=None)
                # Prefix sheet names with the base filename
                df_dict = {f"{base_name}_{sheet_name}": df
                           for sheet_name, df in sheets.items()}
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error processing file {file_name}: {e}")
            continue

        for key, df in df_dict.items():
            unique_key = key
            if unique_key in extracted_data:
                # Probe key_2, key_3, ... until a free slot is found. Checking
                # each candidate matters: a file literally named "<key>_2"
                # may already occupy the first suffix.
                n = suffix_counter.get(key, 1)
                while unique_key in extracted_data:
                    n += 1
                    unique_key = f"{key}_{n}"
                suffix_counter[key] = n

            extracted_data[unique_key] = df

    return extracted_data


In [22]:
# Full path of the folder that holds the dataset files; adjust it for your machine.
harvest_data_dir = "C:/Users/USER/Downloads/Hackathon/AgricConnect-PHL/dataset/harvest_data"
# Note: despite its name, `df` is the dict of DataFrames returned by extract_data.
df = extract_data(harvest_data_dir)

In [21]:
# Show which tables were loaded. Excel sheets are keyed "<file base name>_<sheet name>",
# CSV files by their bare base name (see extract_data).
df.keys()

dict_keys(['harvested_area_Total Area (TA)', 'harvested_area_Irrigated Portion (TR)', 'harvested_area_Total Area(TA)', 'harvested_area_Irrigated Portion (TI)', 'harvested_area_Rainfed Portion(TR)', 'Physical_area_Total Area (TA)', 'Physical_area_Sheet1', 'Physical_area_Irrigated Portion (TI)', 'Physical_area_Rainfed Portion (TR)', 'Production_Total Area (TA)', 'Production_Irrigated Portion (TI)', 'Production_Rainfed Portion (TR)', 'yield_Total Area (TA)', 'yield_Irrigated Portion (TI)', 'yield_Rainfed Portion (TR)'])

In [None]:
def _infer_tech_code(sheet_name: str) -> str:
    """Return the technology letter ("A", "I" or "R") encoded at the end of a sheet name.

    Accepts both the bare suffix form ("..._TA") and the parenthesised form
    ("... (TA)", "...(TA)"). Raises ValueError when neither is present.
    """
    match = re.search(r"(?:_T([AIR])|\(\s*T([AIR])\s*\))\s*$", str(sheet_name))
    if match is None:
        raise ValueError(f"Unrecognized tech suffix in '{sheet_name}'")
    return match.group(1) or match.group(2)


def transform_data(sheets_dict: dict[str, pd.DataFrame],
                   lookup: dict[str, str],
                   file_origin: str) -> pd.DataFrame:
    """Reshape the wide per-technology sheets of one file into a single long table.

    Parameters
    ----------
    sheets_dict : { sheet_name: DataFrame, ... }
        Each sheet name must end in a technology code, either as a suffix
        ("_TA", "_TI", "_TR") or in parentheses ("(TA)", "(TI)", "(TR)").
        The code, not any descriptive text before it, decides the technology.
    lookup : { "whea": "Wheat", ..., "x": <label>, "y": <label> }
        Maps lower-case crop codes to crop names. lookup["x"] and lookup["y"]
        give the output names of the "x" and "y" coordinate columns.
    file_origin : one of "harvested_area", "physical_area", "yield", "production"
        Selects the rec_type letter written to every row.

    Returns
    -------
    pandas.DataFrame
        One row per (input row, mapped crop column) with the columns
        lookup["y"], lookup["x"], "Crop type", "tech_type",
        "water‐management regime", "rec_type", "ADM1_NAME", "ADM2_NAME",
        "unit", "grid_code", "year_data", "value".

    Raises
    ------
    KeyError
        Unknown `file_origin`, missing "x"/"y" entries in `lookup`, or a sheet
        lacking one of the required output columns.
    ValueError
        `sheets_dict` is empty, or a sheet name carries no technology code.

    NOTE(review): by the usual convention x is longitude and y is latitude;
    confirm that the labels stored in `lookup` follow the intended convention.
    """
    # mapping from file_origin to rec_type letter
    rec_map = {
        "harvested_area": "H",
        "physical_area":  "A",
        "yield":          "Y",
        "production":     "P"
    }
    # mapping tech suffix → human label
    tech_map = {
        "A": "all technologies",
        "I": "irrigation",
        "R": "rainfed"
    }
    if file_origin not in rec_map:
        raise KeyError(
            f"Unknown file_origin '{file_origin}'; expected one of {sorted(rec_map)}"
        )
    rec_code = rec_map[file_origin]

    if not sheets_dict:
        # pd.concat([]) would fail later with an opaque "No objects to concatenate".
        raise ValueError(f"No sheets supplied for file_origin '{file_origin}'")

    x_label = lookup["x"]
    y_label = lookup["y"]

    # 7) Final column order (coordinate labels come from the lookup table)
    final_cols = [
        y_label,
        x_label,
        "Crop type",
        "tech_type",
        "water‐management regime",
        "rec_type",
        "ADM1_NAME",
        "ADM2_NAME",
        "unit",
        "grid_code",
        "year_data",
        "value"
    ]

    long_dfs = []
    for sheet_name, df in sheets_dict.items():
        # 1) infer tech_type from the code at the end of the sheet name
        tech = _infer_tech_code(sheet_name)
        suffix = f"_{tech}"

        # 2) Rename x/y to the labels given by the lookup table
        df = df.rename(columns={"x": x_label, "y": y_label})

        # 3) Rename each crop column: e.g. BANA_I → Banana_I
        rename_map = {}
        for col in df.columns:
            # Non-string labels (e.g. numeric Excel headers) cannot be crop codes.
            if not isinstance(col, str) or not col.upper().endswith(suffix):
                continue
            prefix = col.rsplit("_", 1)[0].lower()  # e.g. "bana"
            full_crop = lookup.get(prefix)
            if full_crop is not None:  # skip any non-crop or unexpected column
                rename_map[col] = f"{full_crop}{suffix}"
        df = df.rename(columns=rename_map)

        # 4) Melt to long: keep all non-crop columns as id_vars
        crop_cols = list(rename_map.values())
        id_vars = [c for c in df.columns if c not in crop_cols]
        df_long = df.melt(
            id_vars=id_vars,
            value_vars=crop_cols,
            var_name="variable",
            value_name="value"
        )

        # 5) Split “variable” → Crop type & tech_type. Every melted name ends
        #    in "_<tech>", so the tech letter is simply this sheet's code.
        df_long["Crop type"] = df_long["variable"].str[:-len(suffix)]
        df_long["tech_type"] = tech
        df_long = df_long.drop(columns=["variable"])

        # 6) Add water‐management regime & rec_type
        df_long["water‐management regime"] = tech_map[tech]
        df_long["rec_type"] = rec_code

        # 7) Reorder & select final columns, naming the sheet if any are absent
        missing = [c for c in final_cols if c not in df_long.columns]
        if missing:
            raise KeyError(
                f"Sheet '{sheet_name}' is missing required columns: {missing}"
            )
        long_dfs.append(df_long[final_cols])

    # 8) Concatenate all tech-types for this file
    result = pd.concat(long_dfs, ignore_index=True)
    return result

# Example usage:
# import json
# lookup = json.load(open("lookup.json"))
# harvested_sheets = pd.read_excel("harvested_area.xlsx", sheet_name=None)
# ha_long = transform_data(harvested_sheets, lookup, "harvested_area")
# physical_sheets = pd.read_excel("physical_area.xlsx", sheet_name=None)
# pa_long = transform_data(physical_sheets, lookup, "physical_area")
# ... and similarly for yield & production




In [None]:
# Load the lookup dictionary (crop codes plus the "x"/"y" column labels).
# transform_data indexes `lookup` like a dict, so the JSON file has to be parsed;
# passing the bare file name fails with a TypeError on lookup["x"].
LOOKUP_PATH = "lookup.json"
with open(LOOKUP_PATH, encoding="utf-8") as lookup_file:
    lookup = json.load(lookup_file)

# Transform the data for each file origin. A sheet belongs to an origin when
# its key contains the origin name (case-insensitive).
# NOTE: transform_data raises ValueError for any sheet whose name carries no
# TA/TI/TR technology code, so stray sheets must be removed from `df` first.
long_frames = {
    origin: transform_data(
        {k: v for k, v in df.items() if origin in k.lower()},
        lookup,
        origin,
    )
    for origin in ("harvested_area", "physical_area", "yield", "production")
}

# Keep the individual names available for the cells that follow.
harvested_long = long_frames["harvested_area"]
physical_long = long_frames["physical_area"]
yield_long = long_frames["yield"]
production_long = long_frames["production"]

# Example: Display the transformed data for harvested_area
harvested_long.head()