## Processing Post-Harvest datasets

In [1]:
import pandas as pd

In [None]:
def transform_data(sheets_dict: dict[str, pd.DataFrame],
                   lookup: dict[str, str],
                   file_origin: str) -> pd.DataFrame:
    """Reshape one workbook of wide, per-technology sheets into one long table.

    Every sheet must contain the coordinate columns ``x`` and ``y``, the
    descriptive columns ``ADM1_NAME``, ``ADM2_NAME``, ``unit``, ``grid_code``
    and ``year_data``, and one value column per crop named
    ``<crop code>_<tech letter>`` (the match is case-insensitive).

    Parameters
    ----------
    sheets_dict
        Mapping of sheet name to DataFrame, e.g. the result of
        ``pd.read_excel(path, sheet_name=None)``. Each sheet name must end
        with ``(TA)``, ``(TI)`` or ``(TR)``.
    lookup
        Maps ``"x"`` and ``"y"`` to the output coordinate column names and
        lower-case crop codes to full crop names. Value columns whose code is
        not present in ``lookup`` are ignored.
    file_origin
        One of ``"harvested_area"``, ``"physical_area"``, ``"yield"`` or
        ``"production"``; selects the ``rec_type`` letter.

    Returns
    -------
    pd.DataFrame
        One row per (grid cell, crop, sheet) with the columns
        ``lookup["y"]``, ``lookup["x"]``, ``Crop type``, ``tech_type``,
        the water-management regime label, ``rec_type``, ``ADM1_NAME``,
        ``ADM2_NAME``, ``unit``, ``grid_code``, ``year_data`` and ``value``.
        An empty frame with those columns is returned when ``sheets_dict``
        is empty.

    Raises
    ------
    KeyError
        If ``file_origin`` is unknown, ``lookup`` lacks ``"x"``/``"y"``, or a
        sheet is missing one of the required columns.
    ValueError
        If a sheet name does not end with a recognised technology suffix.
    """
    # mapping from file_origin to rec_type letter
    rec_map = {
        "harvested_area": "H",
        "physical_area":  "A",
        "yield":          "Y",
        "production":     "P"
    }
    # mapping tech letter -> human label
    tech_map = {
        "A": "all technologies",
        "I": "irrigation",
        "R": "rainfed"
    }
    # mapping sheet-name suffix -> tech letter
    suffix_map = {
        "(TA)": "A",
        "(TI)": "I",
        "(TR)": "R"
    }

    try:
        rec_code = rec_map[file_origin]
    except KeyError:
        raise KeyError(
            f"Unknown file_origin '{file_origin}'; expected one of {sorted(rec_map)}"
        ) from None

    # NOTE(review): the coordinate labels are taken verbatim from `lookup`.
    # The sample output in this notebook suggests the x/y labels may be
    # swapped in lookup.json -- verify against the data.
    y_label = lookup["y"]
    x_label = lookup["x"]

    # NOTE: the hyphen in this column name is a non-ASCII hyphen character,
    # not "-". It is kept unchanged so existing consumers keep working.
    regime_col = "water‐management regime"

    # Only these columns survive into the output, so they are the only
    # id_vars worth carrying through the melt.
    id_cols = [
        y_label,
        x_label,
        "ADM1_NAME",
        "ADM2_NAME",
        "unit",
        "grid_code",
        "year_data"
    ]
    final_cols = [
        y_label,
        x_label,
        "Crop type",
        "tech_type",
        regime_col,
        "rec_type",
        "ADM1_NAME",
        "ADM2_NAME",
        "unit",
        "grid_code",
        "year_data",
        "value"
    ]

    long_dfs = []
    for sheet_name, df in sheets_dict.items():
        # 1) infer the tech letter from the sheet-name suffix: (TA)/(TI)/(TR)
        tech = next(
            (letter for suffix, letter in suffix_map.items()
             if sheet_name.endswith(suffix)),
            None
        )
        if tech is None:
            raise ValueError(f"Unrecognized tech suffix in '{sheet_name}'")

        # 2) Rename x/y to the coordinate labels supplied by the lookup
        df = df.rename(columns={"x": x_label, "y": y_label})

        missing = [c for c in id_cols if c not in df.columns]
        if missing:
            raise KeyError(
                f"Sheet '{sheet_name}' is missing required columns: {missing}"
            )

        # 3) Map each crop value column to its full crop name,
        #    e.g. BANA_I -> Banana. Columns with an unknown code are skipped.
        crop_names = {}
        for col in df.columns:
            if not isinstance(col, str) or not col.upper().endswith(f"_{tech}"):
                continue
            full_crop = lookup.get(col.rsplit("_", 1)[0].lower())
            if full_crop is not None:
                crop_names[col] = full_crop

        # 4) Melt to long form: one row per (grid cell, crop column)
        df_long = df.melt(
            id_vars=id_cols,
            value_vars=list(crop_names),
            var_name="variable",
            value_name="value"
        )

        # 5) Attach crop name, tech letter, regime label and rec_type
        df_long["Crop type"] = df_long["variable"].map(crop_names)
        df_long["tech_type"] = tech
        df_long[regime_col] = tech_map[tech]
        df_long["rec_type"] = rec_code

        # 6) Reorder & select final columns
        long_dfs.append(df_long[final_cols])

    # 7) Concatenate all tech-types for this file
    if not long_dfs:
        # pd.concat refuses an empty list; return an empty, well-formed table
        return pd.DataFrame(columns=final_cols)
    return pd.concat(long_dfs, ignore_index=True)

# Example usage:
# import json
# lookup = json.load(open("lookup.json"))
# harvested_sheets = pd.read_excel("harvested_area.xlsx", sheet_name=None)
# ha_long = transform_data(harvested_sheets, lookup, "harvested_area")
# physical_sheets = pd.read_excel("physical_area.xlsx", sheet_name=None)
# pa_long = transform_data(physical_sheets, lookup, "physical_area")
# ... and similarly for yield & production




In [15]:
import json

# Read the code -> label lookup; the context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open("lookup.json") as lookup_file:
    lookup = json.load(lookup_file)

# sheet_name=None loads every sheet of the workbook into a {sheet name: DataFrame} dict.
# NOTE(review): these are machine-specific absolute paths; consider a configurable data directory.
harvested_sheets = pd.read_excel("C:/Users/USER/Downloads/Hackathon/AgricConnect-PHL/dataset/harvest_data/harvested_area.xlsx", sheet_name=None)
ha_long = transform_data(harvested_sheets, lookup, "harvested_area")
physical_sheets = pd.read_excel("C:/Users/USER/Downloads/Hackathon/AgricConnect-PHL/dataset/harvest_data/Physical_area.xlsx", sheet_name=None)
pa_long = transform_data(physical_sheets, lookup, "physical_area")


In [16]:
# Preview the long-format harvested-area table returned by transform_data.
ha_long

Unnamed: 0,longitude of pixel center,latitude of pixel center,Crop type,tech_type,water‐management regime,rec_type,ADM1_NAME,ADM2_NAME,unit,grid_code,year_data,value
0,13.87500,5.54167,Banana,A,all technologies,H,Sokoto,Illela,ha,3946386,avg(2019-2021),0.0
1,13.79170,5.37500,Banana,A,all technologies,H,Sokoto,Gada,ha,3950704,avg(2019-2021),0.0
2,13.79170,5.79167,Banana,A,all technologies,H,Sokoto,Illela,ha,3950709,avg(2019-2021),0.0
3,13.70830,4.70833,Banana,A,all technologies,H,Sokoto,Gudu,ha,3955016,avg(2019-2021),0.0
4,13.70830,4.79167,Banana,A,all technologies,H,Sokoto,Gudu,ha,3955017,avg(2019-2021),0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1136333,4.37500,6.54167,Yams,R,rainfed,H,Bayelsa,Brass,ha,4438878,avg(2019-2021),0.7
1136334,4.37500,6.62500,Yams,R,rainfed,H,Bayelsa,Brass,ha,4438879,avg(2019-2021),17.7
1136335,4.37500,6.79167,Yams,R,rainfed,H,Rivers,Akuku-Toru,ha,4438881,avg(2019-2021),1.9
1136336,4.29167,6.12500,Yams,R,rainfed,H,Bayelsa,Brass,ha,4443193,avg(2019-2021),11.1


In [17]:
# Re-read the code -> label lookup; the context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open("lookup.json") as lookup_file:
    lookup = json.load(lookup_file)

# sheet_name=None loads every sheet of the workbook into a {sheet name: DataFrame} dict.
# NOTE(review): these are machine-specific absolute paths; consider a configurable data directory.
Production_sheets = pd.read_excel("C:/Users/USER/Downloads/Hackathon/AgricConnect-PHL/dataset/harvest_data/Production.xlsx", sheet_name=None)
Prod_long = transform_data(Production_sheets, lookup, "production")
yield_sheets = pd.read_excel("C:/Users/USER/Downloads/Hackathon/AgricConnect-PHL/dataset/harvest_data/yield.xlsx", sheet_name=None)
yield_long = transform_data(yield_sheets, lookup, "yield")

In [24]:
# Save the transformed data as CSV files.
# The files were previously named *.xlsx although to_csv writes plain CSV text,
# so spreadsheet tools and pd.read_excel could not open them. A real XLSX
# export is not an option here: ha_long alone has more rows than a single
# worksheet can hold, so the data stays CSV and the extension now says so.
ha_long.to_csv("C:/Users/USER/Downloads/Hackathon/AgricConnect-PHL/dataset/harvest_data/processed/harvested_long.csv", index=False)
pa_long.to_csv("C:/Users/USER/Downloads/Hackathon/AgricConnect-PHL/dataset/harvest_data/processed/physical_long.csv", index=False)
yield_long.to_csv("C:/Users/USER/Downloads/Hackathon/AgricConnect-PHL/dataset/harvest_data/processed/yield_long.csv", index=False)
Prod_long.to_csv("C:/Users/USER/Downloads/Hackathon/AgricConnect-PHL/dataset/harvest_data/processed/production_long.csv", index=False)

## Other datasets

In [25]:
import os
import pandas as pd

def extract_data(folder_path):
    """Load every CSV/Excel file directly inside ``folder_path`` into DataFrames.

    Files are processed in sorted name order so that the key order and any
    collision suffixes are reproducible between runs. Sub-folders are not
    traversed.

    Parameters
    ----------
    folder_path
        Directory to scan. Only entries ending in ``.csv``, ``.xlsx`` or
        ``.xls`` (case-insensitive) are read.

    Returns
    -------
    dict
        Maps a key to a DataFrame. A CSV file is stored under its base
        filename; each Excel sheet is stored under ``<base filename>_<sheet>``.
        If a key is already taken, ``_2``, ``_3``, ... is appended until the
        key is unique, so no table is ever overwritten.

    Notes
    -----
    A file that cannot be read is reported with ``print`` and skipped; the
    remaining files are still loaded.
    """
    supported_exts = {'.csv', '.xlsx', '.xls'}
    extracted_data = {}
    suffix_counter = {}

    for file_name in sorted(os.listdir(folder_path)):
        base_name, ext = os.path.splitext(file_name)
        ext = ext.lower()
        if ext not in supported_exts:
            continue

        file_path = os.path.join(folder_path, file_name)
        try:
            if ext == '.csv':
                # Single DataFrame for CSV
                frames = {base_name: pd.read_csv(file_path)}
            else:
                # Read all sheets, prefixing each sheet name with the base filename
                sheets = pd.read_excel(file_path, sheet_name=None)
                frames = {f"{base_name}_{sheet_name}": sheet
                          for sheet_name, sheet in sheets.items()}
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load
            print(f"Error processing file {file_name}: {e}")
            continue

        for key, frame in frames.items():
            unique_key = key
            if unique_key in extracted_data:
                # Keep counting until the suffixed key is free; a generated
                # name such as "sales_2" may itself already be in use.
                count = suffix_counter.get(key, 1)
                while True:
                    count += 1
                    unique_key = f"{key}_{count}"
                    if unique_key not in extracted_data:
                        break
                suffix_counter[key] = count

            extracted_data[unique_key] = frame

    return extracted_data


In [39]:
#Specify full folder path to the dataset
# NOTE: despite its name, `df` is the dict returned by extract_data
# (key -> DataFrame), not a single DataFrame.
df = extract_data("C:/Users/USER/Downloads/Hackathon/AgricConnect-PHL/dataset")

  df_dict = {base_name: pd.read_csv(file_path)}
  df_dict = {base_name: pd.read_csv(file_path)}


In [40]:
# List the keys of the tables that extract_data loaded.
df.keys()

dict_keys(['Data', 'FAOSTAT_data_en_5-8-2025', 'global-market-monitor_subnational', 'nga-rainfall-adm2-full', 'Population data', 'wfp_food_prices_nga'])

In [None]:
# Pick the table stored under the 'Data' key as the working `logistics` frame.
logistics= df['Data']


In [44]:
# Inspect the available columns before selecting a subset.
logistics.columns

Index(['m49_code', 'country', 'region', 'cpc_code', 'commodity', 'year',
       'loss_percentage', 'loss_percentage_original', 'loss_quantity',
       'activity', 'food_supply_stage', 'treatment', 'cause_of_loss',
       'sample_size', 'method_data_collection', 'reference', 'url', 'notes'],
      dtype='object')

In [47]:
# Keep only the 2019-2021 records and drop the 'reference', 'url' and 'notes' columns.
logistics_columns = [
    'm49_code', 'country', 'region', 'cpc_code', 'commodity', 'year',
    'loss_percentage', 'loss_percentage_original', 'loss_quantity',
    'activity', 'food_supply_stage', 'treatment', 'cause_of_loss',
    'sample_size', 'method_data_collection',
]
# One .loc selection plus an explicit .copy() yields an independent frame, so
# later in-place edits cannot raise SettingWithCopyWarning.
# between() is inclusive on both ends, matching `>= 2019` and `<= 2021`.
logistics = logistics.loc[logistics['year'].between(2019, 2021), logistics_columns].copy()

In [49]:
# Replace missing values with the literal "N/A".
# Reassigning instead of inplace=True avoids mutating a filtered frame in place
# and keeps the cell safe to re-run.
# NOTE(review): filling numeric columns with a string turns them into object
# columns -- confirm no later step needs them to stay numeric.
logistics = logistics.fillna("N/A")

In [None]:
# Display the filtered logistics table.
logistics

Unnamed: 0,m49_code,country,region,cpc_code,commodity,year,loss_percentage,loss_percentage_original,loss_quantity,activity,food_supply_stage,treatment,cause_of_loss,sample_size,method_data_collection
0,566,Nigeria,Katsina,112.0,Maize (corn),2021,2.37259,2.37259,,Transportation,Farm,,,,Modelled Estimates
1,566,Nigeria,Katsina,112.0,Maize (corn),2021,4.58573,4.58573,,Storage,Farm,,,,Modelled Estimates
2,566,Nigeria,Katsina,112.0,Maize (corn),2021,6.42,6.42,,"Drying, Harvesting",Harvest,,,,Modelled Estimates
3,566,Nigeria,Katsina,112.0,Maize (corn),2021,4.0,4.0,,Drying,Farm,,,,Modelled Estimates
4,566,Nigeria,Katsina,112.0,Maize (corn),2021,1.32,1.32,,"Shelling, Threshing",,,,,Modelled Estimates
5,566,Nigeria,Benue State,113.0,Rice,2021,2.5,2.5,,Winnowing,Farm,,,,Modelled Estimates
6,566,Nigeria,Benue State,113.0,Rice,2021,4.40333,4.40333,,"Drying, Harvesting",Harvest,,,,Modelled Estimates
7,566,Nigeria,Benue State,113.0,Rice,2021,3.13714,3.13714,,"Shelling, Threshing",,,,,Modelled Estimates
8,566,Nigeria,Benue State,113.0,Rice,2021,0.980316,0.980316,,Storage,Farm,,,,Modelled Estimates
9,566,Nigeria,Benue State,113.0,Rice,2021,1.25,1.25,,Transportation,Farm,,,,Modelled Estimates
