## Extracting coded categories from the dataset

In [7]:
# Imports 
import pandas as pd
import json
import re

# Read the Excel file (path is relative to the notebook's working directory).
# NOTE: `df` is modified further down in this cell -- its 'Category 2' and
# 'Category 3' columns are overwritten with the cleaned labels.
df = pd.read_excel('RESOURCES.xlsx')

def clean_category2(category):
    """Drop the leading numeric code group from a Category 2 label.

    Args:
        category: A cell value from the 'Category 2' column.

    Returns:
        For strings, the label with its first run of digits (and the
        whitespace that follows it) removed, e.g.
        "01 06 Logistics" -> "06 Logistics". A string that does not start
        with digits followed by whitespace is returned unchanged.
        Any non-string value (NaN/None for blank cells, numbers, ...) is
        returned unchanged.
    """
    # Only strings can carry a code prefix. Guarding on the type (as
    # clean_category3 does) instead of on pd.isna() keeps non-string cells,
    # such as a numeric-only cell, from reaching re.sub and raising TypeError.
    if not isinstance(category, str):
        return category
    # Remove first number group (e.g., "01 06 Logistics" -> "06 Logistics")
    return re.sub(r'^\d+\s+', '', category)

def clean_category3(category):
    """Reduce a Category 3 label to its last numeric code plus description.

    The label is split on whitespace and scanned from the right for the
    last token made up only of digits; the tokens to the right of it form
    the description, e.g. "01 06 02 Road Transport" -> "02 Road Transport".
    If no all-digit token exists but the first token starts with digits
    (e.g. "02Road"), that token is split into code and text.

    Args:
        category: A cell value from the 'Category 3' column.

    Returns:
        "<code> <description>" with single spaces between tokens when both
        a code and a description were found. Otherwise the input is
        returned unchanged; this includes non-string values, labels with
        no code, and labels whose final token is all digits.
    """
    if not isinstance(category, str):
        return category

    tokens = category.split()

    # Position of the right-most all-digit token, if there is one.
    code_index = next(
        (idx for idx in reversed(range(len(tokens)))
         if re.match(r'^\d+$', tokens[idx])),
        None,
    )

    if code_index is not None:
        code = tokens[code_index]
        description = tokens[code_index + 1:]
    else:
        code = None
        description = list(tokens)
        # No stand-alone code: look for digits fused onto the first token.
        if description:
            fused = re.match(r'^(\d+)\s*(.*)', description[0])
            if fused:
                code = fused.group(1)
                description[0] = fused.group(2)

    # Without both pieces there is nothing to rebuild; keep the input as is.
    if not (code and description):
        return category
    return f"{code} {' '.join(description)}"

# Clean categories: overwrite the raw labels with their shortened forms.
df['Category 2'] = df['Category 2'].apply(clean_category2)
df['Category 3'] = df['Category 3'].apply(clean_category3)


def _unique_labels(column):
    """Return a column's distinct values in first-seen order, skipping blanks.

    Missing values are dropped, and so are strings that are empty or
    whitespace-only: such cells (e.g. ' ') survive dropna() but are not
    real categories.
    """
    values = column.dropna().unique().tolist()
    return [value for value in values
            if not (isinstance(value, str) and not value.strip())]


# Extract unique values from Category1, Category2, and Category3 columns
unique_category1 = _unique_labels(df['Category 1'])
unique_category2 = _unique_labels(df['Category 2'])
unique_category3 = _unique_labels(df['Category 3'])

# Create a dictionary to hold the unique categories
unique_categories = {
    "Category1": unique_category1,
    "Category2": unique_category2,
    "Category3": unique_category3
}

# Save to JSON file. The path is held in one variable so the confirmation
# message below always names the file that was actually written.
output_path = 'unique_categories_final.json'
with open(output_path, 'w') as json_file:
    json.dump(unique_categories, json_file, indent=4)

print(f"Unique categories have been saved to {output_path}")

Unique categories have been saved to unique_categories.json


In [15]:
# Total number of distinct values across the three category columns.
# The workbook is read again here, so the counts are taken on the labels
# exactly as stored in the file; each column is counted separately.
data = pd.read_excel('RESOURCES.xlsx')
category_columns = ('Category 1', 'Category 2', 'Category 3')
print(sum(data[column].nunique() for column in category_columns))

376


In [8]:
# Show every distinct 'Category 2' value found in the freshly read workbook.
category2_values = data['Category 2'].unique()
print(category2_values)

['01 06 Logistics' '06 03 Compost, Peat Etc' '01 10 Equipment'
 '01 04 Manufacturing / Processing' '06 05 Green Waste'
 '06 02 Arable Produce & Wastes (Non-Fuel)' '04 02 Heat' '20 05 Other'
 '16 02 Batteries' '12 19 Solvents' '03 04 Paints & Varnishes'
 '19 04 Plastic (Unspecified)' '01 11 Labour'
 '06 06 Other Food & Beverage Wastes' '06 07 Prepared Food Waste'
 '12 18 Soaps, Detergents, Oils, Waxes, Etc' '20 14 Polyurethane PU PUR'
 '99 99 Unspecified' '22 02 Product' '22 03 Processed Wood' ' '
 '20 99 Unspecified' '15 05 Wastewater' '01 09 Waste Handling Facilities'
 '22 01 Production' '12 01 Alcohols, Glycols, Glycerol '
 '07 07 Calcium Compounds (See Also Alkalis)' '07 02 Alkalies'
 '12 11 Organic Acids, Salts & Anhydrides ' '10 03 Pure Metals'
 '07 03 Aluminum Compounds' '21 04 Tyres' '10 01 Alloys, Including Steel'
 '07 18 Nitrogen Compounds, Ammonium Salts' '01 03 Laboratories'
 '14 02 Leather' '01 08 Storage' '07 99 Unspecified' '01 07 Office Space'
 '11 01 Ceramics, Refractor

In [10]:
# Show every distinct 'Category 3' value, followed by how many there are
# (nunique() leaves missing values out of the count).
category3_column = data['Category 3']
print(category3_column.unique(), category3_column.nunique())

['01 06 02 Road Transport' '06 03 04 Topsoil' '01 10 02 To Use On - Site'
 '01 04 02 Manufacturing' '06 05 01 Fruit/Vegetable Waste - Compostable'
 '06 02 02 Other' '06 02 99 Unspecified' '04 02 01 Hot water'
 '20 05 01 Other' '16 02 04 Other Batteries' '12 19 01 Acetone'
 '03 04 01 Other Paints & Varnishes (incl. Enamels)'
 '19 04 01 Plastic (Unspecified)' '01 11 01 Manpower'
 '06 06 03 Other Food Sludges & Wastes' '04 02 02 Other'
 '06 07 02 Not Containing Animal Products'
 '12 18 02 Lubricating Preparations, & Other Oils Etc'
 '20 14 01 Polyurethane PU PUR' '99 99 99 Unspecified' '22 02 03 Off cuts'
 '22 03 99 Unspecified' ' ' '20 99 99 Unspecified'
 '15 05 05 Industrial Process Plant Water' '15 05 99 Unspecified'
 '01 09 03 Other' '22 01 99 Unspecified' '12 01 99 Unspecified'
 '07 07 99 Unspecified' '07 02 03 Other Alkalis' '12 11 07 Lactic Acid'
 '04 02 99 Unspecified' '10 03 23 Aluminium'
 '07 03 03 Other Aluminium Compounds' '10 03 10 Other' '21 04 01 Tyres'
 '10 01 01 Aluminium