In [6]:
import re
import csv

# Patterns used while cleaning a wiki-table row; compiled once instead of on every line.
_QUESTLINK_RE = re.compile(r'questlink', re.IGNORECASE)
_TEMPLATE_RE = re.compile(r'\{\{.*?\|([^\}]+)\}\}')
_ITEM_NOISE_RE = re.compile(r'[{}]|\(Item\)|Collectable')
_CLUSTER_RE = re.compile(r'cluster', re.IGNORECASE)
_LINK_BRACKETS_RE = re.compile(r'\[\[|\]\]')


def _parse_node_row(line):
    """Convert one raw line into [time, item, location, coordinates], or None if the line is skipped.

    Expected row shape:
        |Time || {{item icon|Item Name}} || Slot || [[Location]] || (x..,y..) ...

    A line is skipped (None is returned) when it:
      * does not start with '|' after trimming,
      * has fewer than five '||'-separated cells,
      * contains "questlink" anywhere (case-insensitive), or
      * has a cleaned item name containing "cluster" (case-insensitive).
    """
    line = line.strip()
    if not line.startswith('|'):
        return None

    # "questlink" contains neither '|' nor whitespace, so searching the whole
    # row finds exactly the same hits as searching the individual cells.
    if _QUESTLINK_RE.search(line):
        return None

    cells = [cell.strip() for cell in line.split('||')]
    if len(cells) < 5:
        return None

    # Cell 2 (the slot) is intentionally ignored; cells past the fifth are dropped.
    time_cell = cells[0].lstrip('|').strip()
    item_cell = cells[1]
    location_cell = cells[3]
    coordinate = cells[4]

    # {{item icon|Broad Beans}} -> Broad Beans
    item_name = _TEMPLATE_RE.sub(r'\1', item_cell)
    # Drop leftover braces plus the literal "(Item)" and "Collectable" markers.
    item_name = _ITEM_NOISE_RE.sub('', item_name).strip()
    if _CLUSTER_RE.search(item_name):
        return None

    # [[Il Mheg]] -> Il Mheg
    location_name = _LINK_BRACKETS_RE.sub('', location_cell).strip()

    return [time_cell, item_name, location_name, coordinate]


def clean_unspoiled_data(input_filename, output_filename):
    """Extract node rows from a wiki-markup table and write them to a CSV file.

    Args:
        input_filename: Path of the UTF-8 text file holding the raw table markup.
        output_filename: Path of the CSV file to create (overwritten if it exists).

    The CSV always starts with the header ``Time, Item Name, Location, Coordinates``
    followed by one line per row accepted by ``_parse_node_row``, in input order.

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
    """
    with open(input_filename, 'r', encoding='utf-8') as f:
        data = f.read()

    parsed = map(_parse_node_row, data.split('\n'))
    rows = [row for row in parsed if row is not None]

    # newline='' lets the csv module control line endings itself.
    with open(output_filename, 'w', encoding='utf-8', newline='') as out_csv:
        writer = csv.writer(out_csv)
        writer.writerow(["Time", "Item Name", "Location", "Coordinates"])
        writer.writerows(rows)

if __name__ == "__main__":
    # Raw table dump (looked up in the current directory) and the CSV to produce from it.
    input_file, output_file = "unspoiled_nodes", "cleaned_nodes.csv"

    clean_unspoiled_data(input_file, output_file)
    print(f"Processed '{input_file}' and wrote results to '{output_file}'.")


Processed 'unspoiled_nodes' and wrote results to 'cleaned_nodes.csv'.


In [None]:
import pandas as pd
import json
import re

# Parse the item-ID JSON: each top-level key is used as an item ID and its
# value is queried for an "en" (English) name.
with open("item_ids.json", "r", encoding="utf-8") as f:
    item_json = json.load(f)

# Lookup table: trimmed, lower-cased English name -> item ID.
# When two entries normalise to the same name, the one that appears later in
# the JSON wins; entries without an "en" name all collapse onto the "" key.
item_mapping = {
    names.get("en", "").strip().lower(): item_id
    for item_id, names in item_json.items()
}

# Nodes to be matched against the lookup table.
nodes_df = pd.read_csv("cleaned_nodes.csv")

def clean_item_name(name):
    """Normalise an item name so it can be used as a lookup key.

    Every "(Rare)" marker (in any letter case) is removed together with the
    whitespace immediately in front of it; the result is then trimmed and
    lower-cased.
    """
    rare_marker = re.compile(r'\s*\(rare\)', re.IGNORECASE)
    without_marker = rare_marker.sub('', name)
    return without_marker.strip().lower()

# Add the normalised name as its own column; the original "Item Name" stays untouched.
nodes_df["Item Name Clean"] = nodes_df["Item Name"].map(clean_item_name)

def get_item_id(row):
    """Return the item ID for a row, or None when its cleaned name is unknown.

    The row's "Item Name Clean" value is looked up in the module-level
    ``item_mapping``. On a miss, an error line naming both the original and
    the cleaned item name is printed and None is returned.
    """
    name = row["Item Name Clean"]
    try:
        return item_mapping[name]
    except KeyError:
        print(f"Error: No ID found for item '{row['Item Name']}' (cleaned as '{name}').")
        return None

# Resolve an ID for every row; the row-wise apply hands get_item_id one full row at a time.
nodes_df["ID"] = nodes_df.apply(get_item_id, axis="columns")

# Put the ID first and leave the helper "Item Name Clean" column out of the export.
column_order = ["ID", "Time", "Item Name", "Location", "Coordinates"]
final_df = nodes_df.loc[:, column_order]

# Save the merged table without the DataFrame index.
final_df.to_csv("final_nodes_with_ids.csv", index=False)
print("Merged CSV written to 'final_nodes_with_ids.csv'.")



Error: No ID found for item 'Waterfowl Feather (Rare)' (cleaned as 'waterfowl feather (rare)').
Error: No ID found for item 'Mazlaya Greens (Rare)' (cleaned as 'mazlaya greens (rare)').
Error: No ID found for item 'Dzemael Tomato Seeds (Rare)' (cleaned as 'dzemael tomato seeds (rare)').
Merged CSV written to 'final_nodes_with_ids.csv'.
