In [65]:
import re
import csv

def clean_unspoiled_data(input_filename, output_filename):
    """Extract node rows from a wiki-style table dump and write them to a CSV.

    Only lines of the form
        |Time || {{item icon|Item Name}} || Slot || [[Location]] || (x..,y..) ...
    are considered, i.e. lines that start with '|' and have at least five
    '||'-separated cells. Lines mentioning "questlink" and items whose cleaned
    name contains "cluster" are dropped (both checks are case-insensitive).

    Args:
        input_filename: Path of the UTF-8 text file to read.
        output_filename: Path of the CSV to create; it gets the header
            ``Time, Item Name, Location, Coordinates`` followed by one row
            per retained line.
    """
    questlink_marker = re.compile(r'questlink', re.IGNORECASE)
    cluster_marker = re.compile(r'cluster', re.IGNORECASE)
    # {{item icon|Broad Beans}} => Broad Beans
    item_template = re.compile(r'\{\{.*?\|([^\}]+)\}\}')
    # Leftover braces, "(Item)" and "Collectable" text.
    item_noise = re.compile(r'[{}]|\(Item\)|Collectable')
    # [[Il Mheg]] => Il Mheg
    link_brackets = re.compile(r'\[\[|\]\]')

    # Collected rows of [Time, Item Name, Location, Coordinates].
    rows = []

    with open(input_filename, 'r', encoding='utf-8') as source:
        for raw_line in source:
            text = raw_line.strip()
            if not text.startswith('|'):
                continue

            cells = [cell.strip() for cell in text.split('||')]
            if len(cells) < 5:
                continue

            # The questlink check looks at every cell of the line.
            if questlink_marker.search(' '.join(cells)):
                continue

            item_name = item_noise.sub('', item_template.sub(r'\1', cells[1])).strip()
            if cluster_marker.search(item_name):
                continue

            rows.append([
                cells[0].lstrip('|').strip(),              # time, without the leading '|'
                item_name,
                link_brackets.sub('', cells[3]).strip(),   # location
                cells[4],                                  # coordinates, kept verbatim
            ])

    with open(output_filename, 'w', encoding='utf-8', newline='') as out_csv:
        sink = csv.writer(out_csv)
        sink.writerow(["Time", "Item Name", "Location", "Coordinates"])
        sink.writerows(rows)

if __name__ == "__main__":
    # Raw dump in the current directory -> cleaned CSV next to it.
    input_file, output_file = "unspoiled_nodes", "cleaned_nodes.csv"
    clean_unspoiled_data(input_file, output_file)
    print(f"Processed '{input_file}' and wrote results to '{output_file}'.")


Processed 'unspoiled_nodes' and wrote results to 'cleaned_nodes.csv'.


In [66]:
import pandas as pd
import json
import re

# Load the item-ID table: a JSON object keyed by item ID; each value is read
# below as a dict that may carry an "en" (English) name.
with open("item_ids.json", "r", encoding="utf-8") as f:
    item_json = json.load(f)

# Stripped, lower-cased English name -> item ID.
# When two IDs share a name, the one that appears later in the JSON wins.
item_mapping = {
    names.get("en", "").strip().lower(): item_id
    for item_id, names in item_json.items()
}

# Rows produced by the cleaning step.
nodes_df = pd.read_csv("cleaned_nodes.csv")

# Function to clean item names for matching:
# Remove occurrences of "(Rare)" (case-insensitive), then lower-case and strip.
def clean_item_name(name):
    """Return the normalized form of an item name used for ID matching.

    Every "(Rare)" marker (any letter case), together with the whitespace
    directly before it, is removed; the result is stripped and lower-cased.
    """
    rare_marker = re.compile(r'\s*\(rare\)', re.IGNORECASE)
    without_marker = rare_marker.sub('', name)
    trimmed = without_marker.strip()
    return trimmed.lower()

# Normalized name column used as the lookup key into item_mapping.
nodes_df["Item Name Clean"] = nodes_df["Item Name"].map(clean_item_name)

# Function to look up the item ID using the cleaned item name.
def get_item_id(row):
    """Return the item ID for a row, or None when its cleaned name is unknown.

    Reads ``row["Item Name Clean"]`` as the key into ``item_mapping``; on a
    miss, prints a message quoting both the original and the cleaned name.
    """
    name = row["Item Name Clean"]
    if name not in item_mapping:
        print(f"Error: No ID found for item '{row['Item Name']}' (cleaned as '{name}').")
        return None
    return item_mapping[name]

# Resolve an ID for every row; rows without a match get None (get_item_id reports them).
nodes_df["ID"] = nodes_df.apply(get_item_id, axis="columns")

# Output layout: ID first, then the original columns.
# The helper "Item Name Clean" column is not carried over.
output_columns = ["ID", "Time", "Item Name", "Location", "Coordinates"]
final_df = nodes_df.loc[:, output_columns]

# Persist the merged table for the next step.
final_df.to_csv("final_nodes_with_ids.csv", index=False)
print("Merged CSV written to 'final_nodes_with_ids.csv'.")


Merged CSV written to 'final_nodes_with_ids.csv'.


In [67]:
import pandas as pd
from datetime import datetime, timedelta
import time
import re

# -----------------------
# 1. Compute current Eorzean time in 24-hour format
# -----------------------
# Wall-clock time in milliseconds, scaled by the real-time -> Eorzean-time factor.
local_epoch = int(time.time() * 1000)
epoch = local_epoch * 20.571428571428573

# Split the scaled millisecond count into minute-of-hour and hour-of-day.
minutes = int((epoch / 60000) % 60)
hours_24 = int((epoch / 3600000) % 24)

# 24-hour clock text, e.g. "13:59".
et_time_str = "{:02d}:{:02d}".format(hours_24, minutes)
print("Current Eorzean time (raw):", et_time_str)

# -----------------------
# 2. Round up to the next hour if minutes > 0 (using 24-hour format)
# -----------------------
# Same value that parsing et_time_str with "%H:%M" yields: the date part is 1900-01-01.
current_time = datetime(1900, 1, 1, hours_24, minutes)
threshold_time = current_time
if current_time.minute > 0:
    # Drop the partial hour, then step to the start of the next one.
    threshold_time = current_time.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)

print("Threshold Eorzean time (rounded):", threshold_time.strftime("%H:%M"))

# -----------------------
# 3. Load and process final_nodes_with_ids.csv, duplicating ambiguous AM/PM entries.
# -----------------------
# Accumulates the expanded rows; the loop below appends to it, adding two rows
# for every entry whose time carries the ambiguous "AM/PM" marker.
new_rows = []

df = pd.read_csv("final_nodes_with_ids.csv")

def standard_parse_time(time_str):
    """Parse a clock string that does not carry the ambiguous "AM/PM" marker.

    A string with a standalone "AM" or "PM" suffix (any letter case, with or
    without a space before it, e.g. "9:00 PM" or "9:00PM") is read as a
    12-hour time. Anything else, including a string that contains "AM/PM",
    is read as a 24-hour "HH:MM" time.

    Args:
        time_str: The time text; surrounding whitespace is ignored.

    Returns:
        A ``datetime`` on the default strptime date (1900-01-01).

    Raises:
        ValueError: If the text matches neither format.
    """
    text = time_str.strip()

    # A plain r'\b(AM|PM)\b' cannot match "9:00AM": there is no word boundary
    # between a digit and a letter. Only require that the marker is not the
    # tail of a longer alphabetic word.
    has_meridiem = re.search(r'(?<![A-Za-z])(AM|PM)\b', text, re.IGNORECASE)

    if has_meridiem and "AM/PM" not in text.upper():
        # Normalize to exactly one space between the digits and the marker.
        spaced = re.sub(r'(\d)\s*(AM|PM)\b', r'\1 \2', text, flags=re.IGNORECASE)
        return datetime.strptime(spaced, "%I:%M %p")

    return datetime.strptime(text, "%H:%M")

for _, record in df.iterrows():
    raw_time = record["Time"].strip()

    if "AM/PM" not in raw_time.upper():
        # Unambiguous entry: exactly one interpretation.
        interpretations = [standard_parse_time(raw_time)]
    else:
        # Ambiguous entry: drop the marker and read the rest as "HH:MM".
        bare_time = re.sub(r'\s*AM/PM', '', raw_time, flags=re.IGNORECASE).strip()
        base = datetime.strptime(bare_time, "%H:%M")
        if base.hour == 12:
            # 12:xx AM is 00:xx; 12:xx PM stays 12:xx.
            interpretations = [base.replace(hour=0), base]
        else:
            interpretations = [base, base + timedelta(hours=12)]

    # One output row per interpretation (AM first, then PM), with the time
    # column rewritten in 24-hour form.
    for moment in interpretations:
        expanded = record.copy()
        expanded["Parsed Time"] = moment
        expanded["Time"] = moment.strftime("%H:%M")
        new_rows.append(expanded)

# Assemble the processed rows into a new DataFrame.
df_new = pd.DataFrame(new_rows)

# -----------------------
# 4. Adjust times for wrapping past midnight and filter for the next 24 hours
# -----------------------
def effective_time(row_time, threshold):
    """Return ``row_time``, moved one day later when it is before ``threshold``.

    A time at or after the threshold is returned unchanged.
    """
    if row_time < threshold:
        return row_time + timedelta(days=1)
    return row_time

# Times before the threshold are treated as occurring on the following day.
df_new["Effective Time"] = df_new["Parsed Time"].apply(effective_time, args=(threshold_time,))

# The window is [threshold_time, threshold_time + 24h).
end_time = threshold_time + timedelta(days=1)
effective = df_new["Effective Time"]
in_window = effective.ge(threshold_time) & effective.lt(end_time)
df_filtered = df_new.loc[in_window]

# Earliest upcoming entries first.
df_sorted = df_filtered.sort_values(by="Effective Time")

print("\nItems scheduled in the next 24 hours (24-hour military format):")
print(df_sorted[["ID", "Time", "Item Name", "Location", "Coordinates"]])
df_sorted.to_csv("final_nodes_with_ids_sorted.csv", index=False)


Current Eorzean time (raw): 22:59
Threshold Eorzean time (rounded): 23:00

Items scheduled in the next 24 hours (24-hour military format):
        ID   Time                    Item Name                Location  \
0    27833  00:00                  Broad Beans                 Il Mheg   
71   27729  00:00                 Raw Triplite              Amh Araeng   
1    27828  00:00                 Mist Spinach  The Rak'tika Greatwood   
72   27731  00:00                     Raw Onyx             The Tempest   
26   12899  00:00                      Porcini      The Churning Mists   
..     ...    ...                          ...                     ...   
102  12901  22:00         Abalathian Rock Salt       The Sea of Clouds   
95   32987  22:00  Rarefied Gyr Abanian Alumen             The Fringes   
11   33004  22:00   Rarefied Miracle Apple Log                 Il Mheg   
10   33002  22:00         Rarefied Pixie Apple                 Il Mheg   
22   19860  22:00                 Bamboo Shoot 

In [68]:
# Requires an internet connection and access to the Universalis API.
# Sends one request to the Universalis API for each of the first 10 items in final_nodes_with_ids_sorted.csv
# and appends the market values returned for that item to its row.

import pandas as pd
import requests
from datetime import datetime, timedelta

# -----------------------
# 1. Read the previously generated sorted CSV and take the first 10 rows.
# -----------------------
# Market fields added to every row, in output-column order.
market_columns = [
    "minListing_world",
    "minListing_dc",
    "recentPurchase_world",
    "recentPurchase_dc",
    "averageSalePrice_dc",
    "dailySaleVelocity_dc",
]

# -----------------------
# 2. Set default world and define a function to fetch market data.
# -----------------------
world = "Seraph"

# First 10 rows of the sorted schedule, as an independent copy with every
# market column present and initialised to None.
df_sorted = pd.read_csv("final_nodes_with_ids_sorted.csv")
df_top10 = df_sorted.iloc[:10].assign(**dict.fromkeys(market_columns))

def fetch_market_data(item_id, world, timeout=10):
    """Fetch aggregated market figures for one item from the Universalis API.

    Requests ``https://universalis.app/api/v2/aggregated/{world}/{item_id}``
    and reads six values from the "nq" section of the first entry in
    "results".

    Args:
        item_id: Item identifier. Anything ``int()`` accepts is normalised to
            an integer, so an ID that became a float after a CSV round trip
            (e.g. 27833.0) still yields a valid URL. None/NaN is treated as
            "no ID" and no request is sent.
        world: World name placed in the URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block indefinitely.

    Returns:
        A dict with the keys ``minListing_world``, ``minListing_dc``,
        ``recentPurchase_world``, ``recentPurchase_dc``,
        ``averageSalePrice_dc`` and ``dailySaleVelocity_dc``. A value is None
        when the response does not contain it; all values are None when the
        lookup fails (invalid ID, network error, non-200 status, no results).
        Failures are reported with ``print`` rather than raised.
    """
    # Output key -> path of keys below the result's "nq" section.
    field_paths = {
        "minListing_world": ("minListing", "world", "price"),
        "minListing_dc": ("minListing", "dc", "price"),
        "recentPurchase_world": ("recentPurchase", "world", "price"),
        "recentPurchase_dc": ("recentPurchase", "dc", "price"),
        "averageSalePrice_dc": ("averageSalePrice", "dc", "price"),
        "dailySaleVelocity_dc": ("dailySaleVelocity", "dc", "quantity"),
    }
    no_data = dict.fromkeys(field_paths)

    def _dig(node, path):
        # Walk nested dicts; a missing key or a non-dict level (e.g. null)
        # yields None for this field only instead of raising.
        for key in path:
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node

    try:
        item_id = int(item_id)
    except (TypeError, ValueError, OverflowError):
        print(f"Skipping market lookup: invalid item ID {item_id!r}")
        return no_data

    url = f"https://universalis.app/api/v2/aggregated/{world}/{item_id}"
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error fetching data for item ID {item_id}. Status code: {response.status_code}")
            return no_data
        data = response.json()
    except Exception as e:
        # Deliberately best-effort: one failing item must not abort the batch.
        print(f"Exception for item ID {item_id}: {e}")
        return no_data

    results = data.get("results") if isinstance(data, dict) else None
    if not isinstance(results, list) or not results:
        print(f"No results found for item ID {item_id}")
        return no_data

    nq = _dig(results[0], ("nq",))
    return {column: _dig(nq, path) for column, path in field_paths.items()}

# -----------------------
# 3. Loop through the first 10 rows, fetch market data for each item, and update the DataFrame.
# -----------------------
# The "ID" column is taken to be the item ID the API expects.
for row_label, item_id in df_top10["ID"].items():
    fetched = fetch_market_data(item_id, world)
    for column, value in fetched.items():
        df_top10.at[row_label, column] = value

# -----------------------
# 4. Save the augmented DataFrame to a new CSV file.
# -----------------------
df_top10.to_csv("final_nodes_with_ids_market.csv", index=False)
print("CSV file with market data saved as 'final_nodes_with_ids_market.csv'.")


Error fetching data for item ID 32996. Status code: 400
Error fetching data for item ID 32995. Status code: 400
CSV file with market data saved as 'final_nodes_with_ids_market.csv'.


In [69]:
import pandas as pd

from IPython.display import display

# Load the CSV with market data (written from at most 10 rows) and render it.
df_market = pd.read_csv("final_nodes_with_ids_market.csv")
display(df_market)


Unnamed: 0,ID,Time,Item Name,Location,Coordinates,Parsed Time,Effective Time,minListing_world,minListing_dc,recentPurchase_world,recentPurchase_dc,averageSalePrice_dc,dailySaleVelocity_dc
0,27833,00:00,Broad Beans,Il Mheg,"(x24,y36)",1900-01-01 00:00:00,1900-01-02 00:00:00,11.0,7.0,19.0,150.0,,
1,27729,00:00,Raw Triplite,Amh Araeng,"(x20,y29)",1900-01-01 00:00:00,1900-01-02 00:00:00,50.0,5.0,60.0,5.0,5.0,2.332169
2,27828,00:00,Mist Spinach,The Rak'tika Greatwood,"(x34,y21)",1900-01-01 00:00:00,1900-01-02 00:00:00,46.0,8.0,40.0,100.0,131.314917,46.90247
3,27731,00:00,Raw Onyx,The Tempest,"(x16,y21)",1900-01-01 00:00:00,1900-01-02 00:00:00,,1000.0,2000.0,500.0,1456.841667,93.286545
4,12899,00:00,Porcini,The Churning Mists,"(x24,y6)",1900-01-01 00:00:00,1900-01-02 00:00:00,73.0,5.0,42.0,42.0,70.333333,10.883423
5,12943,00:00,Dravanian Mistletoe,The Churning Mists,"(x24,y6)",1900-01-01 00:00:00,1900-01-02 00:00:00,800.0,300.0,900.0,115.0,711.962437,255.241985
6,32996,00:00,Rarefied Dark Chestnut Resin,The Dravanian Forelands,"(x16,y36)",1900-01-01 00:00:00,1900-01-02 00:00:00,,,,,,
7,32995,00:00,Rarefied Dark Chestnut,The Dravanian Forelands,"(x16,y36)",1900-01-01 00:00:00,1900-01-02 00:00:00,,,,,,
8,19907,00:00,Ala Mhigan Salt Crystal,The Lochs,"(x21,y29)",1900-01-01 00:00:00,1900-01-02 00:00:00,355.0,4.0,375.0,375.0,391.607422,132.673739
9,19970,00:00,Raw Star Spinel,The Ruby Sea,"(x15,y5)",1900-01-01 00:00:00,1900-01-02 00:00:00,22.0,5.0,175.0,5.0,5.0,2.591282


In [70]:
import pandas as pd
import requests
from datetime import datetime, timedelta

# -----------------------
# 1. Read the previously generated sorted CSV and take the first 10 rows that do NOT contain "Rarefied" in the Item Name.
# -----------------------
# Market fields added to every row, in output-column order.
market_columns = [
    "minListing_world",
    "minListing_dc",
    "recentPurchase_world",
    "recentPurchase_dc",
    "averageSalePrice_dc",
    "dailySaleVelocity_dc",
]

# -----------------------
# 2. Set default world and define a function to fetch market data.
# -----------------------
world = "Seraph"

df_sorted = pd.read_csv("final_nodes_with_ids_sorted.csv")

# Drop every row whose Item Name contains "Rarefied" (case-insensitive);
# rows with a missing name are kept.
is_rarefied = df_sorted["Item Name"].str.contains("Rarefied", case=False, na=False)
df_filtered = df_sorted.loc[~is_rarefied]

# First 10 remaining rows, as an independent copy with every market column
# present and initialised to None.
df_top10 = df_filtered.iloc[:10].assign(**dict.fromkeys(market_columns))

def fetch_market_data(item_id, world, timeout=10):
    """Fetch aggregated market figures for one item from the Universalis API.

    Requests ``https://universalis.app/api/v2/aggregated/{world}/{item_id}``
    and reads six values from the "nq" section of the first entry in
    "results".

    Args:
        item_id: Item identifier. Anything ``int()`` accepts is normalised to
            an integer, so an ID that became a float after a CSV round trip
            (e.g. 27833.0) still yields a valid URL. None/NaN is treated as
            "no ID" and no request is sent.
        world: World name placed in the URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block indefinitely.

    Returns:
        A dict with the keys ``minListing_world``, ``minListing_dc``,
        ``recentPurchase_world``, ``recentPurchase_dc``,
        ``averageSalePrice_dc`` and ``dailySaleVelocity_dc``. A value is None
        when the response does not contain it; all values are None when the
        lookup fails (invalid ID, network error, non-200 status, no results).
        Failures are reported with ``print`` rather than raised.
    """
    # Output key -> path of keys below the result's "nq" section.
    field_paths = {
        "minListing_world": ("minListing", "world", "price"),
        "minListing_dc": ("minListing", "dc", "price"),
        "recentPurchase_world": ("recentPurchase", "world", "price"),
        "recentPurchase_dc": ("recentPurchase", "dc", "price"),
        "averageSalePrice_dc": ("averageSalePrice", "dc", "price"),
        "dailySaleVelocity_dc": ("dailySaleVelocity", "dc", "quantity"),
    }
    no_data = dict.fromkeys(field_paths)

    def _dig(node, path):
        # Walk nested dicts; a missing key or a non-dict level (e.g. null)
        # yields None for this field only instead of raising.
        for key in path:
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node

    try:
        item_id = int(item_id)
    except (TypeError, ValueError, OverflowError):
        print(f"Skipping market lookup: invalid item ID {item_id!r}")
        return no_data

    url = f"https://universalis.app/api/v2/aggregated/{world}/{item_id}"
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error fetching data for item ID {item_id}. Status code: {response.status_code}")
            return no_data
        data = response.json()
    except Exception as e:
        # Deliberately best-effort: one failing item must not abort the batch.
        print(f"Exception for item ID {item_id}: {e}")
        return no_data

    results = data.get("results") if isinstance(data, dict) else None
    if not isinstance(results, list) or not results:
        print(f"No results found for item ID {item_id}")
        return no_data

    nq = _dig(results[0], ("nq",))
    return {column: _dig(nq, path) for column, path in field_paths.items()}

# -----------------------
# 3. Loop through the first 10 rows, fetch market data for each item, and update the DataFrame.
# -----------------------
# The "ID" column is taken to be the item ID the API expects.
for row_label, item_id in df_top10["ID"].items():
    fetched = fetch_market_data(item_id, world)
    for column, value in fetched.items():
        df_top10.at[row_label, column] = value

# -----------------------
# 4. Save the augmented DataFrame to a new CSV file.
# -----------------------
df_top10.to_csv("final_nodes_with_ids_market.csv", index=False)
print("CSV file with market data saved as 'final_nodes_with_ids_market.csv'.")


CSV file with market data saved as 'final_nodes_with_ids_market.csv'.


In [71]:
import pandas as pd
from IPython.display import display

# Load the CSV with market data.
df_market = pd.read_csv("final_nodes_with_ids_market.csv")

# Keep only rows whose Item Name does not contain "Rarefied"
# (case-insensitive; rows with a missing name are kept).
rarefied_mask = df_market["Item Name"].str.contains("Rarefied", case=False, na=False)
df_market_filtered = df_market.loc[~rarefied_mask]

# Render the filtered table.
display(df_market_filtered)


Unnamed: 0,ID,Time,Item Name,Location,Coordinates,Parsed Time,Effective Time,minListing_world,minListing_dc,recentPurchase_world,recentPurchase_dc,averageSalePrice_dc,dailySaleVelocity_dc
0,27833,00:00,Broad Beans,Il Mheg,"(x24,y36)",1900-01-01 00:00:00,1900-01-02 00:00:00,11.0,7,19,150,,
1,27729,00:00,Raw Triplite,Amh Araeng,"(x20,y29)",1900-01-01 00:00:00,1900-01-02 00:00:00,50.0,5,60,5,5.0,2.332151
2,27828,00:00,Mist Spinach,The Rak'tika Greatwood,"(x34,y21)",1900-01-01 00:00:00,1900-01-02 00:00:00,46.0,8,40,100,131.314917,46.902108
3,27731,00:00,Raw Onyx,The Tempest,"(x16,y21)",1900-01-01 00:00:00,1900-01-02 00:00:00,,1000,2000,500,1456.841667,93.285901
4,12899,00:00,Porcini,The Churning Mists,"(x24,y6)",1900-01-01 00:00:00,1900-01-02 00:00:00,73.0,5,42,42,70.333333,10.883347
5,12943,00:00,Dravanian Mistletoe,The Churning Mists,"(x24,y6)",1900-01-01 00:00:00,1900-01-02 00:00:00,800.0,300,900,115,711.962437,255.240221
6,19907,00:00,Ala Mhigan Salt Crystal,The Lochs,"(x21,y29)",1900-01-01 00:00:00,1900-01-02 00:00:00,355.0,4,375,375,391.607422,132.673008
7,19970,00:00,Raw Star Spinel,The Ruby Sea,"(x15,y5)",1900-01-01 00:00:00,1900-01-02 00:00:00,22.0,5,175,5,5.0,2.591268
8,12538,00:00,Adamantite Ore,Azys Lla,"(x24,y6)",1900-01-01 00:00:00,1900-01-02 00:00:00,515.0,95,519,880,643.249723,467.98267
9,5121,01:00,Darksteel Ore,Coerthas Central Highlands,"(x27,y19)",1900-01-01 01:00:00,1900-01-02 01:00:00,777.0,530,667,800,682.480417,10248.191723
