# Market Data Stripper - Jupyter Notebook Version

In [18]:
try:
    import flask
    import PIL
    import easyocr
    import numpy
    import requests
    import concurrent.futures
    print("All required libraries are already installed!")
except ImportError:
    !pip install flask pillow easyocr numpy requests
    import concurrent.futures
    print("Installed required libraries.")

# Initialize EasyOCR reader once at startup to download models
print("Downloading EasyOCR models (this may take a few minutes)...")
try:
    reader = easyocr.Reader(['en'])
    print("EasyOCR models downloaded successfully!")
except Exception as e:
    print(f"Error downloading EasyOCR models: {e}")



All required libraries are already installed!
Downloading EasyOCR models (this may take a few minutes)...
EasyOCR models downloaded successfully!


## Imports

In [19]:
from flask import Flask, request, send_from_directory, redirect, url_for
from PIL import Image, ImageEnhance, ImageFilter
import os
import re
import difflib
import easyocr
import numpy as np
import csv
import requests
import base64
from IPython.display import display, HTML
import ipywidgets as widgets
from io import BytesIO
from google.colab import userdata
import concurrent.futures
import threading

## Configuration

In [20]:
# Scratch directory for uploaded screenshots and their cropped sections.
# __file__ is not defined inside a notebook, so the path is anchored at the
# current working directory instead.
UPLOAD_FOLDER = os.path.join(os.getcwd(), 'uploads')
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Upper bound on the number of worker threads used when processing uploads
# (adjust based on your system).
MAX_WORKERS = 4

# Shared EasyOCR reader slot plus the lock that guards it. The slot starts
# empty and is filled on demand by initialize_reader().
reader_lock = threading.Lock()
reader = None

# Crop rectangles for each screenshot layout preset. Every box is the 4-tuple
# handed to PIL's Image.crop, i.e. (left, upper, right, lower) in pixels.
CROP_BOX_PRESETS = {
    "edu_Phone": dict(
        item_name=(1079, 521, 1635, 578),
        buy_price=(1079, 662, 1635, 726),
        sell_price=(1079, 811, 1635, 876),
        avg_price=(1079, 963, 1635, 1030),
    ),
    "edu_PC": dict(
        item_name=(1, 84, 1000, 136),
        buy_price=(1, 160, 1000, 190),
        sell_price=(1, 218, 1000, 250),
        avg_price=(1, 278, 1000, 310),
    ),
    "edu_PC_Uncut": dict(
        item_name=(690, 307, 1325, 361),
        buy_price=(690, 390, 1325, 448),
        sell_price=(690, 480, 1325, 538),
        avg_price=(690, 570, 1325, 640),
    ),
}

## Helper Functions

In [21]:
def initialize_reader():
    """Return the shared EasyOCR reader, creating it on first use.

    Creation happens while holding ``reader_lock`` so that only one caller
    builds the reader; later callers get the existing instance back.

    Returns:
        The module-level ``reader`` object.

    Raises:
        Exception: whatever ``easyocr.Reader`` raised; the failure is printed
        and then re-raised unchanged.
    """
    global reader
    with reader_lock:
        # Fast path: somebody already built the reader.
        if reader is not None:
            return reader

        print("Initializing EasyOCR reader...")
        try:
            reader = easyocr.Reader(['en'])
        except Exception as e:
            print(f"Failed to initialize EasyOCR reader: {e}")
            raise
        print("EasyOCR reader initialized successfully!")
        return reader

def fuzzy_find_keyword(line, keywords, cutoff=0.7):
    """Find the first keyword that approximately matches a word of ``line``.

    The line is lower-cased and split on whitespace; each word is compared
    against ``keywords`` with ``difflib.get_close_matches``.

    Args:
        line: Text to scan.
        keywords: Candidate keywords to match against.
        cutoff: Minimum similarity ratio (0..1) passed to difflib.

    Returns:
        The best-matching keyword for the first word that has a match, or
        ``None`` when no word matches any keyword.
    """
    # Lazy generator: matching stops at the first word that yields a hit.
    hits_per_word = (
        difflib.get_close_matches(token, keywords, n=1, cutoff=cutoff)
        for token in line.lower().split()
    )
    return next((hits[0] for hits in hits_per_word if hits), None)

def get_price_from_lines(lines, keywords, cutoff=0.5):
    """Return the price that belongs to the first line mentioning a keyword.

    Each line is checked with ``fuzzy_find_keyword``. For a matching line the
    last number on that line is returned; if the line has no number, the last
    number on the following line is used instead.

    A "number" is a run of digits, dots and commas that contains at least one
    digit, so stray punctuation such as a sentence-ending "." is never
    returned as a price.

    Args:
        lines: Sequence of text lines.
        keywords: Keywords that identify the price line.
        cutoff: Similarity cutoff forwarded to ``fuzzy_find_keyword``.

    Returns:
        The number as a string, or ``None`` when nothing was found.
    """
    # At least one digit is required; "[\d.,]+" alone also matches "." / ",".
    number_pattern = re.compile(r'[\d.,]*\d[\d.,]*')

    for index, line in enumerate(lines):
        if not fuzzy_find_keyword(line, keywords, cutoff=cutoff):
            continue

        numbers_here = number_pattern.findall(line)
        if numbers_here:
            return numbers_here[-1]

        # Keyword line without a number: the value may sit on the next line.
        if index + 1 < len(lines):
            numbers_next = number_pattern.findall(lines[index + 1])
            if numbers_next:
                return numbers_next[-1]
    return None

def extract_first_number(text):
    """Return the first number found in ``text`` as a string.

    A number is a run of digits, dots and commas containing at least one
    digit. Requiring a digit keeps bare punctuation (for example the "." in
    "Buy Price. 1,234") from being returned as the price.

    Args:
        text: Text to search.

    Returns:
        The matched text, or ``''`` when ``text`` contains no number.
    """
    match = re.search(r'[\d.,]*\d[\d.,]*', text)
    return match.group(0) if match else ''

def extract_data(text):
    """Pick the item name out of OCR text.

    The text is split on newlines, blank lines are dropped, and the rest is
    passed through ``filter_ocr_lines``. The first surviving line is taken as
    the item name and printed for diagnostics.

    Args:
        text: OCR output for the item-name region.

    Returns:
        A 4-tuple ``(item_name, None, None, None)``; the item name is
        ``"Not found"`` when no line survives filtering.
    """
    stripped_pieces = (piece.strip() for piece in text.split('\n'))
    lines = filter_ocr_lines([piece for piece in stripped_pieces if piece])

    if lines:
        raw_item_name = lines[0]
    else:
        raw_item_name = "Not found"

    print(f"OCR raw item name: '{raw_item_name}'", flush=True)
    return raw_item_name, None, None, None

def filter_ocr_lines(lines):
    """Strip UI noise from OCR lines and drop lines that end up empty.

    For every line, in this order:
      1. rarity words (common, uncommon, rare, epic, legendary) are removed
         when they appear as whole words, ignoring case;
      2. "supply volume:" / "demand volume:" labels are removed together with
         the number that follows them;
      3. the remaining skip phrases are removed, ignoring case;
      4. whitespace runs are collapsed and the line is trimmed.

    Step 2 runs before step 3 on purpose: removing the bare label first would
    leave the volume number behind with nothing to anchor its removal.

    Args:
        lines: Iterable of OCR text lines.

    Returns:
        List of cleaned, non-empty lines in their original order.
    """
    rarity_keywords = ("common", "uncommon", "rare", "epic", "legendary")
    skip_keywords = (
        "supply volume:", "demand volume:", "inventory", "bazaar",
        "click to view details!", "bazaai",
    )

    rarity_patterns = [
        re.compile(r'\b' + re.escape(rarity) + r'\b', re.IGNORECASE)
        for rarity in rarity_keywords
    ]
    # Label plus its number; accepts both "1,234" and "1234".
    volume_pattern = re.compile(
        r'(?:supply volume:|demand volume:)\s*\d+(?:,\d+)*', re.IGNORECASE
    )
    skip_pattern = re.compile(
        '|'.join(re.escape(phrase) for phrase in skip_keywords), re.IGNORECASE
    )

    filtered = []
    for line in lines:
        cleaned = line.strip()
        for pattern in rarity_patterns:
            cleaned = pattern.sub('', cleaned)
        cleaned = volume_pattern.sub('', cleaned)
        cleaned = skip_pattern.sub('', cleaned)
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        if cleaned:  # Only keep non-empty lines
            filtered.append(cleaned)
    return filtered

def render_csv_table(csv_file):
    """Render a CSV file as an HTML table.

    The file is parsed with the ``csv`` module so quoted cells that contain
    commas stay in one column, and every cell is HTML-escaped before it is
    placed in the markup. The first non-empty row is rendered with ``<th>``
    cells, all later rows with ``<td>``. Completely empty rows are skipped.

    Args:
        csv_file: Path of the CSV file to render.

    Returns:
        HTML string with the table, or a "No CSV file found." notice when the
        path is not an existing file.
    """
    # Function-scope import: the notebook does not import html at file level.
    from html import escape

    if not os.path.isfile(csv_file):
        return "<b>No CSV file found.</b>"

    rows_html = []
    # newline='' is what the csv module expects for correct quoted-newline handling.
    with open(csv_file, newline='', encoding='utf-8') as f:
        for row in csv.reader(f):
            if not row:
                continue
            tag = "td" if rows_html else "th"
            cells = "".join(f"<{tag}>{escape(cell.strip())}</{tag}>" for cell in row)
            rows_html.append(f"<tr>{cells}</tr>")

    return (
        "<table border='1' style='border-collapse:collapse;'>"
        + "".join(rows_html)
        + "</table>"
    )

def round_price(price):
    """Round a price string to a whole number.

    Thousands separators (commas) are removed, the value is parsed as a float
    and rounded with the built-in ``round``.

    Args:
        price: Price text such as ``"1,234.6"``.

    Returns:
        The rounded value as a string. If the input cannot be converted, the
        input itself is returned, or ``''`` when the input is falsy.
    """
    try:
        without_separators = price.replace(',', '')
        numeric_value = float(without_separators)
        return str(int(round(numeric_value)))
    except Exception:
        # Unparseable input is passed through unchanged.
        return price if price else ''

## Main Processing Functions (Improved Parallel Version)

In [22]:
def process_single_image(filename, file_content, preset):
    """Run OCR on one uploaded screenshot and return its parsed values.

    The upload is written to ``UPLOAD_FOLDER``, each region of the selected
    crop preset is cut out, upscaled, converted to grayscale, denoised and
    contrast-boosted, then read with the shared EasyOCR reader. Temporary
    files are deleted before returning.

    Args:
        filename: Name of the uploaded file; used for the temp file name and
            in the generated HTML.
        file_content: Raw bytes of the uploaded image.
        preset: Key into ``CROP_BOX_PRESETS``; unknown keys fall back to
            ``'edu_Phone'``.

    Returns:
        dict with the keys
          * ``'filename'``: the input file name,
          * ``'html'``: HTML fragment with the OCR text and parsed values,
          * ``'data'``: dict with ``item_name``, ``buy_price``,
            ``sell_price`` and ``avg_price`` (prices rounded via
            ``round_price``),
          * ``'error'``: the error message when OCR failed, otherwise
            ``None``.

    Notes:
        When OCR fails, all prices are left empty. The error text is still
        shown in the HTML, but it is never parsed for numbers, so digits in
        an exception message cannot end up as prices.
    """
    # Function-scope import: the notebook does not import html at file level.
    from html import escape

    crop_boxes = CROP_BOX_PRESETS.get(preset, CROP_BOX_PRESETS['edu_Phone'])
    filepath = os.path.join(UPLOAD_FOLDER, filename)

    # Save the file temporarily
    with open(filepath, 'wb') as f:
        f.write(file_content)

    extracted_texts = {}
    cropped_filenames = {}
    error_message = None

    try:
        # Initialize reader if not already done
        current_reader = initialize_reader()
        if current_reader is None:
            raise Exception("EasyOCR reader not initialized")

        # The context manager closes the underlying file, so the temp file
        # can be deleted afterwards on every platform.
        with Image.open(filepath) as img:
            for key, box in crop_boxes.items():
                section = img.crop(box)
                section = section.resize((section.width * 2, section.height * 2), Image.LANCZOS)
                section = section.convert('L')
                section = section.filter(ImageFilter.MedianFilter(size=3))
                section = ImageEnhance.Contrast(section).enhance(2.5)
                cropped_filename = f"cropped_{key}_{os.path.splitext(filename)[0]}.png"
                cropped_filepath = os.path.join(UPLOAD_FOLDER, cropped_filename)
                section.save(cropped_filepath, format='PNG')
                cropped_filenames[key] = cropped_filename

                # The reader is shared between worker threads; serialize access.
                with reader_lock:
                    result = current_reader.readtext(np.array(section), detail=0, paragraph=True)
                extracted_texts[key] = " ".join(result).strip()
    except Exception as e:
        error_message = str(e)
        for key in crop_boxes:
            extracted_texts[key] = f"Error: {error_message}"

    # HTML for displaying the OCR results for this file. File name and OCR
    # text are escaped because both are arbitrary text.
    safe_filename = escape(filename)
    html_parts = []
    for key in ["item_name", "buy_price", "sell_price", "avg_price"]:
        if cropped_filenames.get(key):
            html_parts.append(f"<b>{key.replace('_', ' ').title()} ({safe_filename}):</b><br>")
            html_parts.append(f'<pre>OCR: {escape(extracted_texts.get(key, ""))}</pre><hr>')

    # Get OCR text for each field
    item_name_ocr = extracted_texts.get("item_name", "")
    buy_price_ocr = extracted_texts.get("buy_price", "")
    sell_price_ocr = extracted_texts.get("sell_price", "")
    avg_price_ocr = extracted_texts.get("avg_price", "")

    matched_item_name, _, _, _ = extract_data(item_name_ocr)

    def extract_last_number(text):
        # A number must contain a digit; bare "." or "," is not a price.
        matches = re.findall(r'[\d.,]*\d[\d.,]*', text)
        return matches[-1] if matches else ''

    if error_message is None:
        buy_price = extract_first_number(buy_price_ocr)
        sell_price = extract_first_number(sell_price_ocr)
        avg_price = extract_last_number(avg_price_ocr)
    else:
        # Never mine an error message for numbers.
        buy_price = sell_price = avg_price = ''

    matched_item_name_html = (
        f"<b>Item Name:</b> {escape(matched_item_name)}<br>" if item_name_ocr else ""
    )
    prices_html = f"""
    <b>Buy Price:</b> {buy_price or ''}<br>
    <b>Sell Price:</b> {sell_price or ''}<br>
    <b>7d Avg Price:</b> {avg_price or ''}<br>
    """

    html_parts.append(
        f"<hr><b>Results for {safe_filename}:</b><br>{matched_item_name_html}{prices_html}"
    )

    # Clean up the temporary files. Each file is removed independently so one
    # failure does not leave the others behind.
    temp_paths = [filepath] + [
        os.path.join(UPLOAD_FOLDER, cropped_file) for cropped_file in cropped_filenames.values()
    ]
    for temp_path in temp_paths:
        try:
            os.remove(temp_path)
        except OSError as e:
            print(f"Failed to delete temporary files: {e}", flush=True)

    return {
        'filename': filename,
        'html': "".join(html_parts),
        'data': {
            'item_name': matched_item_name,
            'buy_price': round_price(buy_price),
            'sell_price': round_price(sell_price),
            'avg_price': round_price(avg_price)
        },
        'error': error_message,
    }

def process_images(files_dict, preset='edu_Phone'):
    """Process uploaded images in parallel and append the results to the CSV.

    Args:
        files_dict: Value of the ``FileUpload`` widget. Both shapes are
            accepted: a mapping ``{filename: {'content': bytes, ...}}`` and a
            sequence of records that each carry ``'name'`` and ``'content'``.
        preset: Crop preset name forwarded to ``process_single_image``.

    Returns:
        HTML string: per-image results in upload order followed by the
        rendered CSV table, or a single error paragraph when the OCR reader
        could not be initialized.

    Side effects:
        Appends one row per successfully processed image to
        ``extracted_data.csv`` in the current working directory, writing the
        header first when the file does not exist yet. Results that report an
        ``'error'`` are shown in the HTML but not written to the CSV.
    """
    # Function-scope import: the notebook does not import html at file level.
    from html import escape

    # Initialize reader before parallel processing
    try:
        initialize_reader()
    except Exception as e:
        return f"<p style='color:red'>Error initializing OCR: {escape(str(e))}</p>"

    # Normalize the widget value to (filename, bytes) pairs.
    if hasattr(files_dict, 'items'):
        uploads = [(name, bytes(info['content'])) for name, info in files_dict.items()]
    else:
        uploads = [(info['name'], bytes(info['content'])) for info in files_dict]

    html_parts = []
    all_data = []

    # Process images in parallel; results are collected in submission order
    # so the HTML and the CSV rows follow the upload order.
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        submitted = [
            (
                name,
                executor.submit(
                    process_single_image,
                    filename=name,
                    file_content=content,
                    preset=preset,
                ),
            )
            for name, content in uploads
        ]

        for name, future in submitted:
            try:
                result = future.result()
            except Exception as e:
                html_parts.append(
                    f"<p style='color:red'>Error processing image ({escape(str(name))}): "
                    f"{escape(str(e))}</p>"
                )
                continue
            html_parts.append(result['html'])
            if not result.get('error'):
                all_data.append(result['data'])

    # Save all data to CSV
    csv_file = os.path.join(os.getcwd(), "extracted_data.csv")
    file_exists = os.path.isfile(csv_file)
    with open(csv_file, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        if not file_exists:
            writer.writerow(['Item Name', 'Buy Price', 'Sell Price', '7d Avg Price'])
        writer.writerows(
            [data['item_name'], data['buy_price'], data['sell_price'], data['avg_price']]
            for data in all_data
        )

    # Display CSV table
    return "".join(html_parts) + render_csv_table(csv_file)

## Notebook UI Components

In [23]:
# Upload control: accepts image files, several at a time.
uploader = widgets.FileUpload(accept='image/*', multiple=True)

# Dropdown offering every crop preset by name.
preset_selector = widgets.Dropdown(
    description='Crop Preset:',
    options=list(CROP_BOX_PRESETS),
    value='edu_Phone',
)

# Button that starts OCR processing, and the area all handlers write into.
process_btn = widgets.Button(description="Process Images")
output_area = widgets.Output()

# Buttons for the mprices.txt / CSV maintenance actions.
add_to_mprices_btn = widgets.Button(description="Add to mprices.txt")
clear_csv_btn = widgets.Button(description="Clear CSV")
sync_github_btn = widgets.Button(description="Sync with GitHub")
publish_mprices_btn = widgets.Button(description="Publish mprices.txt")

## Button Handlers

In [24]:
def on_process_click(b):
    """Handle the "Process Images" button.

    Clears the output area, then either asks the user to upload images or
    runs ``process_images`` on the uploaded files with the selected preset
    and shows the resulting HTML.

    Args:
        b: The clicked button (unused).
    """
    with output_area:
        output_area.clear_output()

        if not uploader.value:
            display(HTML("<p>Please upload some images first.</p>"))
            return

        display(HTML("<h3>Processing images...</h3>"))
        # The whole widget value is handed over; process_images unpacks it.
        result_html = process_images(uploader.value, preset_selector.value)
        display(HTML(result_html))


process_btn.on_click(on_process_click)

def _apply_price_updates(mprices_lines, latest_prices):
    """Rewrite the price entry of every line whose item has a new price.

    Args:
        mprices_lines: Lines of mprices.txt, including line endings.
        latest_prices: Mapping of lower-cased item name to
            ``(item_name, buy, sell)``.

    Returns:
        ``(new_lines, updated_keys)``: the rewritten lines and the set of
        lower-cased item names that were updated.
    """
    entry_pattern = re.compile(r'\["([^"]+)",\s*[\d,]+,\s*[\d,]+\]')
    new_lines = []
    updated_keys = set()
    for line in mprices_lines:
        match = entry_pattern.search(line)
        key = match.group(1).strip().lower() if match else None
        if key in latest_prices:
            item_name, buy, sell = latest_prices[key]
            # Replace only the matched entry so any prefix and any trailing
            # text on the line are preserved exactly.
            line = (
                f'{line[:match.start()]}["{item_name}", {buy}, {sell}]{line[match.end():]}'
            )
            updated_keys.add(key)
        new_lines.append(line)
    return new_lines, updated_keys


def add_to_mprices(b):
    """Handle the "Add to mprices.txt" button.

    Copies buy/sell prices from ``extracted_data.csv`` into the matching
    entries of the local ``mprices.txt``. Items are matched by name, ignoring
    case; only entries that already exist in mprices.txt are updated.

    Behavior details:
      * If mprices.txt is missing it is downloaded from GitHub first. When
        the download fails nothing is written, so an empty file can never be
        created and published later by mistake.
      * When the CSV contains an item more than once, the last row wins (the
        CSV is appended to, so later rows are newer).
      * Rows whose buy or sell value is not made of digits and commas are
        skipped; writing them would produce entries the matching pattern no
        longer recognizes.

    Args:
        b: The clicked button (unused).
    """
    # Function-scope import: the notebook does not import html at file level.
    from html import escape

    with output_area:
        output_area.clear_output()
        csv_file = os.path.join(os.getcwd(), "extracted_data.csv")
        mprices_file = os.path.join(os.getcwd(), "mprices.txt")
        github_url = "https://raw.githubusercontent.com/CraftersMC-Guides-Project/guides-code/main/market/mprices.txt"

        # Check if mprices.txt exists, if not, download it from GitHub
        if not os.path.exists(mprices_file):
            display(HTML("<p>mprices.txt not found locally. Attempting to download from GitHub...</p>"))
            try:
                response = requests.get(github_url, timeout=30)
                response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            except requests.exceptions.RequestException as e:
                display(HTML(
                    f"<p>Failed to download mprices.txt from GitHub: {escape(str(e))}<br>"
                    "No local file was created and nothing was updated.</p>"
                ))
                return
            with open(mprices_file, 'w', encoding='utf-8') as f:
                f.write(response.text)
            display(HTML("<p>Downloaded mprices.txt from GitHub successfully.</p>"))

        # Read all lines from mprices.txt
        try:
            with open(mprices_file, encoding='utf-8') as f:
                mprices_lines = f.readlines()
        except Exception as e:
            display(HTML(f"<p>Error reading mprices.txt: {escape(str(e))}</p>"))
            return

        # Read new prices from extracted_data.csv (last row per item wins)
        latest_prices = {}
        skipped_names = []
        if os.path.isfile(csv_file):
            with open(csv_file, newline='', encoding='utf-8') as fin:
                for row in csv.reader(fin):
                    if not row or not row[0] or row[0].lower().startswith("item name"):
                        continue
                    item_name = row[0].strip()
                    buy = row[1].strip() if len(row) > 1 else ''
                    sell = row[2].strip() if len(row) > 2 else ''
                    if re.fullmatch(r'[\d,]+', buy) and re.fullmatch(r'[\d,]+', sell):
                        latest_prices[item_name.lower()] = (item_name, buy, sell)
                    else:
                        skipped_names.append(item_name)

        updated_lines, updated_keys = _apply_price_updates(mprices_lines, latest_prices)

        # Write back to mprices.txt
        try:
            with open(mprices_file, 'w', encoding='utf-8') as f:
                f.writelines(updated_lines)
        except Exception as e:
            display(HTML(f"<p>Error writing to mprices.txt: {escape(str(e))}</p>"))
            return

        report = [f"<p>Updated {len(updated_keys)} item(s) in mprices.txt</p>"]
        not_found = [
            name for key, (name, _, _) in latest_prices.items() if key not in updated_keys
        ]
        if not_found:
            report.append(
                "<p>Not found in mprices.txt (left unchanged): "
                + escape(", ".join(not_found)) + "</p>"
            )
        if skipped_names:
            report.append(
                "<p>Skipped rows with missing or invalid prices: "
                + escape(", ".join(skipped_names)) + "</p>"
            )
        display(HTML("".join(report)))


add_to_mprices_btn.on_click(add_to_mprices)

def clear_csv(b):
    """Handle the "Clear CSV" button.

    If ``extracted_data.csv`` exists in the current working directory, its
    contents are replaced by the header row only. A confirmation message is
    shown in either case.

    Args:
        b: The clicked button (unused).
    """
    header_row = 'Item Name,Buy Price,Sell Price,7d Avg Price\n'

    with output_area:
        output_area.clear_output()
        # __file__ is not defined in a notebook; the CSV lives in the working directory.
        csv_file = os.path.join(os.getcwd(), "extracted_data.csv")
        csv_present = os.path.isfile(csv_file)
        if csv_present:
            # Truncate the file but keep the header row.
            with open(csv_file, 'w', encoding='utf-8') as f:
                f.write(header_row)
        display(HTML("<p>CSV cleared</p>"))


clear_csv_btn.on_click(clear_csv)

def sync_github(b):
    """Handle the "Sync with GitHub" button.

    Downloads ``market/mprices.txt`` from the guides-code repository and
    overwrites the local ``mprices.txt`` in the current working directory
    with it.

    The ``GITHUB_TOKEN`` Colab secret is optional here: when it is available
    it is sent as an Authorization header, otherwise the request is made
    anonymously.

    Args:
        b: The clicked button (unused).
    """
    # Function-scope import: the notebook does not import html at file level.
    from html import escape

    with output_area:
        output_area.clear_output()
        github_url = "https://raw.githubusercontent.com/CraftersMC-Guides-Project/guides-code/main/market/mprices.txt"
        # In a Colab notebook, __file__ is not defined. Use the current working directory instead.
        mprices_file = os.path.join(os.getcwd(), "mprices.txt")

        # The token is read from Colab's secret store (not an environment
        # variable). NOTE(review): userdata.get is expected to raise when the
        # secret is missing or notebook access was not granted; confirm
        # against the Colab documentation. Either way the download can
        # proceed without a token.
        try:
            github_token = userdata.get('GITHUB_TOKEN')
        except Exception:
            github_token = None

        headers = {}
        if github_token:
            headers["Authorization"] = f"token {github_token}"

        try:
            # A timeout keeps the handler from hanging on a stalled connection.
            response = requests.get(github_url, headers=headers, timeout=30)
            response.raise_for_status()
            with open(mprices_file, 'w', encoding='utf-8') as f:
                f.write(response.text)
            display(HTML("<p>Synced with GitHub successfully</p>"))
        except Exception as e:
            display(HTML(f"<p>Failed to sync with GitHub: {escape(str(e))}</p>"))


sync_github_btn.on_click(sync_github)

def publish_mprices(b):
    """Handle the "Publish mprices.txt" button.

    Uploads the local ``mprices.txt`` to ``market/mprices.txt`` on the
    ``main`` branch of the guides-code repository through the GitHub contents
    API. The current blob SHA of the remote file is fetched first because the
    API requires it for updates.

    Nothing is published when the ``GITHUB_TOKEN`` Colab secret is
    unavailable, or when the local file is missing or empty (publishing an
    empty file would wipe the remote price list).

    Args:
        b: The clicked button (unused).
    """
    # Function-scope import: the notebook does not import html at file level.
    from html import escape

    with output_area:
        output_area.clear_output()
        repo = "CraftersMC-Guides-Project/guides-code"
        branch = "main"
        path = "market/mprices.txt"
        # In a Colab notebook, __file__ is not defined. Use the current working directory instead.
        local_file = os.path.join(os.getcwd(), "mprices.txt")

        # NOTE(review): userdata.get is expected to raise when the secret is
        # missing or notebook access was not granted; confirm against the
        # Colab documentation. Both cases are reported like a missing token.
        try:
            github_token = userdata.get('GITHUB_TOKEN')
        except Exception:
            github_token = None

        if not github_token:
            display(HTML("<p>No GitHub token found in Colab secret GITHUB_TOKEN.</p>"))
            return

        if not os.path.isfile(local_file) or os.path.getsize(local_file) == 0:
            display(HTML("<p>Local mprices.txt is missing or empty; nothing was published.</p>"))
            return

        api_url = f"https://api.github.com/repos/{repo}/contents/{path}"
        headers = {
            "Authorization": f"token {github_token}",
            "Accept": "application/vnd.github.v3+json"
        }

        try:
            # Read local file and encode as base64
            with open(local_file, "rb") as f:
                content = base64.b64encode(f.read()).decode("utf-8")

            # Get the SHA of the existing file on the target branch
            # (required for an update).
            r = requests.get(api_url, headers=headers, params={"ref": branch}, timeout=30)
            r.raise_for_status()
            sha = r.json()["sha"]

            data = {
                "message": "Update mprices.txt via Data Stripper",
                "content": content,
                "branch": branch,
                "sha": sha
            }

            # Update the file on GitHub
            r = requests.put(api_url, json=data, headers=headers, timeout=30)
            r.raise_for_status()
            display(HTML("<p>Published to GitHub successfully</p>"))
        except Exception as e:
            display(HTML(f"<p>Failed to publish to GitHub: {escape(str(e))}</p>"))


publish_mprices_btn.on_click(publish_mprices)

## Display the UI

In [25]:
# Render the interface: preset dropdown, uploader and process button stacked
# vertically, then the row of maintenance buttons, then the shared output area.
display(
    widgets.VBox(
        children=[
            preset_selector,
            uploader,
            process_btn,
            widgets.HBox(
                children=[
                    add_to_mprices_btn,
                    clear_csv_btn,
                    sync_github_btn,
                    publish_mprices_btn,
                ]
            ),
            output_area,
        ]
    )
)

VBox(children=(Dropdown(description='Crop Preset:', options=('edu_Phone', 'edu_PC', 'edu_PC_Uncut'), value='ed…