In [None]:
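# Optional: install the notebook's dependencies into the kernel's environment.
# Uncomment and run the line below if any of them are missing
# (%pip, unlike !pip, always targets the environment of the running kernel):
# %pip install pandas numpy ipywidgets scikit-learn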

In [None]:
import os
import json
import re
import pandas as pd
import numpy as np
from pathlib import Path

# sklearn imports are optional (TF-IDF search). We'll try to import and fall back gracefully.
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import linear_kernel
    SKLEARN_AVAILABLE = True
except Exception:
    SKLEARN_AVAILABLE = False

# Repository-relative input paths (adjust if your files live elsewhere).
# BASE is relative, so every path below is resolved against the kernel's
# current working directory at the time the file is opened.
BASE = Path('.')
# Ticker/master list read by load_data() (a JSON list, or a dict whose values are records).
TICKERS_PATH = BASE / 'FinancialWebScrapers' / 'tickers.json'
# Description source read by load_data(); its keys are upper-cased and looked up by ticker.
# NOTE(review): the file name suggests US-GAAP tag descriptions rather than
# per-company descriptions — confirm the keys really are tickers.
DESCRIPTIONS_PATH = BASE / 'FinancialWebScrapers' / 'us-gaap-descriptions.json'
# Optional secondary description source; load_data() reads its 'items' or 'companies' entries.
KEYATTR_PATH = BASE / 'FinancialWebScrapers' / 'keyAttributes.json'

In [None]:
def _load_json(path):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def _read_ticker_records(path):
    """Return the ticker records stored at ``path`` as a list (empty when the file is absent)."""
    if not path.exists():
        return []
    data = _load_json(path)
    if isinstance(data, dict):
        # Dict-of-records layout: the keys are ignored, only dict values are kept.
        return [value for value in data.values() if isinstance(value, dict)]
    if isinstance(data, list):
        return data
    return []


def _read_description_map(descriptions_path, keyattr_path):
    """Build an upper-cased ``key -> description`` mapping from the two optional JSON files.

    Entries from ``keyattr_path`` are applied last and therefore override
    entries with the same key from ``descriptions_path``.
    """
    desc_map = {}
    if descriptions_path.exists():
        loaded = _load_json(descriptions_path)
        if isinstance(loaded, dict):
            # Flat mapping; only string values are usable as descriptions.
            for key, value in loaded.items():
                if isinstance(value, str):
                    desc_map[str(key).upper()] = value
        elif isinstance(loaded, list):
            for item in loaded:
                if not isinstance(item, dict):
                    continue
                # common keys: 'ticker', 'symbol', 'description', 'text'
                key = item.get('ticker') or item.get('symbol') or item.get('ciK') or item.get('cik')
                desc = item.get('description') or item.get('text') or item.get('value')
                if key and desc and isinstance(desc, str):
                    desc_map[str(key).upper()] = desc

    if keyattr_path.exists():
        attrs = _load_json(keyattr_path)
        if isinstance(attrs, dict):
            # some formats may contain entries with 'ticker' and 'description'
            entries = attrs['items'] if 'items' in attrs else attrs.get('companies', [])
            if not isinstance(entries, (list, tuple)):
                # Unexpected payload shape: nothing usable rather than a TypeError.
                entries = []
            for entry in entries:
                if isinstance(entry, dict):
                    key = entry.get('ticker') or entry.get('symbol')
                    desc = entry.get('description')
                    if key and desc:
                        desc_map[str(key).upper()] = desc
    return desc_map


def _build_ticker_rows(tickers, desc_map):
    """Turn raw ticker records into row dicts, attaching descriptions from ``desc_map``."""
    rows = []
    for item in tickers:
        if not isinstance(item, dict):
            continue
        ticker = item.get('ticker') or item.get('symbol') or item.get('Ticker') or item.get('Symbol')
        if ticker is None:
            # Last resort: use the first key that looks like a ticker (short, upper-case).
            for key in item:
                if isinstance(key, str) and len(key) <= 5 and key.isupper():
                    ticker = key
                    break
        name = item.get('name') or item.get('company') or item.get('longName') or item.get('Company Name')
        exchange = item.get('exchange') or item.get('Exchange') or item.get('market')
        desc = desc_map.get(str(ticker).upper()) if ticker else None
        if not desc:
            # No mapped description: fall back to one embedded in the record itself.
            desc = item.get('description') or item.get('text')
        rows.append({
            'ticker': str(ticker).upper() if ticker else None,
            'name': name,
            'exchange': exchange,
            'description': desc,
        })
    return rows


def load_data(tickers_path=None, descriptions_path=None, keyattr_path=None):
    """Load the company table from the repository JSON files.

    Parameters
    ----------
    tickers_path, descriptions_path, keyattr_path : path-like, optional
        Locations of the ticker list, the description mapping and the key
        attributes file. Each defaults to the corresponding module constant
        (``TICKERS_PATH``, ``DESCRIPTIONS_PATH``, ``KEYATTR_PATH``). Missing
        files are skipped silently.

    Returns
    -------
    pandas.DataFrame
        Columns ``ticker``, ``name``, ``exchange`` and ``description`` with a
        fresh 0..n-1 index. Rows are restricted to those whose ``exchange``
        contains "NYSE" (case-insensitive); when no row matches, all rows are
        returned unfiltered. The frame is empty, but still has all four
        columns, when no input yields any row.
    """
    tickers_path = Path(TICKERS_PATH if tickers_path is None else tickers_path)
    descriptions_path = Path(DESCRIPTIONS_PATH if descriptions_path is None else descriptions_path)
    keyattr_path = Path(KEYATTR_PATH if keyattr_path is None else keyattr_path)

    tickers = _read_ticker_records(tickers_path)
    desc_map = _read_description_map(descriptions_path, keyattr_path)

    if tickers:
        rows = _build_ticker_rows(tickers, desc_map)
    else:
        # Fall back: create rows from desc_map only
        rows = [
            {'ticker': key, 'name': None, 'exchange': None, 'description': desc}
            for key, desc in desc_map.items()
        ]

    # Passing the columns explicitly keeps the schema intact even when `rows`
    # is empty; without it the column accesses below raise KeyError.
    df = pd.DataFrame(rows, columns=['ticker', 'name', 'exchange', 'description'])
    df['exchange'] = df['exchange'].fillna('').astype(str)
    df['ticker'] = df['ticker'].astype('string')
    df['name'] = df['name'].astype('string')
    df['description'] = df['description'].fillna('').astype('string')

    # Filter to NYSE listings (case-insensitive substring match)
    mask = df['exchange'].str.contains('NYSE', case=False, na=False)
    df_nyse = df[mask] if mask.any() else df
    return df_nyse.reset_index(drop=True)

In [None]:
# Load data (this will try repo JSON files and build a table).
# `df` is a notebook-level global: the search callback defined further down reads it directly.
df = load_data()
# load_data() only filters when at least one row's exchange contains "NYSE";
# otherwise the count below covers every loaded row.
print(f'Loaded {len(df)} companies (filtered to NYSE where available).')
# Preview the first rows as the cell's rich output.
df.head(5)

In [None]:
def simple_search(df, query, regex=False):
    """Case-insensitive keyword / regex search over ``df['description']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``description`` column.
    query : str
        Text to look for. An empty or ``None`` query yields an empty result.
    regex : bool, default False
        When False the query is matched literally (regex metacharacters are
        escaped); when True it is used as a regular expression.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows plus an integer ``matches`` column holding
        the number of non-overlapping occurrences, sorted by ``matches`` in
        descending order (ties keep their original row order). The input frame
        is not modified.

    Raises
    ------
    re.error
        If ``regex`` is True and ``query`` is not a valid pattern.
    """
    if not query:
        # Keep the schema uniform so callers can always select 'matches'.
        return df.iloc[0:0].assign(matches=0)

    pattern = str(query) if regex else re.escape(str(query))
    # A row matches exactly when its occurrence count is positive, so a single
    # regex pass provides both the filter and the ranking key.
    counts = df['description'].str.count(pattern, flags=re.IGNORECASE).fillna(0).astype(int)
    hit = counts > 0

    res = df[hit].copy()
    res['matches'] = counts[hit].values
    # Stable sort keeps equally-ranked rows in a deterministic order.
    return res.sort_values('matches', ascending=False, kind='stable')

def tfidf_search(df, query, top_n=50):
    """Rank rows by TF-IDF similarity between ``query`` and ``df['description']``.

    The vectorizer is fitted on the descriptions of ``df`` on every call, with
    English stop words removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``description`` column.
    query : str
        Free-text query.
    top_n : int, default 50
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        A copy of at most ``top_n`` rows with a positive similarity, plus a
        float ``score`` column, ordered by descending score (ties keep their
        original row order). Rows sharing no term with the query are left out,
        so an empty frame means "no match". An empty ``df`` or an empty
        ``query`` also yields an empty frame.

    Raises
    ------
    RuntimeError
        If scikit-learn could not be imported.
    ValueError
        Propagated from scikit-learn, e.g. when every description is empty or
        consists only of stop words (empty vocabulary).
    """
    if not SKLEARN_AVAILABLE:
        raise RuntimeError('scikit-learn is required for TF-IDF search')
    if df.empty or not query:
        # Nothing to fit or nothing to look for: return the empty result schema.
        return df.iloc[0:0].assign(score=0.0)

    docs = df['description'].fillna('').astype(str).tolist()
    vect = TfidfVectorizer(stop_words='english')
    tfidf = vect.fit_transform(docs)
    qv = vect.transform([str(query)])
    sim = linear_kernel(qv, tfidf).flatten()

    # Stable sort on the negated scores gives descending order with
    # deterministic tie-breaking (original row order).
    order = np.argsort(-sim, kind='stable')
    # A zero score means no shared term; such rows are not matches.
    order = order[sim[order] > 0][:top_n]

    res = df.iloc[order].copy()
    res['score'] = sim[order]
    return res

In [None]:
# Interactive widgets UI (ipywidgets)
from IPython.display import display
import ipywidgets as widgets

# Free-text query box; the search callback reads and strips `query.value`.
query = widgets.Text(description='Query', placeholder='Enter keyword or phrase')
# Search strategy selector; the search callback dispatches on `method.value`.
method = widgets.Dropdown(options=['Simple', 'Regex', 'TF-IDF'], value='Simple', description='Method')
# Caps the number of rows displayed and is passed to tfidf_search() as `top_n`.
top_n = widgets.IntSlider(value=20, min=1, max=200, description='Top')
search_btn = widgets.Button(description='Search', button_style='primary')
export_btn = widgets.Button(description='Export CSV')
# Scrollable output area for results and messages; the search callback also
# stores its latest result frame on this widget (`_last_results`) for export.
out = widgets.Output(layout={'max_height':'400px', 'overflow':'auto'})

def _do_search(_):
    """Run the selected search for the current query and render it into ``out``.

    Button click handler; the positional argument supplied by the button is
    unused. Reads the notebook-level ``df`` and the ``query``, ``method`` and
    ``top_n`` widgets. On success the complete result frame (not only the
    displayed rows) is stored on ``out._last_results`` so the export handler
    can write it.
    """
    out.clear_output()
    # Forget the previous result set first: after a blank query, a missing
    # dependency or an error, nothing stale must be left for export.
    out._last_results = None

    q = query.value.strip()
    if not q:
        with out:
            print('Please enter a query string')
        return
    try:
        if method.value in ('Simple', 'Regex'):
            res = simple_search(df, q, regex=(method.value == 'Regex'))
            display_cols = ['ticker', 'name', 'description', 'matches']
        else:
            if not SKLEARN_AVAILABLE:
                with out:
                    print('scikit-learn not available; install it to use TF-IDF search')
                return
            res = tfidf_search(df, q, top_n=top_n.value)
            display_cols = ['ticker', 'name', 'description', 'score']
        with out:
            if res.empty:
                print('No matches found')
            else:
                display(res[display_cols].head(top_n.value))
        # attach last results for export
        out._last_results = res
    except Exception as e:
        # Typical causes: an invalid regular expression or a scikit-learn error.
        with out:
            print('Search error:', e)

def _export(_):
    """Write the most recent search results to a CSV file and report the outcome in ``out``.

    Button click handler; the positional argument supplied by the button is
    unused. The frame is read from ``out._last_results``; when it is missing or
    empty nothing is written.
    """
    results = getattr(out, '_last_results', None)
    nothing_to_write = results is None or results.empty
    if not nothing_to_write:
        target = 'nyse_search_results.csv'
        # Written before entering the output context, exactly like the message order requires.
        results.to_csv(target, index=False)
        message = f'Exported {len(results)} rows to {target}'
    else:
        message = 'No results to export'
    with out:
        print(message)

# Register the click handlers on their buttons.
search_btn.on_click(_do_search)
export_btn.on_click(_export)

# Lay the controls out in a single row and show them above the output area.
ui = widgets.HBox([query, method, top_n, search_btn, export_btn])
display(ui, out)

**Usage notes**:
- Start by running the import/load cells (run the notebook top-to-bottom).
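- Use `Simple` for case-insensitive literal keyword matches, `Regex` for case-insensitive regular-expression patterns, and `TF-IDF` to rank descriptions by how strongly they share the query's weighted terms — a lexical rather than conceptual similarity (requires `scikit-learn`).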
- After running a search, click `Export CSV` to save the results to `nyse_search_results.csv` in the notebook's current working directory (an existing file with that name is overwritten).