# 🔧 Step 1: Install required libraries
This cell installs pandas, unidecode, and ipywidgets for cleaning and interactive input.


In [10]:
!pip install pandas unidecode ipywidgets
from google.colab import output
output.enable_custom_widget_manager()



# 📚 Step 2: Import libraries and setup
We import pandas, regex, unidecode, ipywidgets, and other helpers for cleaning company names.


In [11]:
import re
import pandas as pd
from unidecode import unidecode
import ipywidgets as widgets
from IPython.display import display, clear_output

# 🧹 Step 3: Define helper functions
Functions for text normalization, suffix removal, acronym generation, and deduplication.


In [12]:
STOPWORDS = {
    "the","of","and","for","to","a","an","in","on","at","by","with","from","into",
    "center","centers","foundation","society","group","project","partnership",
    "ministries","chamber","club"
}

TRAILING_CORP_SUFFIXES = (
    "inc","inc.","incorporated","llc","l.l.c.","ltd","ltd.","limited",
    "corp","corp.","corporation","co","co.","company","plc","gmbh",
)

def strip_trailing_corp_suffixes(name: str) -> str:
    s = name.strip()
    changed = True
    while changed:
        changed = False
        pattern = r"[,\.\s]*\b(?:{})\b\.?\s*$".format("|".join([re.escape(x) for x in TRAILING_CORP_SUFFIXES]))
        new_s = re.sub(pattern, "", s, flags=re.IGNORECASE)
        if new_s != s:
            s = new_s.strip()
            changed = True
    return s

def token_case_keep(token: str) -> str:
    if any(c.islower() for c in token) and any(c.isupper() for c in token):
        return token
    return token.capitalize()

def is_acronym_word(token: str) -> bool:
    return bool(re.fullmatch(r"[A-Z]{2,}s?", token))

def compress_acronym(token: str) -> str:
    return re.sub(r"s$", "", token)

def normalize_input(name: str) -> str:
    name = unidecode(name)
    name = name.replace("&", " and ")
    name = re.sub(r"[’']", "", name)
    name = re.sub(r"\s+", " ", name).strip()
    return name

def generate_short_name(original: str, style: str = "friendly", keep_full: int = 2, max_len=None) -> str:
    preserved_caps = re.findall(r"\b[A-Z]{2,}s?\b", original)

    name = normalize_input(original)
    name = strip_trailing_corp_suffixes(name)

    tokens_orig = re.findall(r"[A-Za-z0-9]+", name)
    tokens_lower = [t.lower() for t in tokens_orig]

    sig = [(o, l) for o, l in zip(tokens_orig, tokens_lower) if l not in STOPWORDS]
    if not sig:
        sig = [(o, l) for o, l in zip(tokens_orig, tokens_lower)]

    if style == "acronym":
        out = "".join(o[0].upper() for o, _ in sig if o)
    elif style == "initials":
        out = "".join(o[0].upper() for o, _ in sig if o)
    else:  # friendly
        pieces = []
        full_count = 0
        for o, _ in sig:
            if is_acronym_word(o):
                pieces.append(compress_acronym(o))
            elif full_count < keep_full:
                pieces.append(token_case_keep(o))
                full_count += 1
            else:
                pieces.append(o[0].upper())
        out = "".join(pieces)

    for caps in preserved_caps:
        comp = compress_acronym(caps)
        if comp and comp not in out:
            out += comp

    if len(sig) == 1 and style != "acronym":
        o = sig[0][0]
        if not is_acronym_word(o):
            out = token_case_keep(o)

    if max_len and len(out) > max_len:
        out = out[:max_len]

    return out

def make_unique(shorts):
    counts = {}
    result = []
    for s in shorts:
        if s in counts:
            counts[s] += 1
            new_s = f"{s}_{counts[s]}"
        else:
            counts[s] = 1
            new_s = s
        result.append(new_s)
    return result


# 📝 Step 4: Create input box for company names
This cell builds a text box, style options, and a button to generate short names interactively.

In [13]:
# Input area
text_area = widgets.Textarea(
    value="Paste company names here, one per line...",
    layout=widgets.Layout(width="800px", height="220px")
)

style_dropdown = widgets.Dropdown(
    options=["friendly", "acronym", "initials"],
    value="friendly",
    description="Style:"
)

keep_full_int = widgets.IntSlider(
    value=2, min=0, max=4, step=1,
    description='Keep full words:'
)

maxlen_int = widgets.IntText(
    value=20, description='Max len (0=None):'
)

dedupe_checkbox = widgets.Checkbox(
    value=True, description='Auto-dedupe duplicates'
)

generate_btn = widgets.Button(description="Generate Short Names", button_style="success")
output = widgets.Output()

def on_generate_click(b):
    with output:
        clear_output()
        raw_text = text_area.value.strip()
        if not raw_text:
            print("⚠️ Please enter company names above.")
            return
        companies = [line.strip() for line in raw_text.splitlines() if line.strip()]
        style = style_dropdown.value
        keep_full = keep_full_int.value
        max_len = maxlen_int.value if maxlen_int.value and maxlen_int.value > 0 else None
        shorts = [generate_short_name(c, style=style, keep_full=keep_full, max_len=max_len) for c in companies]
        if dedupe_checkbox.value:
            shorts = make_unique(shorts)
        df = pd.DataFrame({"Original Name": companies, "Short Name": shorts})
        display(df)

generate_btn.on_click(on_generate_click)

# Show UI
display(text_area, style_dropdown, keep_full_int, maxlen_int, dedupe_checkbox, generate_btn, output)


Textarea(value='Paste company names here, one per line...', layout=Layout(height='220px', width='800px'))

Dropdown(description='Style:', options=('friendly', 'acronym', 'initials'), value='friendly')

IntSlider(value=2, description='Keep full words:', max=4)

IntText(value=20, description='Max len (0=None):')

Checkbox(value=True, description='Auto-dedupe duplicates')

Button(button_style='success', description='Generate Short Names', style=ButtonStyle())

Output()