<a href="https://colab.research.google.com/github/Suguna-Yagnamurthy/python-congress-name-mathcing-task/blob/main/Suguna_Yagnamurthy_python_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#cloning and using the data set from github
!git clone https://github.com/kiss-oliver/python-name-matching.git
%cd python-name-matching
!ls -la

Cloning into 'python-name-matching'...
remote: Enumerating objects: 47, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (36/36), done.[K
remote: Total 47 (delta 9), reused 40 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (47/47), 492.16 KiB | 7.03 MiB/s, done.
Resolving deltas: 100% (9/9), done.
/content/python-name-matching
total 20
drwxr-xr-x 4 root root 4096 Oct  6 09:03 .
drwxr-xr-x 1 root root 4096 Oct  6 09:03 ..
drwxr-xr-x 2 root root 4096 Oct  6 09:03 data
drwxr-xr-x 8 root root 4096 Oct  6 09:03 .git
-rw-r--r-- 1 root root 2526 Oct  6 09:03 README.md


In [4]:
%%writefile name_matcher.py
#importing different modules used in this task
import difflib #Fuzzy search
import glob
import os
import re
import sys
import unicodedata

import pandas as pnd
# Minimum similarity score (0-100) a fuzzy candidate must reach to be accepted.
# Kept high for a tighter match and to minimise false matches.
FUZZY_SCORE_THRESHOLD = 90

#Creating a safe string output instead of Nan/None
def safe_str(x):
    """Convert a scalar to text, mapping pandas missing values to "".

    Args:
        x: Any scalar value (string, number, None, NaN, ...).

    Returns:
        str: "" when ``pnd.isna(x)`` is true, otherwise ``str(x)``.
    """
    return "" if pnd.isna(x) else str(x)

#normalize a raw name into a lowercase, punctuation-free key; empty or missing values yield an empty string
def normalize_name(raw):
    """Reduce a raw name to a lowercase, punctuation-free, space-separated key.

    Steps, in order:
      1. Missing or blank input yields "".
      2. Apostrophe-like characters (ASCII, typographic/curly, modifier
         apostrophe, backtick, acute accent mark) are deleted, so every spelling
         of "O'Brien" becomes "obrien" instead of sometimes "o brien".
      3. Accented letters are folded to their base letters (NFKD decomposition
         with combining marks dropped), so accented and unaccented spellings
         of the same name produce the same key.
      4. Whitespace runs collapse to a single space.
      5. "Last, First ..." is flipped to "First ... Last": the text before the
         first comma is treated as the last name.
      6. Every remaining non-word character becomes a space.
      7. Tokens are lower-cased and generational suffixes are dropped wherever
         they appear (after the flip a suffix may sit mid-name).

    Args:
        raw: Any value; NaN/None are treated as empty via ``safe_str``.

    Returns:
        str: The normalized name, or "" when nothing usable remains.
    """
    simp = safe_str(raw).strip()
    if simp == "":
        return ""

    # Delete apostrophes before accent folding: NFKD would otherwise turn the
    # stand-alone acute accent (U+00B4) into a space and split the name.
    apostrophes = "'\u2018\u2019\u02bc`\u00b4"
    simp = simp.translate({ord(ch): None for ch in apostrophes})

    # Fold accents; NFKD also maps the non-breaking space to a plain space.
    decomposed = unicodedata.normalize("NFKD", simp)
    simp = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Normalize spaces but keep commas, which separate last name from the rest.
    simp = re.sub(r"\s+", " ", simp).strip()

    # Flip "Last name, First name". A string containing a comma always splits
    # into at least two fields, so no single-field fallback is needed.
    if "," in simp:
        last, *rest = (part.strip() for part in simp.split(","))
        simp = (" ".join(rest) + " " + last).strip()

    # Any remaining punctuation (periods, hyphens, quotes, ...) becomes a space.
    simp = re.sub(r"[^\w\s]", " ", simp)

    # Tokenize (split() also absorbs repeated spaces) and drop common suffixes.
    suffixes = {"jr", "sr", "ii", "iii", "iv", "v"}
    tokens = [tok for tok in simp.lower().split() if tok not in suffixes]
    return " ".join(tokens)

#return the first and last tokens, dropping middle names; a single-word name yields (token, token)
def first_last(norm):
    """Split a normalized name into its first and last tokens.

    Middle tokens are ignored. A one-word name is returned as both first and
    last; an empty string yields a pair of empty strings.

    Args:
        norm: Normalized, space-separated name.

    Returns:
        tuple[str, str]: (first token, last token).
    """
    if norm == "":
        return "", ""
    tokens = norm.split()
    # For a single token, index 0 and index -1 are the same element.
    return tokens[0], tokens[-1]

#similarity (0 to 100) after sorting each name's tokens alphabetically, so word order does not matter
def token_sort_similarity(a, b):
    """Score two names after putting each one's tokens in alphabetical order.

    Sorting the tokens first makes the comparison insensitive to word order
    ("john smith" vs "smith john").

    Args:
        a: First space-separated name.
        b: Second space-separated name.

    Returns:
        int: difflib similarity ratio scaled to 0-100 and rounded.
    """
    ordered_a, ordered_b = (" ".join(sorted(text.split())) for text in (a, b))
    ratio = difflib.SequenceMatcher(None, ordered_a, ordered_b).ratio()
    return int(round(ratio * 100))

#fuzzy similarity value between 0 to 100
def simple_ratio(a, b):
    """Score two strings with difflib's SequenceMatcher, as given.

    Args:
        a: First string.
        b: Second string.

    Returns:
        int: Similarity ratio scaled to 0-100 and rounded.
    """
    matcher = difflib.SequenceMatcher(None, a, b)
    return int(round(matcher.ratio() * 100))

#choosing the best match: the higher of the plain and token-sorted similarity scores
def best_similarity(a, b):
    """Return the higher of the plain and the token-sorted similarity scores.

    Args:
        a: First normalized name.
        b: Second normalized name.

    Returns:
        int: The larger of ``simple_ratio(a, b)`` and
        ``token_sort_similarity(a, b)``.
    """
    direct = simple_ratio(a, b)
    reordered = token_sort_similarity(a, b)
    return direct if direct >= reordered else reordered

#finding name,combining first and last name and avoiding non-name strings
def detect_name_from_row(row):
    """Pick the most plausible personal name out of one CSV row.

    Resolution order:
      1. A non-empty column labelled exactly "name".
      2. A first-name column plus a last-name column, joined as "First Last".
      3. The first non-empty column whose label merely contains "name".
      4. The first non-empty value that looks like a name: it contains a space
         and has at most five whitespace-separated words.

    The first/last combination is tried before the generic "contains 'name'"
    scan because labels such as "first_name" contain "name" themselves;
    scanning first would return the first name alone and the combination
    would never be reached.

    Column labels are compared case-insensitively, with spaces and hyphens
    treated as underscores ("First Name" is handled like "first_name").

    Args:
        row: pandas Series for one record, indexed by column label.

    Returns:
        str: The detected raw name, or "" if nothing suitable is found.
    """
    first_labels = {"first_name", "firstname", "given_name"}
    last_labels = {"last_name", "lastname", "surname", "family_name"}

    # Read each cell once: (canonical label, stripped text), in column order.
    cells = []
    for col in row.index:
        label = "_".join(str(col).lower().replace("-", " ").split())
        cells.append((label, safe_str(row[col]).strip()))

    # 1. A dedicated full-name column wins outright.
    for label, val in cells:
        if label == "name" and val != "":
            return val

    # 2. Separate first/last columns: both must be present and non-empty.
    fval = next((val for label, val in cells if label in first_labels and val != ""), "")
    lval = next((val for label, val in cells if label in last_labels and val != ""), "")
    if fval != "" and lval != "":
        return (fval + " " + lval).strip()

    # 3. Any other name-like column (also covers a lone first or last name).
    for label, val in cells:
        if "name" in label and val != "":
            return val

    # 4. Fallback: the first value that at least has the shape of a name.
    for _, val in cells:
        if val != "" and (" " in val) and (len(val.split()) <= 5):
            return val
    return ""


#function that reads the primary file i.e. Congress members
def read_primary(primary_path):
    """Load the primary (congress members) CSV with every column as text.

    Blank cells are kept as empty strings rather than NaN.

    Args:
        primary_path: Path to the primary CSV file.

    Returns:
        pandas.DataFrame: The file contents, with at least one data row.

    Raises:
        FileNotFoundError: If ``primary_path`` is not an existing file.
        ValueError: If the file parses but contains no data rows.
    """
    if os.path.isfile(primary_path):
        members = pnd.read_csv(primary_path, dtype=str, keep_default_na=False)
        if len(members.index) > 0:
            return members
        raise ValueError("Primary dataset is empty: " + primary_path)
    raise FileNotFoundError("Primary dataset not found: " + primary_path)

def read_elections(data_dir):
    """
    Read all 'congressional_elections_*.csv' files found directly in data_dir.

    Files are processed in sorted path order. Zero-byte files and files without
    data rows are skipped silently. Files that cannot be read or parsed are
    also skipped, but a warning naming the file is written to stderr so that
    missing election data does not go unnoticed.

    Args:
        data_dir: Directory containing the election CSV files.

    Returns:
        list[dict]: One entry per row with a usable name, with keys
          {name_raw, name_norm, year, file}. 'year' is the first four-digit
          run in the file name ("" if there is none); 'file' is the base name.
    """
    out = []
    pattern = os.path.join(data_dir, "congressional_elections_*.csv")
    for path in sorted(glob.glob(pattern)):
        base = os.path.basename(path)
        # Extract year if present
        m = re.search(r"(\d{4})", base)
        year = m.group(1) if m else ""

        try:
            if os.path.getsize(path) == 0:
                continue
            df = pnd.read_csv(path, dtype=str, keep_default_na=False)
        except Exception as exc:
            # Best effort: keep going, but say which file was dropped and why.
            print("Warning: skipping unreadable election file " + base + ": " + str(exc),
                  file=sys.stderr)
            continue

        # Skip CSV files that have a header but no rows
        if df.shape[0] == 0:
            continue

        # For each row, normalize the name and save it with its year and file.
        for _, row in df.iterrows():
            raw = detect_name_from_row(row)
            if raw.strip() == "":
                continue
            norm = normalize_name(raw)
            if norm == "":
                continue
            out.append({"name_raw": raw, "name_norm": norm, "year": year, "file": base})
    return out

def build_indexes(candidates):
    """
    Build the two lookup structures used during matching.

    Args:
        candidates: Iterable of candidate dicts, each with a "name_norm" key.

    Returns:
        tuple: (exact_set, by_last) where
          - exact_set is the set of all normalized names, for exact lookups
          - by_last maps the last token of each normalized name to the list of
            candidate dicts sharing it, in input order
    """
    exact_set, by_last = set(), {}
    for entry in candidates:
        normalized = entry["name_norm"]
        exact_set.add(normalized)
        surname = first_last(normalized)[1]
        by_last.setdefault(surname, []).append(entry)
    return exact_set, by_last

def fuzzy_match(member_norm, by_last):
    """
    Find the closest candidate that shares the member's last name.

    Only candidates filed under the same last token are compared. Each is
    scored with best_similarity; the first candidate reaching the highest
    score wins, and it is accepted only if that score is at least
    FUZZY_SCORE_THRESHOLD.

    Args:
        member_norm: Normalized member name.
        by_last: Mapping of last name -> list of candidate dicts.

    Returns:
        tuple: (candidate dict, score) on success, otherwise (None, 0).
    """
    # Nothing to compare for an empty name.
    if member_norm == "":
        return None, 0

    surname = first_last(member_norm)[1]
    # Skip if no candidate shares the last name.
    if surname not in by_last:
        return None, 0

    scored = [
        (best_similarity(member_norm, cand["name_norm"]), cand)
        for cand in by_last[surname]
    ]

    top_score, top_cand = 0, None
    for score, cand in scored:
        # Strict comparison keeps the earliest candidate on ties.
        if score > top_score:
            top_score, top_cand = score, cand

    if top_cand is None or top_score < FUZZY_SCORE_THRESHOLD:
        return None, 0
    return top_cand, top_score

#main function

def main():
    """Match every congress member against election candidates and write a CSV.

    Usage: python name_matcher.py [data_dir] [primary_file] [output_file]

    Each member is first looked up by exact normalized name; if that fails, a
    fuzzy match restricted to the same last name is attempted. The result
    table is sorted by match type (exact, fuzzy, none), then by descending
    confidence, then by display name, and written to ``output_file``. A short
    summary is printed to stdout.
    """
    # Positional arguments override the defaults from left to right.
    defaults = ["data", "congress_members_with_parties.csv", "results_name_matching.csv"]
    supplied = sys.argv[1:4]
    data_dir, primary_file, output_file = supplied + defaults[len(supplied):]

    # Read data
    members = read_primary(os.path.join(data_dir, primary_file))
    candidates = read_elections(data_dir)
    exact_set, by_last = build_indexes(candidates)

    def as_display(normalized):
        # Capitalize every token of a normalized name for presentation.
        return " ".join(piece.capitalize() for piece in normalized.split())

    # Match
    records = []
    for _, member in members.iterrows():
        raw = detect_name_from_row(member) or safe_str(member.get("name", ""))
        norm = normalize_name(raw)
        disp = as_display(norm) if norm != "" else raw
        party = safe_str(member.get("party", ""))

        hit = None
        kind, score, notes = "none", 0, ""

        if norm != "" and norm in exact_set:
            # Exact: locate one matching candidate for the year/file details,
            # looking in the same-last-name bucket before the full list.
            surname = first_last(norm)[1]
            hit = next((c for c in by_last.get(surname, []) if c["name_norm"] == norm), None)
            if hit is None:
                hit = next((c for c in candidates if c["name_norm"] == norm), None)
            if hit is not None:
                kind, score = "exact", 100
        else:
            # Simple fuzzy
            hit, fuzzy_score = fuzzy_match(norm, by_last)
            if hit is None:
                notes = "No high-confidence match"
            else:
                kind, score = "fuzzy", int(fuzzy_score)

        if hit is None:
            m_raw = m_norm = m_disp = m_year = m_file = ""
        else:
            m_raw, m_norm = hit["name_raw"], hit["name_norm"]
            m_disp = as_display(m_norm)
            m_year, m_file = hit["year"], hit["file"]

        # Key order defines the column order of the output CSV.
        records.append({
            "member_name_original": raw,
            "member_name_normalized": norm,
            "member_name_display": disp,
            "party": party,
            "matched": "no" if kind not in ("exact", "fuzzy") else "yes",
            "match_type": kind,
            "confidence": int(score),
            "matched_candidate_name_original": m_raw,
            "matched_candidate_name_normalized": m_norm,
            "matched_candidate_name_display": m_disp,
            "election_year": safe_str(m_year),
            "election_file": safe_str(m_file),
            "notes": notes
        })

    # Final output: a temporary "rank" column drives the match-type ordering.
    rank_of = {"exact": 0, "fuzzy": 1, "none": 2}
    out = pnd.DataFrame(records).fillna("")
    out = (
        out.assign(rank=out["match_type"].map(lambda label: rank_of.get(label, 3)))
        .sort_values(by=["rank", "confidence", "member_name_display"],
                     ascending=[True, False, True])
        .drop(columns=["rank"])
    )
    out.to_csv(output_file, index=False)

    # Summary output for a quick overview of the task
    total = len(out)
    matched = int(out["matched"].eq("yes").sum())
    exact = int(out["match_type"].eq("exact").sum())
    fuzzy = int(out["match_type"].eq("fuzzy").sum())
    print("Done.")
    print("Total members processed:", total)
    print("Matched:", matched, "(Exact:", exact, ", Fuzzy:", fuzzy, ")")
    print("Unmatched:", total - matched)
    print("Output file:", output_file)

# Run the matcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()



Overwriting name_matcher.py


In [5]:
#total output
!python name_matcher.py

Done.
Total members processed: 2873
Matched: 1662 (Exact: 1098 , Fuzzy: 564 )
Unmatched: 1211
Output file: results_name_matching.csv
