# 🛡️ Insurance Company Classifier
This notebook builds a fast, TF-IDF-based classification pipeline for mapping companies to relevant insurance taxonomy labels based on their descriptions and metadata.

In [None]:
# --- Imports ---
import pandas as pd
import re
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## 📥 Load Data

In [None]:
# Read the two inputs from the sibling data directory:
#   - the insurance taxonomy (Excel workbook)
#   - the company records to be labelled (CSV)
taxonomy = pd.read_excel("../data/insurance_taxonomy.xlsx")
companies = pd.read_csv("../data/ml_insurance_challenge.csv")

## 🧹 Preprocessing

In [None]:
# Helper functions to clean and normalize text
def clean_text(text):
    """Normalize a single value into lowercase, space-separated alphanumeric text.

    Args:
        text: The value to normalize. Null values (``None``, ``NaN``) yield an
            empty string; non-string scalars are converted with ``str()``.

    Returns:
        The lowercased text with every character outside ``a-z``, ``0-9`` and
        whitespace replaced by a space, whitespace runs collapsed to a single
        space, and leading/trailing whitespace removed.
    """
    if pd.isnull(text):
        return ""
    # Coerce first: numeric cells or non-string tag entries have no .lower().
    text = str(text).lower()
    # Replace punctuation/symbols with spaces so adjacent words stay separated.
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Convert stringified lists to Python lists
def parse_tags(tag_str):
    """Parse a stringified Python list (e.g. ``"['a', 'b']"``) into a list.

    Args:
        tag_str: Text holding a Python list literal. A value that is already
            a list is accepted and returned as a shallow copy.

    Returns:
        The parsed list, or an empty list when the input is missing, is not a
        valid literal, or evaluates to something other than a list.
    """
    if isinstance(tag_str, list):
        return list(tag_str)
    try:
        tags = ast.literal_eval(tag_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises for malformed input;
        # unparseable tags are treated as "no tags" (best effort).
        return []
    return tags if isinstance(tags, list) else []

# Normalize the free-text description.
companies["description_clean"] = companies["description"].apply(clean_text)

# business_tags is parsed into a list first; each tag is then cleaned and the
# tags are flattened into one space-separated string.
parsed_tags = companies["business_tags"].apply(parse_tags)
companies["business_tags_clean"] = parsed_tags.apply(
    lambda tags: " ".join(clean_text(tag) for tag in tags)
)

# The remaining metadata columns all get the same plain-text cleaning.
for field in ("sector", "category", "niche"):
    companies[field + "_clean"] = companies[field].apply(clean_text)

# Build the single feature string: description followed by each cleaned
# metadata column, separated by one space.
combined = companies["description_clean"]
for field in ("business_tags", "sector", "category", "niche"):
    combined = combined + " " + companies[field + "_clean"]
companies["combined_text"] = combined

## 🧠 TF-IDF Vectorization and Similarity

In [None]:
# Unique, non-null taxonomy labels, normalized the same way as the company text
taxonomy_labels = taxonomy["label"].dropna().unique()
taxonomy_labels_clean = list(map(clean_text, taxonomy_labels))

# One corpus: company documents first, label documents appended after them
all_text = companies["combined_text"].tolist() + taxonomy_labels_clean

# A single fit over the whole corpus gives companies and labels the same
# vocabulary and IDF weights
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_matrix = vectorizer.fit_transform(all_text)

# Split the rows back apart at the company/label boundary
n_companies = len(companies)
company_tfidf = tfidf_matrix[:n_companies]
label_tfidf = tfidf_matrix[n_companies:]

# Rows are companies, columns are labels
similarity_matrix = cosine_similarity(company_tfidf, label_tfidf)

## 🎯 Predict Top Labels

In [None]:
# Number of best-matching labels to keep per company
top_k = 3

# argsort is ascending, so reversing each row puts the highest-similarity label
# index first; the first top_k columns are then the best matches. Slicing after
# the reversal (instead of `-top_k:`) also behaves correctly for top_k == 0.
top_labels = similarity_matrix.argsort(axis=1)[:, ::-1][:, :top_k]

# Map indices to label names.
# NOTE(review): names come from taxonomy_labels_clean, i.e. the normalized label
# text rather than the raw taxonomy strings -- confirm that is the intended output.
top_label_names = [[taxonomy_labels_clean[i] for i in row] for row in top_labels]

# Add one "top_label_<rank>" column per kept label. The column count follows the
# actual width of top_labels, so changing top_k (or having fewer than top_k
# labels in the taxonomy) no longer raises IndexError or drops labels.
for rank in range(top_labels.shape[1]):
    companies[f"top_label_{rank + 1}"] = [labels[rank] for labels in top_label_names]

# Comma-separated summary of every kept label, best match first
companies["insurance_label"] = [", ".join(labels) for labels in top_label_names]

## 💾 Export Annotated List

In [None]:
# Keep the original input columns plus the predicted label string, and write
# them out without the DataFrame index.
export_columns = [
    "description",
    "business_tags",
    "sector",
    "category",
    "niche",
    "insurance_label",
]
output_df = companies[export_columns]
output_df.to_csv("../data/annotated_company_list.csv", index=False)