In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/raw-agent-data/all_agents_combined.csv
/kaggle/input/cleaned/df_clean.csv


In [3]:
import pandas as pd
import numpy as np
import json
import ast
import re
import math
from pathlib import Path
from typing import List, Dict, Any, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

SEED = 42

# Show full content of each column
pd.set_option("display.max_colwidth", None)

# Show all columns (no "..." truncation)
pd.set_option("display.max_columns", None)

In [3]:
df= pd.read_csv("/kaggle/input/raw-agent-data/all_agents_combined.csv")

In [4]:
df.shape

(356, 47)

In [5]:
df.isnull().sum()

state                          0
agentId                        0
partner                        0
name                           0
givenName                      0
email                          0
slug                           0
photoUrl                       3
phoneNumber                  190
adminOnlyPhoneNumber           0
servesOffers                   0
servesListings                 0
starRating                    88
numReviews                    84
pastYearDealsInRegion          0
pastYearDeals                 89
veteran                        0
isPremier                      0
languages                      0
complianceText               321
homeTransactionsLifetime     202
transactionVolumeLifetime    200
officeState                   74
jobTitle                       0
primaryServiceRegions          0
propertyTypes                  0
dealPrices                     0
agentType                      0
businessMarket                 0
businessMarketId               0
servicing 

In [6]:
# ================================
# PREPROCESS → df_pre  →  de_clean
# ================================
import pandas as pd, numpy as np, ast, json, re, math

# ------- helpers -------
def _parse_jsonish_lists(x):
    """Parse JSON/py-list-looking strings to Python lists safely."""
    if isinstance(x, list):
        return x
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []
    s = str(x).strip()
    if not s or s.lower() in {"nan","none","null","[]"}:
        return []
    # try literal_eval first (handles single quotes)
    try:
        v = ast.literal_eval(s)
        return v if isinstance(v, list) else [v]
    except Exception:
        pass
    # try JSON after minor cleanup
    try:
        s2 = re.sub(r",\s*([}\]])", r"\1", s.replace("'", '"'))
        v = json.loads(s2)
        return v if isinstance(v, list) else [v]
    except Exception:
        return [s]

def _ensure_list(v):
    if isinstance(v, list): return v
    if v is None or (isinstance(v, float) and np.isnan(v)): return []
    return [v] if not isinstance(v, str) else ([v.strip()] if v.strip() else [])

def _clean_prices(seq):
    """Filter to sane real-estate prices; drop junk/outliers."""
    if not isinstance(seq, list): return []
    out = []
    for val in seq:
        try:
            p = float(val)
            if 1_000 <= p <= 50_000_000 and math.isfinite(p):
                out.append(p)
        except Exception:
            continue
    return out

def preprocess_agents(df: pd.DataFrame) -> pd.DataFrame:
    """Robust preprocessing that prepares all features needed for baseline ranking."""
    df_pre = df.copy()

    # --- list-like columns ---
    for c in ["primaryServiceRegions","propertyTypes","dealPrices","languages"]:
        if c in df_pre.columns:
            df_pre[c] = df_pre[c].apply(_parse_jsonish_lists).apply(_ensure_list)
        else:
            df_pre[c] = [[]]*len(df_pre)

    # --- numeric typing ---
    for c in [
        "starRating","numReviews","pastYearDeals","pastYearDealsInRegion",
        "homeTransactionsLifetime","transactionVolumeLifetime"
    ]:
        if c in df_pre.columns:
            df_pre[c] = pd.to_numeric(df_pre[c], errors="coerce")
        else:
            df_pre[c] = np.nan

    # --- deal price stats ---
    prices = df_pre["dealPrices"].apply(_clean_prices)
    df_pre["dealPrices_count"]  = prices.apply(len)
    df_pre["dealPrices_median"] = prices.apply(lambda x: float(np.median(x)) if x else 0.0)
    df_pre["dealPrices_q25"]    = prices.apply(lambda x: float(np.percentile(x,25)) if x else 0.0)
    df_pre["dealPrices_q75"]    = prices.apply(lambda x: float(np.percentile(x,75)) if x else 0.0)
    df_pre["dealPrices_min"]    = prices.apply(lambda x: float(np.min(x))     if x else 0.0)
    df_pre["dealPrices_max"]    = prices.apply(lambda x: float(np.max(x))     if x else 0.0)
    df_pre["dealPrices_std"]    = prices.apply(lambda x: float(np.std(x))     if len(x)>1 else 0.0)

    # dispersion / span
    df_pre["price_range_span"] = df_pre["dealPrices_max"] - df_pre["dealPrices_min"]
    df_pre["price_coefficient_variation"] = np.where(
        df_pre["dealPrices_median"]>0,
        df_pre["dealPrices_std"]/df_pre["dealPrices_median"],
        0
    )

    # --- boolean normalization ---
    def _to_bool(s):
        return pd.Series(s).map({True:True, False:False, "True":True, "False":False, 1:True, 0:False, "1":True, "0":False}) \
                           .fillna(False).astype(bool)
    for c in ["partner","isPremier","servesOffers","servesListings","isActive","profileContactEnabled","veteran"]:
        if c in df_pre.columns:
            df_pre[c] = _to_bool(df_pre[c])
        else:
            df_pre[c] = False

    # --- counts & specialization ---
    df_pre["num_service_regions"]  = df_pre["primaryServiceRegions"].apply(len)
    df_pre["num_property_types"]   = df_pre["propertyTypes"].apply(len)
    df_pre["market_breadth_score"] = df_pre["num_service_regions"] * df_pre["num_property_types"]
    df_pre["specialization_index"] = np.where(df_pre["num_property_types"]>0, 1/df_pre["num_property_types"], 0)

    # --- fill basic text fields ---
    if "officeState" not in df_pre.columns: df_pre["officeState"] = np.nan
    if "brokerageName" not in df_pre.columns: df_pre["brokerageName"] = np.nan
    if "phoneNumber" not in df_pre.columns: df_pre["phoneNumber"] = np.nan
    df_pre["officeState"]   = df_pre["officeState"].fillna("Unknown")
    df_pre["brokerageName"] = df_pre["brokerageName"].fillna("Independent")
    df_pre["phoneNumber"]   = df_pre["phoneNumber"].fillna("Not Available")

    # --- core NaN fills ---
    df_pre["starRating"]    = df_pre["starRating"].fillna(0.0)
    df_pre["numReviews"]    = df_pre["numReviews"].fillna(0).astype(int)
    df_pre["pastYearDeals"] = df_pre["pastYearDeals"].fillna(0).astype(int)

    # --- impute lifetime counts/volume using price history when missing ---
    df_pre["homeTransactionsLifetime"] = df_pre["homeTransactionsLifetime"].fillna(df_pre["dealPrices_count"])
    calc_volume = prices.apply(lambda x: float(sum(x)) if x else 0.0)
    df_pre["transactionVolumeLifetime"] = df_pre["transactionVolumeLifetime"].fillna(calc_volume).fillna(0.0)

    # --- derived metrics used by the ranker ---
    denom = df_pre["homeTransactionsLifetime"].replace(0, np.nan)
    df_pre["avg_transaction_value"] = (df_pre["transactionVolumeLifetime"]/denom).fillna(0.0)
    df_pre["experience_score"]      = np.log1p(df_pre["homeTransactionsLifetime"])
    df_pre["recent_activity_ratio"] = np.where(df_pre["homeTransactionsLifetime"]>0,
                                               df_pre["pastYearDeals"]/df_pre["homeTransactionsLifetime"], 0.0)
    df_pre["weighted_rating"]       = df_pre["starRating"] * np.log1p(df_pre["numReviews"])
    df_pre["price_tier_percentile"] = (
        df_pre["dealPrices_median"].rank(pct=True) if df_pre["dealPrices_median"].max() > 0 else pd.Series(0.5, index=df_pre.index)
    )

    # Optional: drop the raw dealPrices list to save memory (we keep stats)
    # df_pre = df_pre.drop(columns=["dealPrices"], errors="ignore")

    return df_pre

# -------- run preprocessing on your raw df --------
# Assumes `df` already exists in memory (e.g., from pd.read_csv(...))
df_pre = preprocess_agents(df)

# -----------------------------------------
# FINAL TRIM: keep only columns needed for
# display + baseline ranking → de_clean
# -----------------------------------------
keep_cols = [
    # identifiers & display
    "agentId","slug","name","brokerageName","officeState","email","phoneNumber",
    "profileUrl","photoUrl",

    # core quality/performance
    "starRating","numReviews","pastYearDeals",
    "homeTransactionsLifetime","transactionVolumeLifetime","avg_transaction_value",
    "experience_score","recent_activity_ratio","weighted_rating",

    # price stats we use for price-fit & summaries
    "dealPrices_count","dealPrices_median","dealPrices_q25","dealPrices_q75",
    "dealPrices_min","dealPrices_max","dealPrices_std","price_range_span",
    "price_coefficient_variation","price_tier_percentile",

    # geo / specialization
    "primaryServiceRegions","num_service_regions",
    "propertyTypes","num_property_types",
    "market_breadth_score","specialization_index",

    # status flags
    "partner","isPremier","isActive","profileContactEnabled","servesOffers","servesListings",
]

# intersect with existing columns to be safe
keep_cols = [c for c in keep_cols if c in df_pre.columns]

# build the final, trimmed dataframe
df_clean = df_pre[keep_cols].copy()

# (optional) dedupe by agentId while keeping the best row (e.g., highest pastYearDeals)
if "agentId" in df_clean.columns:
    df_clean = (df_clean.sort_values(["agentId","pastYearDeals"], ascending=[True,False])
                        .drop_duplicates(subset=["agentId"], keep="first"))

print("✅ Preprocessing done.")
print("df_pre shape :", df_pre.shape)
print("df_clean shape:", df_clean.shape)


✅ Preprocessing done.
df_pre shape : (356, 65)
df_clean shape: (355, 40)


In [7]:
df_clean.isnull().sum()

agentId                        0
slug                           0
name                           0
brokerageName                  0
officeState                    0
email                          0
phoneNumber                    0
profileUrl                     0
photoUrl                       3
starRating                     0
numReviews                     0
pastYearDeals                  0
homeTransactionsLifetime       0
transactionVolumeLifetime      0
avg_transaction_value          0
experience_score               0
recent_activity_ratio          0
weighted_rating                0
dealPrices_count               0
dealPrices_median              0
dealPrices_q25                 0
dealPrices_q75                 0
dealPrices_min                 0
dealPrices_max                 0
dealPrices_std                 0
price_range_span               0
price_coefficient_variation    0
price_tier_percentile          0
primaryServiceRegions          0
num_service_regions            0
propertyTy

In [8]:

class AgentScorer:
    """
    Baseline, transparent scoring used in marketplaces:
    - Performance track record
    - Market expertise (geo/property/price)
    - Client satisfaction (ratings + review volume)
    - Professional standing (premier/active/team/brokerage)
    - Availability & responsiveness
    """

    def __init__(self):
        self.component_weights = {
            'performance_track_record':      0.35,
            'market_expertise':              0.25,
            'client_satisfaction':           0.20,
            'professional_standing':         0.12,
            'availability_responsiveness':   0.08
        }
        self.performance_subweights = {
            'recent_deals':     0.40,
            'total_experience': 0.25,
            'transaction_value':0.20,
            'consistency':      0.15
        }
        self.expertise_subweights = {
            'geographic_coverage':     0.35,
            'property_specialization': 0.30,
            'price_range_expertise':   0.35
        }
        self.satisfaction_subweights = {'rating_score':0.60,'review_volume':0.40}
        print("🎯 Scorer initialized with weights:", self.component_weights)

    # ---------- Performance (35%)
    def calculate_performance_score(self, df: pd.DataFrame) -> pd.Series:
        max_recent  = max(df['pastYearDeals'].max(), 1)
        max_volume  = max(df['transactionVolumeLifetime'].max(), 1)
        recent      = df['pastYearDeals'] / max_recent
        volume      = df['transactionVolumeLifetime'] / max_volume
        exp_norm    = (df['experience_score'] / df['experience_score'].max()) if df['experience_score'].max() > 0 else pd.Series(0, index=df.index)
        consistency = df['recent_activity_ratio'].clip(0, 1)

        return (
            self.performance_subweights['recent_deals']     * recent +
            self.performance_subweights['total_experience'] * exp_norm +
            self.performance_subweights['transaction_value']* volume +
            self.performance_subweights['consistency']      * consistency
        )

    # ---------- Market expertise (25%)
    def calculate_market_expertise_score(
        self, df: pd.DataFrame,
        target_regions: Optional[List[str]] = None,
        target_property_types: Optional[List[str]] = None,
        target_price_range: Optional[Tuple[float,float]] = None
    ) -> pd.Series:

        # Geo: partial string overlap matching; swap to Jaccard if you want strict set IoU. :contentReference[oaicite:3]{index=3}
        if target_regions:
            geo = df['primaryServiceRegions'].apply(lambda regs: self._region_overlap(regs, target_regions))
        else:
            geo = (df['num_service_regions'] / max(df['num_service_regions'].max(),1)).clip(0,1)

        # Property types
        if target_property_types:
            prop = df['propertyTypes'].apply(lambda types: self._property_overlap(types, target_property_types))
        else:
            # favor balanced specialization (middle of range)
            prop = 1 - (df['specialization_index'] - 0.5).abs()

        # Price expertise
        price = self._price_expertise(df, target_price_range) if target_price_range else (1 - df['price_coefficient_variation'].clip(0,1))

        return (
            self.expertise_subweights['geographic_coverage']     * geo +
            self.expertise_subweights['property_specialization'] * prop +
            self.expertise_subweights['price_range_expertise']   * price
        )

    # ---------- Client satisfaction (20%)
    def calculate_client_satisfaction_score(self, df: pd.DataFrame) -> pd.Series:
        rating = (df['starRating']/5.0).clip(0,1)
        max_rev = max(df['numReviews'].max(),1)
        vol    = np.sqrt(df['numReviews'] / max_rev)  # diminishing returns
        return self.satisfaction_subweights['rating_score']*rating + self.satisfaction_subweights['review_volume']*vol

    # ---------- Professional standing (12%)
    def calculate_professional_standing_score(self, df: pd.DataFrame) -> pd.Series:
        premier   = df['isPremier'].astype(float)
        active    = df['isActive'].astype(float)
        team      = df['partner'].astype(float)*0.5
        brokerage = (df['brokerageName']!='Independent').astype(float)*0.3
        score = 0.4*premier + 0.3*active + 0.2*team + 0.1*brokerage
        return np.minimum(1.0, score)

    # ---------- Availability (8%)
    def calculate_availability_score(self, df: pd.DataFrame) -> pd.Series:
        phone  = (df['phoneNumber']!='Not Available').astype(float)
        contact= df['profileContactEnabled'].astype(float)
        recent = (df['pastYearDeals']>0).astype(float)
        active = df['isActive'].astype(float)
        return 0.3*phone + 0.2*contact + 0.3*recent + 0.2*active

    # ---------- Total score + breakdown
    def calculate_total_score(
        self, df: pd.DataFrame,
        target_regions: Optional[List[str]]=None,
        target_property_types: Optional[List[str]]=None,
        target_price_range: Optional[Tuple[float,float]]=None
    ) -> Tuple[pd.Series, pd.DataFrame]:

        perf = self.calculate_performance_score(df)
        exp  = self.calculate_market_expertise_score(df, target_regions, target_property_types, target_price_range)
        sat  = self.calculate_client_satisfaction_score(df)
        prof = self.calculate_professional_standing_score(df)
        avail= self.calculate_availability_score(df)

        total = (
            self.component_weights['performance_track_record']    * perf +
            self.component_weights['market_expertise']            * exp  +
            self.component_weights['client_satisfaction']         * sat  +
            self.component_weights['professional_standing']       * prof +
            self.component_weights['availability_responsiveness'] * avail
        )

        breakdown = pd.DataFrame({
            'performance_score':  perf,
            'expertise_score':    exp,
            'satisfaction_score': sat,
            'professional_score': prof,
            'availability_score': avail,
            'total_score':        total
        })
        return total, breakdown

    # ---------- helpers
    def _region_overlap(self, agent_regions: List[str], target_regions: List[str]) -> float:
        if not agent_regions or not target_regions: return 0.0
        a = [r.lower().strip() for r in agent_regions]
        t = [r.lower().strip() for r in target_regions]
        # partial contains match
        hits = sum(1 for trg in t if any(trg in ar or ar in trg for ar in a))
        return min(1.0, hits / len(t))

    def _property_overlap(self, agent_types: List[str], target_types: List[str]) -> float:
        if not agent_types or not target_types: return 0.0
        a = [p.lower().strip() for p in agent_types]
        t = [p.lower().strip() for p in target_types]
        hits = sum(1 for trg in t if any(trg in ap or ap in trg for ap in a))
        return min(1.0, hits / len(t))

    def _price_expertise(self, df: pd.DataFrame, price_range: Tuple[float,float]) -> pd.Series:
        lo, hi   = price_range
        center   = (lo + hi) / 2
        span     = max(hi - lo, 1.0)
        def score_row(r):
            mn, mx, md = r['dealPrices_min'], r['dealPrices_max'], r['dealPrices_median']
            if md == 0: return 0.2
            if (mn <= lo) and (mx >= hi): return 1.0
            if lo <= md <= hi: return 0.9
            dist = abs(md - center) / (2*span)
            return max(0.1, 1 - dist)
        return df.apply(score_row, axis=1)


In [10]:

class AgentRecommendationEngine:
    def __init__(self, df: pd.DataFrame, scorer: AgentScorer):
        self.df = df
        self.scorer = scorer

    def _matches_regions(self, regions, targets):
        if not isinstance(regions, list): return False
        a = [r.lower().strip() for r in regions]
        t = [r.lower().strip() for r in targets]
        return any(any(trg in ar or ar in trg for ar in a) for trg in t)

    def _matches_property_types(self, types, targets):
        if not isinstance(types, list): return False
        a = [p.lower().strip() for p in types]
        t = [p.lower().strip() for p in targets]
        return any(any(trg in ap or ap in trg for ap in a) for trg in t)

    def filter_agents(
        self,
        target_regions: Optional[List[str]]=None,
        target_property_types: Optional[List[str]]=None,
        target_price_range: Optional[Tuple[float,float]]=None,
        min_rating: float = 0.0,
        min_reviews: int = 0,
        require_phone: bool = False,
        only_active: bool = True
    ) -> pd.DataFrame:

        df = self.df.copy()
        start = len(df)
        if only_active:
            df = df[df['isActive'] == True]
        if target_regions:
            df = df[df['primaryServiceRegions'].apply(lambda r: self._matches_regions(r, target_regions))]
        if target_property_types:
            df = df[df['propertyTypes'].apply(lambda t: self._matches_property_types(t, target_property_types))]
        if target_price_range:
            lo, hi = target_price_range
            mask = (
                (df['dealPrices_count'] > 0) &
                (
                    ((df['dealPrices_min'] <= hi) & (df['dealPrices_max'] >= lo)) |
                    ((df['dealPrices_median'] >= lo*0.5) & (df['dealPrices_median'] <= hi*2.0))
                )
            )
            df = df[mask]
        if min_rating > 0:
            df = df[df['starRating'] >= min_rating]
        if min_reviews > 0:
            df = df[df['numReviews'] >= min_reviews]
        if require_phone:
            df = df[df['phoneNumber'] != 'Not Available']
        print(f"🔍 Filtered {start:,} → {len(df):,}")
        return df

    def _create_agent_recommendation(
        self, row, rank, target_regions, target_property_types, target_price_range
    ):
        reasons = []
        if row['isPremier']: reasons.append("Premier agent")
        if row['pastYearDeals'] >= self.df['pastYearDeals'].quantile(0.8): reasons.append("High recent deal volume")
        if row['starRating'] >= 4.8 and row['numReviews'] >= 10: reasons.append("Excellent ratings with credible volume")
        if row['num_service_regions'] >= 5: reasons.append("Broad geographic coverage")

        # overlaps
        if target_regions:
            reasons.append(f"Regions matched: {', '.join(target_regions)}")
        if target_property_types:
            reasons.append(f"Property types matched: {', '.join(target_property_types)}")
        if target_price_range:
            reasons.append(f"Price fit around ${target_price_range[0]:,.0f}–${target_price_range[1]:,.0f}")

        return {
            "rank": rank,
            "agent_id": int(row['agentId']),
            "name": row['name'],
            "brokerage": row.get('brokerageName', 'Independent'),
            "overall_score": round(float(row['total_score']), 4),
            "star_rating": float(row['starRating']),
            "num_reviews": int(row['numReviews']),
            "past_year_deals": int(row['pastYearDeals']),
            "median_price": float(row['dealPrices_median']),
            "service_regions": row['primaryServiceRegions'],
            "property_types": row['propertyTypes'],
            "phone": row.get('phoneNumber', 'Not Available'),
            "email": row.get('email', ''),
            "profile": row.get('profileUrl', ''),
            "scores": {
                "performance":  round(float(row['performance_score']), 4),
                "expertise":    round(float(row['expertise_score']), 4),
                "satisfaction": round(float(row['satisfaction_score']), 4),
                "professional": round(float(row['professional_score']), 4),
                "availability": round(float(row['availability_score']), 4),
            },
            "why_recommended": reasons
        }

    def recommend_agents(
        self,
        target_regions: Optional[List[str]]=None,
        target_property_types: Optional[List[str]]=None,
        target_price_range: Optional[Tuple[float,float]]=None,
        top_k: int = 10,
        min_rating: float = 0.0,
        min_reviews: int = 0,
        require_phone: bool = False
    ) -> Dict[str, Any]:

        filt = self.filter_agents(
            target_regions, target_property_types, target_price_range,
            min_rating, min_reviews, require_phone, only_active=True
        )
        if len(filt) == 0:
            return {
                "query": {"regions":target_regions,"property_types":target_property_types,"price_range":target_price_range,"top_k":top_k},
                "total_matches": 0,
                "recommendations": [],
                "message": "No agents matched. Try relaxing filters."
            }

        total, breakdown = self.scorer.calculate_total_score(
            filt, target_regions, target_property_types, target_price_range
        )

        out = filt.reset_index(drop=True).copy()
        out = pd.concat([out, breakdown.reset_index(drop=True)], axis=1)
        out = out.sort_values("total_score", ascending=False).head(top_k)

        recs = []
        for i, (_, row) in enumerate(out.iterrows(), 1):
            recs.append(self._create_agent_recommendation(row, i, target_regions, target_property_types, target_price_range))

        return {
            "query": {"regions":target_regions,"property_types":target_property_types,"price_range":target_price_range,"top_k":top_k},
            "total_matches": int(len(filt)),
            "recommendations": recs,
            "scoring_methodology": self.scorer.component_weights
        }

In [11]:
scorer = AgentScorer()
engine = AgentRecommendationEngine(df_clean, scorer)

🎯 Scorer initialized with weights: {'performance_track_record': 0.35, 'market_expertise': 0.25, 'client_satisfaction': 0.2, 'professional_standing': 0.12, 'availability_responsiveness': 0.08}


In [12]:

TARGET_REGIONS       = ["Fort Myers","Naples"]        # county/cities/areas
TARGET_PROPERTYTYPES = ["Single Family Residential","Condo/Co-op"]
TARGET_PRICE_RANGE   = (300_000, 600_000)             # (min, max)
TOP_K                = 10

results = engine.recommend_agents(
    target_regions=TARGET_REGIONS,
    target_property_types=TARGET_PROPERTYTYPES,
    target_price_range=TARGET_PRICE_RANGE,
    top_k=TOP_K,
    min_rating=0.0,
    min_reviews=0,
    require_phone=False
)

# Show top-5 pretty
pd.DataFrame(results["recommendations"]).head()

# Save artifacts
Path("out").mkdir(exist_ok=True)
pd.DataFrame(results["recommendations"]).to_csv("out/agent_recommendations.csv", index=False)
df_clean.to_csv("out/agents_clean.csv", index=False)
print("✅ Saved: out/agent_recommendations.csv, out/agents_clean.csv")


🔍 Filtered 355 → 4
✅ Saved: out/agent_recommendations.csv, out/agents_clean.csv


In [13]:
pd.DataFrame(results["recommendations"]).head()

Unnamed: 0,rank,agent_id,name,brokerage,overall_score,star_rating,num_reviews,past_year_deals,median_price,service_regions,property_types,phone,email,profile,scores,why_recommended
0,1,24746,David Squires,Redfin,0.8615,4.9,51,19,387500.0,"[N Fort Myers, Lehigh Acres, Fort Myers, Fort Myers Beach, Bonita Springs-Estero, East Fort Myers]","[Single Family Residential, Condo/Co-op, Vacant Land, Mobile/Manufactured Home, Multi-Family (2-4 Unit)]",(239) 341-5997,david.squires@redfin.com,/real-estate-agents/david-squires,"{'performance': 0.8744, 'expertise': 0.825, 'satisfaction': 0.988, 'professional': 0.73, 'availability': 0.8}","[Premier agent, Excellent ratings with credible volume, Broad geographic coverage, Regions matched: Fort Myers, Naples, Property types matched: Single Family Residential, Condo/Co-op, Price fit around $300,000–$600,000]"
1,2,32591,Bruni Team - Partner Team,Southern Luxury Realty LLC,0.7028,4.5,12,12,222000.0,"[Cape Coral, N Fort Myers, Lehigh Acres, Bonita Springs-Estero, Naples, Marco Island, North Naples]","[Single Family Residential, Condo/Co-op, Vacant Land, Other, Mobile/Manufactured Home, Townhouse]",Not Available,partner-team-1715003328@redfintest.com,/real-estate-agents/bruni-team,"{'performance': 0.5667, 'expertise': 1.0, 'satisfaction': 0.734, 'professional': 0.43, 'availability': 0.7}","[Broad geographic coverage, Regions matched: Fort Myers, Naples, Property types matched: Single Family Residential, Condo/Co-op, Price fit around $300,000–$600,000]"
2,3,37611,The Anderson Team - Fort Myers - Partner Team,Sellstate 5 Star Realty,0.5727,5.0,18,8,33500.0,"[Cape Coral, N Fort Myers, Lehigh Acres, Fort Myers, East Fort Myers, Fort Myers Islands]","[Single Family Residential, Vacant Land, Townhouse, Condo/Co-op]",Not Available,partner-team-1742577788@redfintest.com,/real-estate-agents/the-anderson-team-fort-myers,"{'performance': 0.4344, 'expertise': 0.582, 'satisfaction': 0.8376, 'professional': 0.43, 'availability': 0.7}","[Excellent ratings with credible volume, Broad geographic coverage, Regions matched: Fort Myers, Naples, Property types matched: Single Family Residential, Condo/Co-op, Price fit around $300,000–$600,000]"
3,4,46712,Jude Paul,Suncoast Realty Solutions,0.5633,5.0,11,1,260000.0,"[Cape Coral, Lehigh Acres, Fort Myers, East Fort Myers]","[Single Family Residential, Other, Multi-Family (2-4 Unit), Condo/Co-op, Townhouse, Vacant Land]",Not Available,jude.paul@suncoastrs.com,/real-estate-agents/jude-paul,"{'performance': 0.2636, 'expertise': 0.825, 'satisfaction': 0.7858, 'professional': 0.43, 'availability': 0.7}","[Excellent ratings with credible volume, Regions matched: Fort Myers, Naples, Property types matched: Single Family Residential, Condo/Co-op, Price fit around $300,000–$600,000]"


In [14]:
# If you haven't already created `scorer`, do it now:
try:
    scorer
except NameError:
    scorer = AgentScorer()

# ---- set your query (edit these) ----
TARGET_REGIONS       = ["Fort Myers","Naples"]                 # or None
TARGET_PROPERTYTYPES = ["Single Family Residential","Condo/Co-op"]  # or None
TARGET_PRICE_RANGE   = (300_000, 600_000)                      # or None

# ---- compute scores & breakdown on your cleaned table ----
total_score, breakdown = scorer.calculate_total_score(
    df_clean,
    target_regions=TARGET_REGIONS,
    target_property_types=TARGET_PROPERTYTYPES,
    target_price_range=TARGET_PRICE_RANGE
)

# join scores back to agent identity columns
cols_id = [c for c in ["agentId","name","brokerageName","officeState","dealPrices_median","pastYearDeals","starRating","numReviews"] if c in df_clean.columns]
scored = pd.concat([df_clean[cols_id].reset_index(drop=True), breakdown.reset_index(drop=True)], axis=1)

# sort by total score (desc) and show top 20
scored_top = scored.sort_values("total_score", ascending=False).head(20)
scored_top

Unnamed: 0,agentId,name,brokerageName,officeState,dealPrices_median,pastYearDeals,starRating,numReviews,performance_score,expertise_score,satisfaction_score,professional_score,availability_score,total_score
6,1339,Lynn Ikle,Redfin,MD,450000.0,65,4.9,590,0.804851,0.65,0.988,0.73,0.8,0.793398
29,6645,Chelsea Traylor,Redfin,DC,575000.0,45,4.8,511,0.730064,0.65,0.948258,0.73,0.8,0.759274
23,5685,"Mehrnaz ""Mary"" Bazargan",Redfin,DC,557500.0,51,4.8,371,0.705981,0.65,0.893191,0.73,0.8,0.739831
41,9393,Mitch Toland Jr,Independent,MD,320000.0,58,4.9,390,0.693619,0.65,0.913212,0.7,0.8,0.735909
3,1144,Patricia Ammann,Redfin,VA,595000.0,38,4.8,475,0.666687,0.65,0.934906,0.73,0.8,0.734422
77,17547,Jonathan Scheffenacker,Independent,MD,353000.0,63,4.7,283,0.72006,0.65,0.84103,0.7,0.8,0.730727
102,22843,Michelle Palmquist,Redfin,OR,515000.0,66,5.0,164,0.712848,0.65,0.81089,0.73,0.8,0.725775
11,2025,Gus Sanchez,Redfin,OR,440000.0,55,4.9,234,0.64261,0.65,0.839908,0.73,0.8,0.706995
15,3089,Brandon Hoffman,Redfin,MD,410000.0,45,4.8,352,0.61527,0.65,0.884962,0.73,0.8,0.706437
5,1207,Rob Wittman,Redfin,VA,590000.0,50,4.8,286,0.630037,0.65,0.854495,0.73,0.8,0.705512


In [2]:
new_df = pd.read_csv("/kaggle/input/cleaned/df_clean.csv")

In [3]:
new_df.shape

(356, 41)

In [6]:
new_df.head()

Unnamed: 0,agent_id,slug,name,brokerage_name,office_state,email,phone_number,profile_url,photo_url,star_rating,num_reviews,past_year_deals,past_year_deals_in_region,home_transactions_lifetime,transaction_volume_lifetime,avg_transaction_value,primary_service_regions,property_types,num_service_regions,num_property_types,deal_prices_count,deal_prices_median,deal_prices_q25,deal_prices_q75,deal_prices_min,deal_prices_max,deal_prices_std,price_range_span,price_coefficient_variation,price_tier_percentile,market_breadth_score,specialization_index,experience_score,recent_activity_ratio,weighted_rating,partner,is_premier,serves_offers,serves_listings,is_active,profile_contact_enabled
0,24746,david-squires,David Squires,Redfin,FL,david.squires@redfin.com,(239) 341-5997,/real-estate-agents/david-squires,https://ssl.cdn-redfin.com/system_files/images/24746/640x460/6_63.jpg,4.9,51,19,0,117.0,52659458.0,450080.837607,"['N Fort Myers', 'Lehigh Acres', 'Fort Myers', 'Fort Myers Beach', 'Bonita Springs-Estero', 'East Fort Myers']","['Single Family Residential', 'Condo/Co-op', 'Vacant Land', 'Mobile/Manufactured Home', 'Multi-Family (2-4 Unit)']",6,5,86,387500.0,302750.0,531380.0,140000.0,2900000.0,353690.279493,2760000.0,0.912749,0.570225,30,0.2,4.770685,0.162393,19.361094,False,True,True,True,True,False
1,37611,the-anderson-team-fort-myers,The Anderson Team - Fort Myers - Partner Team,Sellstate 5 Star Realty,FL,partner-team-1742577788@redfintest.com,Not Available,/real-estate-agents/the-anderson-team-fort-myers,https://ssl.cdn-redfin.com/system_files/images/37611/500x500/8_58.jpg,5.0,18,8,1,63.0,7623413.0,121006.555556,"['Cape Coral', 'N Fort Myers', 'Lehigh Acres', 'Fort Myers', 'East Fort Myers', 'Fort Myers Islands']","['Single Family Residential', 'Vacant Land', 'Townhouse', 'Condo/Co-op']",6,4,63,33500.0,17950.0,208250.0,4125.0,585000.0,139226.793229,580875.0,4.156024,0.205056,24,0.25,4.158883,0.126984,14.722195,True,False,True,True,True,True
2,21113,katherine-stobb,Kathy Stobb,Eventide Realty Services LLC,FL,kathy@eventiderealty.com,Not Available,/real-estate-agents/katherine-stobb,https://ssl.cdn-redfin.com/v594.0.0/images/serviceProvider/person-500x500.jpg,5.0,36,12,0,125.0,33524525.0,268196.2,"['Cape Coral', 'Lehigh Acres', 'FL - Alva & LaBelle']","['Vacant Land', 'Single Family Residential', 'Condo/Co-op', 'Mobile/Manufactured Home', 'Multi-Family (2-4 Unit)', 'Multi-Family (5+ Unit)', 'Other']",3,7,125,244000.0,152500.0,335000.0,20000.0,2100000.0,207710.996928,2080000.0,0.851275,0.293539,21,0.142857,4.836282,0.096,18.05459,True,False,True,True,True,True
3,46712,jude-paul,Jude Paul,Suncoast Realty Solutions,FL,jude.paul@suncoastrs.com,Not Available,/real-estate-agents/jude-paul,https://ssl.cdn-redfin.com/v594.0.0/images/serviceProvider/person-500x500.jpg,5.0,11,1,0,41.0,11325150.0,276223.170732,"['Cape Coral', 'Lehigh Acres', 'Fort Myers', 'East Fort Myers']","['Single Family Residential', 'Other', 'Multi-Family (2-4 Unit)', 'Condo/Co-op', 'Townhouse', 'Vacant Land']",4,6,41,260000.0,210000.0,340000.0,21500.0,674000.0,108284.2511,652500.0,0.416478,0.338483,24,0.166667,3.73767,0.02439,12.424533,True,False,True,True,True,True
4,61995,knowledge-base-team,Knowledge Base TEAM - Partner Team,Knowledge Base Real Estate,FL,partner-team-1741383035@redfintest.com,Not Available,/real-estate-agents/knowledge-base-team,https://ssl.cdn-redfin.com/system_files/images/61995/500x500/8_34.jpg,4.9,174,1,0,1.0,70000.0,70000.0,"['Cape Coral', 'N Fort Myers', 'Lehigh Acres', 'Fort Myers', 'Fort Myers Beach', 'Bonita Springs-Estero', 'Ave Maria', 'East Fort Myers', 'North Naples']",['Condo/Co-op'],9,1,1,70000.0,70000.0,70000.0,70000.0,70000.0,0.0,0.0,0.0,0.210674,9,1.0,0.693147,1.0,25.307451,True,False,True,True,True,True


In [4]:
new_df.duplicated(subset='agent_id').sum()

1

In [5]:
import pandas as pd

# assume new_df is your dataframe
# also assume the column primary_service_regions contains lists or strings representing regions

def get_unique_service_regions(df):
    # Drop NA/null entries
    series = df['primary_service_regions'].dropna()

    # If entries are lists, else if strings with delimiters
    # First convert all to strings (if mixed types), then attempt to split
    processed = ( series
                  .astype(str)
                  # remove unwanted characters (like [ ] ' " etc.)
                  .str.replace(r'[\[\]\'\"\{\}]', '', regex=True)
                  # split on commas or slashes if many regions in one string
                  .str.split(r'[\,\/]')
                  .explode()
                  .str.strip()
                  .loc[lambda x: x != '']  # remove empty strings
                )

    unique_regions = processed.unique().tolist()
    return unique_regions

# usage:
unique_regions = get_unique_service_regions(new_df)
print("Number of unique primary_service_regions:", len(unique_regions))
print(unique_regions)


Number of unique primary_service_regions: 851
['N Fort Myers', 'Lehigh Acres', 'Fort Myers', 'Fort Myers Beach', 'Bonita Springs-Estero', 'East Fort Myers', 'Cape Coral', 'Fort Myers Islands', 'FL - Alva & LaBelle', 'Ave Maria', 'North Naples', 'Naples', 'Marco Island', 'Immokalee', 'Charlotte County Central', 'Punta Gorda', 'Charlotte County East', 'Charlotte County West', 'Auburndale and Winter Haven', 'Tampa-Northwest Suburbs', 'SW Pasco', 'North Pinellas County', 'Central Pinellas-Largo', 'Pinellas Park', 'Seminole', 'Tampa-North Central Suburbs', 'Central Pinellas-Clearwater', 'Safety Harbor', 'Belleair', 'SE Pasco', 'Hernando County Spring Hill', 'Hernando County Brooksville', 'S Central Pasco', 'N Central Pasco', 'NW Pasco', 'NE Pasco', 'West Polk County', 'Tampa-Downtown', 'Midtown', 'Rocky Point', 'St. Petersburg', 'Lake Worth & Lantana', 'Boynton', 'DeLand & Deltona', 'Polk County Southeast', 'Tampa-New Tampa', 'North Sarasota County', 'West Manatee', 'Lakewood Ranch', 'South

In [7]:
import pandas as pd

def get_unique_region_state(df):
    """Return list of unique strings like "region, state" coming from
    df.primary_service_regions + df.office_state."""
    entries = []
    
    for _, row in df.iterrows():
        regions = row['primary_service_regions']
        state = row['office_state']
        
        if pd.isna(regions) or pd.isna(state):
            continue
        
        # Clean state
        state_str = str(state).strip()
        # Regions might be a list or string
        if isinstance(regions, list):
            region_list = regions
        else:
            # convert to string & clean, then split on commas/slashes if multiple
            region_str = str(regions)
            region_str = region_str.replace('[','').replace(']','')
            region_str = region_str.replace("'", "").replace('"','')
            # split on commas or slashes
            region_list = [r.strip() for r in re.split(r'[\,\/]', region_str) if r.strip()]
        
        for reg in region_list:
            entries.append(f"{reg}, {state_str}")
    
    # unique
    unique_entries = list(dict.fromkeys(entries))  # preserves order
    return unique_entries

# usage:
import re
unique_region_state = get_unique_region_state(new_df)
print("Number of unique region,state pairs:", len(unique_region_state))
print(unique_region_state)


Number of unique region,state pairs: 1391
['N Fort Myers, FL', 'Lehigh Acres, FL', 'Fort Myers, FL', 'Fort Myers Beach, FL', 'Bonita Springs-Estero, FL', 'East Fort Myers, FL', 'Cape Coral, FL', 'Fort Myers Islands, FL', 'FL - Alva & LaBelle, FL', 'Ave Maria, FL', 'North Naples, FL', 'Naples, FL', 'Marco Island, FL', 'Immokalee, FL', 'Charlotte County Central, FL', 'Punta Gorda, FL', 'Charlotte County East, FL', 'Charlotte County West, FL', 'Auburndale and Winter Haven, FL', 'Tampa-Northwest Suburbs, Unknown', 'SW Pasco, Unknown', 'North Pinellas County, Unknown', 'Central Pinellas-Largo, Unknown', 'Pinellas Park, Unknown', 'Seminole, Unknown', 'Tampa-North Central Suburbs, Unknown', 'Central Pinellas-Clearwater, Unknown', 'Safety Harbor, Unknown', 'Belleair, Unknown', 'SE Pasco, FL', 'Tampa-North Central Suburbs, FL', 'Hernando County Spring Hill, FL', 'Hernando County Brooksville, FL', 'S Central Pasco, FL', 'N Central Pasco, FL', 'SW Pasco, FL', 'North Pinellas County, FL', 'NW Pasc