In [2]:


from pathlib import Path
import os
import pandas as pd


BASE_DIR = "/kaggle/input/agent-finder-new"  

# Filenames (keep these as your uploaded CSV filenames)
AGENT_DETAILS_FILE = "agents_details_clean.csv"   # agent master (1 row per advertiser_id)
REVIEWS_FILE       = "agent_reviews_flat.csv"     # many rows per advertiser_id (one per review)
ALL_IDS_FILE       = "agent_advertiser_ids.csv"   # OPTIONAL: master list of advertiser_ids

# Helper to load CSV with consistent dtype for the key
def read_csv_keyed(path):
    return pd.read_csv(path, dtype={"advertiser_id": "string"})


In [3]:
details = read_csv_keyed(Path(BASE_DIR) / AGENT_DETAILS_FILE)
reviews = read_csv_keyed(Path(BASE_DIR) / REVIEWS_FILE)

# OPTIONAL: If you maintain a master list of advertiser IDs, load it too.
all_ids_path = Path(BASE_DIR) / ALL_IDS_FILE
all_ids = None
if all_ids_path.exists():
    all_ids = pd.read_csv(all_ids_path, dtype={"advertiser_id": "string"})
    if "advertiser_id" not in all_ids.columns:
        # try to auto-normalize if the column name differs
        cand = [c for c in all_ids.columns if "advertiser" in c.lower() and "id" in c.lower()]
        if cand:
            all_ids = all_ids.rename(columns={cand[0]: "advertiser_id"})
        else:
            all_ids = all_ids.rename(columns={all_ids.columns[0]: "advertiser_id"})
    all_ids["advertiser_id"] = all_ids["advertiser_id"].astype("string")

# Ensure key dtypes
details["advertiser_id"] = details["advertiser_id"].astype("string")
reviews["advertiser_id"] = reviews["advertiser_id"].astype("string")


In [4]:
# Build the complete list of advertiser_ids we care about:
if all_ids is not None:
    # union of ids from details and master ids file
    full_ids = pd.Series(
        pd.unique(pd.concat([details["advertiser_id"], all_ids["advertiser_id"]], ignore_index=True)),
        name="advertiser_id"
    )
else:
    # if you don't have a separate master list, just use the agent details list
    full_ids = details["advertiser_id"].dropna().drop_duplicates().reset_index(drop=True)

# Make a guaranteed "full_agents" master with one row per advertiser_id
full_agents = pd.DataFrame({"advertiser_id": full_ids}).merge(details, on="advertiser_id", how="left")


In [5]:
# A right (or full outer) style merge on advertiser_id:
# - All reviews get their agent details repeated
# - Agents with no reviews appear once with NaN review fields
merged = reviews.merge(full_agents, on="advertiser_id", how="right", indicator=True)

# `_merge` is useful for checking which rows came from each side; keep for now or drop later:
# merged = merged.drop(columns=["_merge"])


In [6]:
n_review_rows = len(reviews)
n_agents_total = full_agents["advertiser_id"].nunique()
n_agents_with_reviews = reviews["advertiser_id"].nunique()
n_agents_without_reviews = n_agents_total - n_agents_with_reviews

print({
    "review_rows_in_reviews_csv": n_review_rows,
    "unique_agents_in_details_or_master_ids": n_agents_total,
    "agents_with_at_least_one_review": n_agents_with_reviews,
    "agents_with_zero_reviews": n_agents_without_reviews,
    "expected_final_rows": n_review_rows + n_agents_without_reviews,
    "actual_final_rows": len(merged),
})


{'review_rows_in_reviews_csv': 7621, 'unique_agents_in_details_or_master_ids': 900, 'agents_with_at_least_one_review': 421, 'agents_with_zero_reviews': 479, 'expected_final_rows': 8100, 'actual_final_rows': 8100}


In [7]:
merged.shape

(8100, 71)

In [8]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
import warnings
warnings.filterwarnings('ignore')

In [9]:
merged.head()

Unnamed: 0,advertiser_id,review_id,rating,comment,display_name,describe_yourself,location,year,transaction_date,started_timestamp,source_id,responsiveness,negotiation_skills,professionalism_communication,market_expertise,link,reply,id,full_name,first_name,last_name,state,usps,first_year,first_month,role,agent_type,is_realtor,person_name,party_id,review_count,agent_rating,recommendations_count,for_sale_price.min,for_sale_price.max,for_sale_price.count,recently_sold.min,recently_sold.max,recently_sold.count,for_sale_price.last_listing_date,recently_sold.last_sold_date,mls_primary_abbrev,mls_abbrevs_joined,mls_licenses_joined,mls_history_joined,mls_history_last_inactivated_abbrev,mls_history_last_inactivated_date,specializations_joined,designations_joined,user_languages,served_areas_as_name_state,marketing_area_cities_as_city_state,zips,office.name,office.fulfillment_id,office_address_joined,office_phone_primary_number,office_phone_primary_type,office_phone_numbers_joined,office_licenses_joined,office_license_states_joined,broker.name,broker.fulfillment_id,phone_primary_number,phone_primary_type,phone_numbers_joined,phone_display_joined,web_url,photo.href,href,_merge
0,3322174,2e598706-de90-466a-af79-a997cba17fd7,5.0,"We enjoyed working with Michelle enormously. She was very professional and responsive to our needs. Her suggestions and follow thru were spot on. We would, and do, recommend her highly.",Cathy,SELLER,"Wasilla, AK",2019.0,,2019-06-26T19:45:46.956Z,RDC,5.0,5.0,5.0,5.0,,,5a7bd38ca5b955001010c2e2,Michelle Crew,,,Alaska,AK,2018.0,1.0,agent,"seller, buyer",True,Michelle Crew,433688402,1,5.0,9,495000,995000,3,0,0,20,2025-10-09T08:17:43Z,2025-09-26,FAR_19911990,FAR_19911990,130203,,,,"Buyers, Sellers, Buyer's agent, Seller's agent",,,"Wasilla_AK, Palmer_AK, Trapper Creek_AK, Willow_AK, Big Lake_AK, Eagle River_AK, Mat-Su Valley, Anchorage","Knik-Fairview_AK, North Lakes_AK, Meadow Lakes_AK, Wasilla_AK, Fishhook_AK, Tanaina_AK, South Lakes_AK, Lakes_AK, Gateway_AK, Willow_AK, Point MacKenzie_AK, Meadow Lake_AK, Houston_AK, Susitna_AK, Big Lake_AK, Farm Loop_AK, Butte_AK, Buffalo Soapstone_AK, Lazy Mountain_AK, Knik River_AK, Sutton-Alpine_AK, Palmer_AK, Petersville_AK, Trapper Creek_AK, Chase_AK, Susitna North_AK, Anchorage_AK, Eagle River_AK","99654, 99623, 99645, 99683, 99688, 99652, 99577",Jack White Real Estate Mat Su,72104,"865 N Seward Meridian Pkwy Suite 200, <NA>, Wasilla, AK 99654, US",(907) 376-2414,Office,(907) 376-2414 (Office),,,Jack White Real Estate - Broker,4845839,(907) 521-6474,Mobile,(907) 521-6474,(907) 521-6474 (Mobile),https://www.realtor.com/realestateagents/Michelle-Crew_Wasilla_AK_3322174_697969698,https://ap.rdcpix.com/fec1e58babac89685390b7376a044f12a-e3977415033s.jpg,www.michellecrew.com,both
1,3123020,,,,,,,,,,,,,,,,,59380ce27a9ec40011f7dc17,Liz Steele,,,Alaska,AK,2009.0,6.0,agent,"buyer, seller",True,Liz Steele,2485273,0,0.0,0,85000,1161000,21,0,0,0,2025-10-09T04:40:28Z,,FAR_19911990,FAR_19911990,17817,,,,"Listing Specialist, Seller's agent, New Construction",,,"Anchorage_AK, Girdwood_AK, Eagle River_AK, Chugiak_AK, Wasilla_AK, Palmer_AK, Alaska","Anchorage_AK, Girdwood_AK, Eagle River_AK, Chugiak_AK, Knik-Fairview_AK, North Lakes_AK, Meadow Lakes_AK, Wasilla_AK, Fishhook_AK, Tanaina_AK, South Lakes_AK, Lakes_AK, Gateway_AK, Farm Loop_AK, Butte_AK, Buffalo Soapstone_AK, Lazy Mountain_AK, Knik River_AK, Sutton-Alpine_AK, Palmer_AK, Willow_AK, Point MacKenzie_AK, Meadow Lake_AK, Houston_AK, Susitna_AK, Big Lake_AK","99501, 99502, 99503, 99504, 99507, 99508, 99515, 99516, 99517, 99518, 99587, 99577, 99567, 99654, 99645, 99623",Keller Williams Realty Alaska Group,2996072,"101 W BENSON BLVD STE 503, <NA>, ANCHORAGE, AK 99503, US",(907) 864-6500,Mobile,(907) 864-6500 (Mobile),,,Keller Williams Realty Alaska - Broker,3122926,(907) 360-3821,Mobile,(907) 360-3821,(907) 360-3821 (Mobile),https://www.realtor.com/realestateagents/Liz-Steele_ANCHORAGE_AK_3123020_088879998,https://ap.rdcpix.com/0ec45d72e952d35c7e533891fd0a4981a-c2021894702s.jpg,http://teamdimmick.com,right_only
2,3704659,9522c15d-5252-4630-96b8-348a85cc0a37,5.0,Rachel provides the best service in representing you. She started with a solid marketing plan and implemented it clearly! \n\nShe got the best price for our home in a competitive market! I highly recommend her!,Don,SELLER,"Wasilla, AK",2023.0,,2023-08-30T10:56:57,RDC,5.0,5.0,5.0,5.0,,,5c7ad42b01d70e001215ce3a,Rachel Cunha,,,Alaska,AK,2017.0,1.0,agent,"buyer, seller",True,Rachel Cunha,391201526,2,5.0,1,25000,230000,4,0,0,20,2025-10-09T04:22:52Z,2025-10-03,FAR_19911990,FAR_19911990,121803,,,,"Selling and Listing homes, Lake Front homes, Alaska Cabins, Buyer's agent, Seller's agent","SFR, C2EX",,"Wasilla_AK, Palmer_AK, Anchorage_AK, Eagle River_AK, Big Lake_AK, Chugiak_AK, Matanuska - Susitna Borough, Municipality of Anchorage","Knik-Fairview_AK, North Lakes_AK, Meadow Lakes_AK, Wasilla_AK, Fishhook_AK, Tanaina_AK, South Lakes_AK, Lakes_AK, Gateway_AK, Farm Loop_AK, Butte_AK, Buffalo Soapstone_AK, Lazy Mountain_AK, Knik River_AK, Sutton-Alpine_AK, Palmer_AK, Anchorage_AK, Eagle River_AK, Willow_AK, Point MacKenzie_AK, Meadow Lake_AK, Houston_AK, Susitna_AK, Big Lake_AK, Chugiak_AK","99654, 99645, 99501, 99577, 99504, 99623, 99652, 99502, 99518, 99515, 99567",Elite Real Estate Group Palmer,3645658,"125 W EVERGREEN AVE STE 201, <NA>, PALMER, AK 99645, USA",(907) 746-7890,Office,(907) 746-7890 (Office),,,Elite Real Estate Group Palmer - Broker,3672558,(907) 521-8635,Mobile,(907) 521-8635,(907) 521-8635 (Mobile),https://www.realtor.com/realestateagents/Rachel-Cunha_PALMER_AK_3704659_691879698,https://ap.rdcpix.com/b72f4eb076c935c483a940c19157a2b5a-e3852147312s.jpg,http://rachelcunha.eliterealestatealaska.com,both
3,3704659,8eae5d62-8913-449d-b8d8-36f4489750f1,5.0,Rachel was an amazing realtor. She was very responsive to all our needs and always acted quickly. She was able to get us into our home under the exact conditions and price that we wanted. Couldn't ask for anything more!,Ed,BUYER,"Palmer, AK",2023.0,,2023-06-14T08:15:37,RDC,5.0,5.0,5.0,5.0,,,5c7ad42b01d70e001215ce3a,Rachel Cunha,,,Alaska,AK,2017.0,1.0,agent,"buyer, seller",True,Rachel Cunha,391201526,2,5.0,1,25000,230000,4,0,0,20,2025-10-09T04:22:52Z,2025-10-03,FAR_19911990,FAR_19911990,121803,,,,"Selling and Listing homes, Lake Front homes, Alaska Cabins, Buyer's agent, Seller's agent","SFR, C2EX",,"Wasilla_AK, Palmer_AK, Anchorage_AK, Eagle River_AK, Big Lake_AK, Chugiak_AK, Matanuska - Susitna Borough, Municipality of Anchorage","Knik-Fairview_AK, North Lakes_AK, Meadow Lakes_AK, Wasilla_AK, Fishhook_AK, Tanaina_AK, South Lakes_AK, Lakes_AK, Gateway_AK, Farm Loop_AK, Butte_AK, Buffalo Soapstone_AK, Lazy Mountain_AK, Knik River_AK, Sutton-Alpine_AK, Palmer_AK, Anchorage_AK, Eagle River_AK, Willow_AK, Point MacKenzie_AK, Meadow Lake_AK, Houston_AK, Susitna_AK, Big Lake_AK, Chugiak_AK","99654, 99645, 99501, 99577, 99504, 99623, 99652, 99502, 99518, 99515, 99567",Elite Real Estate Group Palmer,3645658,"125 W EVERGREEN AVE STE 201, <NA>, PALMER, AK 99645, USA",(907) 746-7890,Office,(907) 746-7890 (Office),,,Elite Real Estate Group Palmer - Broker,3672558,(907) 521-8635,Mobile,(907) 521-8635,(907) 521-8635 (Mobile),https://www.realtor.com/realestateagents/Rachel-Cunha_PALMER_AK_3704659_691879698,https://ap.rdcpix.com/b72f4eb076c935c483a940c19157a2b5a-e3852147312s.jpg,http://rachelcunha.eliterealestatealaska.com,both
4,3650883,,,,,,,,,,,,,,,,,5bf4ac68b2da340011fc81bd,Karen Ross,,,Alaska,AK,2018.0,9.0,agent,"buyer, seller",True,Karen Ross,468782506,0,0.0,0,4900,4900,1,0,0,37,2025-10-09T03:02:30Z,2025-09-02,FAR_19911990,FAR_19911990,138580,,,,"Certified Residential Specialist, Seller Representative Specialist, Accredited Buyer's Representative, Remote Recreational Properties, Distance Buyers, First Time Homebuyers, Multi-Family Properties, Commercial Properties","GRI, ABR, SRS, C2EX",,"Anchorage_AK, Chugiak_AK, Eagle River_AK, Girdwood_AK, Houston_AK, Indian_AK, JBER_AK, Palmer_AK, Sutton_AK, Takotna_AK, Talkeetna_AK, Trapper Creek_AK, Wasilla_AK, Willow_AK, Anchorage~Eagle River~Chugiak~Palmer~Sutton~Wasilla~Houston~Big Lake~Willow","Anchorage_AK, Prudhoe Bay_AK, Ivanof Bay_AK, Chugiak_AK, Eagle River_AK, Girdwood_AK, Meadow Lake_AK, Houston_AK, Indian_AK, JBER_AK, North Lakes_AK, Farm Loop_AK, Butte_AK, Buffalo Soapstone_AK, Lazy Mountain_AK, Knik River_AK, Sutton-Alpine_AK, Fishhook_AK, South Lakes_AK, Lakes_AK, Gateway_AK, Palmer_AK, Chickaloon_AK, Sutton_AK, Glacier View_AK, Takotna_AK, Talkeetna_AK, Susitna North_AK, Chase_AK, Petersville_AK, Trapper Creek_AK, Knik-Fairview_AK, Meadow Lakes_AK, Wasilla_AK, Willow_AK, Point MacKenzie_AK, Susitna_AK, Big Lake_AK, Tanaina_AK","99501, 99502, 99503, 99504, 99507, 99508, 99513, 99515, 99516, 99517, 99518, 99519, 99530, 99695, 99567, 99577, 99587, 99694, 99540, 99505, 99506, 99645, 99674, 99675, 99676, 99683, 99623, 99654, 99688",Jack White Real Estate Mat Su,72104,"865 N Seward Meridian Pkwy Suite 200, <NA>, Wasilla, AK 99654, US",(907) 376-2414,Office,(907) 376-2414 (Office),,,Jack White Real Estate - Broker,4845839,(907) 268-4321,Mobile,(907) 268-4321,(907) 268-4321 (Mobile),https://www.realtor.com/realestateagents/Karen-Ross_Wasilla_AK_3650883_159799698,https://ap.rdcpix.com/31d40ff965cbf73a8d23d03c8a48d39ca-e363323630s.jpg,https://kareneross.jackwhite.com,right_only


In [10]:
for col in merged.columns:
    print(col)

advertiser_id
review_id
rating
comment
display_name
describe_yourself
location
year
transaction_date
started_timestamp
source_id
responsiveness
negotiation_skills
professionalism_communication
market_expertise
link
reply
id
full_name
first_name
last_name
state
usps
first_year
first_month
role
agent_type
is_realtor
person_name
party_id
review_count
agent_rating
recommendations_count
for_sale_price.min
for_sale_price.max
for_sale_price.count
recently_sold.min
recently_sold.max
recently_sold.count
for_sale_price.last_listing_date
recently_sold.last_sold_date
mls_primary_abbrev
mls_abbrevs_joined
mls_licenses_joined
mls_history_joined
mls_history_last_inactivated_abbrev
mls_history_last_inactivated_date
specializations_joined
designations_joined
user_languages
served_areas_as_name_state
marketing_area_cities_as_city_state
zips
office.name
office.fulfillment_id
office_address_joined
office_phone_primary_number
office_phone_primary_type
office_phone_numbers_joined
office_licenses_joined
offi

In [11]:
merged.isnull().sum()

advertiser_id                             0
review_id                               479
rating                                  479
comment                                1581
display_name                           1887
describe_yourself                       481
location                                493
year                                   4846
transaction_date                       3271
started_timestamp                       480
source_id                               479
responsiveness                         2540
negotiation_skills                     2540
professionalism_communication          2540
market_expertise                       2540
link                                   8100
reply                                  8088
id                                        0
full_name                                 1
first_name                             2580
last_name                              2582
state                                     0
usps                            

In [12]:
merged[["full_name", "first_name", "last_name", "person_name"]].head(30)

Unnamed: 0,full_name,first_name,last_name,person_name
0,Michelle Crew,,,Michelle Crew
1,Liz Steele,,,Liz Steele
2,Rachel Cunha,,,Rachel Cunha
3,Rachel Cunha,,,Rachel Cunha
4,Karen Ross,,,Karen Ross
5,Jessica Horwatt,Jessica,Horwatt,Jessica Horwatt
6,Jessica Horwatt,Jessica,Horwatt,Jessica Horwatt
7,Maximillian Lowe,,,Maximillian Lowe
8,Jim Black,Jim,Black,James W. Black III
9,ELIZABETH SCHOK,ELIZABETH,SCHOK,Elizabeth Schok


In [13]:
df= merged.copy()

In [14]:
# --- STEP 1: If full_name is missing, fill it using person_name as fallback ---
df["full_name"] = df["full_name"].fillna(df["person_name"])

# --- STEP 2: Normalize whitespace and capitalization ---
df["full_name"] = (
    df["full_name"]
    .astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
    .str.title()  # Convert to Title Case: "MICHELLE CREW" → "Michelle Crew"
)


# --- STEP 3: Drop old name columns ---
df = df.drop(columns=["first_name", "last_name", "person_name"], errors="ignore")

In [15]:
df.shape

(8100, 68)

In [16]:
df["full_name"].head(40)

0               Michelle Crew
1                  Liz Steele
2                Rachel Cunha
3                Rachel Cunha
4                  Karen Ross
5             Jessica Horwatt
6             Jessica Horwatt
7            Maximillian Lowe
8                   Jim Black
9             Elizabeth Schok
10                Liz Jozwiak
11         Christina Edenshaw
12         Christina Edenshaw
13         Christina Edenshaw
14         Christina Edenshaw
15    Holly Homes Real Estate
16          Kaitlin Hotchkiss
17          Kaitlin Hotchkiss
18          Kaitlin Hotchkiss
19          Kaitlin Hotchkiss
20          Kaitlin Hotchkiss
21          Kaitlin Hotchkiss
22          Kaitlin Hotchkiss
23          Kaitlin Hotchkiss
24          Kaitlin Hotchkiss
25          Kaitlin Hotchkiss
26          Kaitlin Hotchkiss
27          Kaitlin Hotchkiss
28          Kaitlin Hotchkiss
29          Kaitlin Hotchkiss
30          Kaitlin Hotchkiss
31          Kaitlin Hotchkiss
32          Kaitlin Hotchkiss
33        

In [17]:
id_cols = [c for c in merged.columns if "id" in c.lower()]
df[id_cols].head(30)

Unnamed: 0,advertiser_id,review_id,source_id,id,party_id,office.fulfillment_id,broker.fulfillment_id
0,3322174,2e598706-de90-466a-af79-a997cba17fd7,RDC,5a7bd38ca5b955001010c2e2,433688402,72104,4845839
1,3123020,,,59380ce27a9ec40011f7dc17,2485273,2996072,3122926
2,3704659,9522c15d-5252-4630-96b8-348a85cc0a37,RDC,5c7ad42b01d70e001215ce3a,391201526,3645658,3672558
3,3704659,8eae5d62-8913-449d-b8d8-36f4489750f1,RDC,5c7ad42b01d70e001215ce3a,391201526,3645658,3672558
4,3650883,,,5bf4ac68b2da340011fc81bd,468782506,72104,4845839
5,74491,1965bf8c-2ab0-4fdc-91e9-d44cd01e3b76,RDC,5673ff4b7e54f701001e3559,4276654,0,0
6,74491,3021ed38-4112-47de-a381-804b2c33149b,RDC,5673ff4b7e54f701001e3559,4276654,0,0
7,100255586,78f5a9f4-b63f-4aa0-ad49-0d3bb96a163e,RDC,64cc036d5d196e20a38fc109,532803527,0,0
8,1892278,,,56d61ffbb5cc660100bcef4c,38789468,106374,4172446
9,503920,,,56ac02a289a68901006bef97,4686959,0,0


In [18]:
# Drop the two redundant columns
df = df.drop(columns=["id", "party_id","display_name","link","reply" ], errors="ignore")

In [19]:
df.shape

(8100, 63)

In [20]:
for col in df.columns:
    print(col)

advertiser_id
review_id
rating
comment
describe_yourself
location
year
transaction_date
started_timestamp
source_id
responsiveness
negotiation_skills
professionalism_communication
market_expertise
full_name
state
usps
first_year
first_month
role
agent_type
is_realtor
review_count
agent_rating
recommendations_count
for_sale_price.min
for_sale_price.max
for_sale_price.count
recently_sold.min
recently_sold.max
recently_sold.count
for_sale_price.last_listing_date
recently_sold.last_sold_date
mls_primary_abbrev
mls_abbrevs_joined
mls_licenses_joined
mls_history_joined
mls_history_last_inactivated_abbrev
mls_history_last_inactivated_date
specializations_joined
designations_joined
user_languages
served_areas_as_name_state
marketing_area_cities_as_city_state
zips
office.name
office.fulfillment_id
office_address_joined
office_phone_primary_number
office_phone_primary_type
office_phone_numbers_joined
office_licenses_joined
office_license_states_joined
broker.name
broker.fulfillment_id
phone_prim

In [21]:
df.isnull().sum()

advertiser_id                             0
review_id                               479
rating                                  479
comment                                1581
describe_yourself                       481
location                                493
year                                   4846
transaction_date                       3271
started_timestamp                       480
source_id                               479
responsiveness                         2540
negotiation_skills                     2540
professionalism_communication          2540
market_expertise                       2540
full_name                                 0
state                                     0
usps                                      0
first_year                                2
first_month                               2
role                                      0
agent_type                              870
is_realtor                                0
review_count                    

In [22]:
df["agent_type"].nunique()

4

In [23]:
print(df["agent_type"].unique())

['seller, buyer' 'buyer, seller' nan 'seller' 'buyer']


In [24]:
# Replace variants and unify
df["agent_type"] = (
    df["agent_type"]
    .str.lower()
    .replace({
        "seller, buyer": "both",
        "buyer, seller": "both",
        "seller": "seller",
        "buyer": "buyer"
    })
)

# Optional: confirm the result
print(df["agent_type"].unique())

['both' nan 'seller' 'buyer']


In [25]:
# Rename review-related columns for clarity
df = df.rename(columns={
    "rating": "review_rating",
    "comment": "review_comment",
    "describe_yourself": "reviewer_type",
    "location": "reviewer_location",
    "year": "review_year",
    "transaction_date": "review_transaction_date",
    "started_timestamp": "review_started_timestamp",
    "source_id": "review_source_id",
})



In [26]:
df[["review_year", "review_transaction_date", "review_started_timestamp", "review_source_id"]].head(30)


Unnamed: 0,review_year,review_transaction_date,review_started_timestamp,review_source_id
0,2019.0,,2019-06-26T19:45:46.956Z,RDC
1,,,,
2,2023.0,,2023-08-30T10:56:57,RDC
3,2023.0,,2023-06-14T08:15:37,RDC
4,,,,
5,2018.0,,2019-03-08T03:58:48.741Z,RDC
6,2017.0,,2017-12-15T20:01:17.093Z,RDC
7,2024.0,,2024-07-21T13:05:25,RDC
8,,,,
9,,,,


In [29]:
# Convert review_started_timestamp → datetime
df["review_date"] = pd.to_datetime(df["review_started_timestamp"], errors="coerce").dt.date

In [28]:
df.dtypes

advertiser_id                          string[python]
review_id                                      object
review_rating                                 float64
review_comment                                 object
reviewer_type                                  object
reviewer_location                              object
review_year                                    object
review_transaction_date                        object
review_started_timestamp                       object
review_source_id                               object
responsiveness                                float64
negotiation_skills                            float64
professionalism_communication                 float64
market_expertise                              float64
full_name                                      object
state                                          object
usps                                           object
first_year                                    float64
first_month                 

In [30]:
# Drop unnecessary date/source columns
df = df.drop(columns=[
    "review_year",
    "review_transaction_date",
    "review_started_timestamp",
    "review_source_id"
])


In [31]:
df.shape

(8100, 60)

In [32]:
df.isnull().sum()

advertiser_id                             0
review_id                               479
review_rating                           479
review_comment                         1581
reviewer_type                           481
reviewer_location                       493
responsiveness                         2540
negotiation_skills                     2540
professionalism_communication          2540
market_expertise                       2540
full_name                                 0
state                                     0
usps                                      0
first_year                                2
first_month                               2
role                                      0
agent_type                              870
is_realtor                                0
review_count                              0
agent_rating                              0
recommendations_count                     0
for_sale_price.min                        0
for_sale_price.max              

In [35]:
df["role"].nunique()

1

In [36]:
df["user_languages"].nunique()

24

In [37]:
print(df["user_languages"].unique())

[nan 'English' 'English, Spanish' 'Spanish' 'Chinese, Mandarin' 'Korean'
 'ASL American Sign Language, English, Spanish' 'Chinese, English'
 'Farsi, English' 'Spanish, English'
 'Bulgarian, English, German, Spanish, French, Italian' 'English, Korean'
 'Nepali, Hindi, Chinese, Swahili' 'Spanish, French'
 'Albanian, English, Russian, Italian' 'Tagalog, English' 'Bengali'
 'German, Spanish, Portuguese' 'English, Swedish'
 'Spanish, Portuguese, English' 'French' 'English, Arabic, French'
 'Hindi, Urdu, English, Gujarati'
 'Spanish, Chinese, Bosnian, Russian, English'
 'Chinese, English, Japanese, Mandarin']


In [40]:
# Replace NaN with "English"
df["user_languages"] = df["user_languages"].fillna("English")

def normalize_languages(cell):
    """Split by comma, strip, sort alphabetically, and join back."""
    if pd.isna(cell) or str(cell).strip() == "":
        return "English"
    langs = [x.strip() for x in str(cell).split(",") if x.strip()]
    langs_sorted = sorted(set(langs), key=lambda x: x.lower())
    return ", ".join(langs_sorted)

# Apply cleaning and normalization
df["user_languages"] = df["user_languages"].apply(normalize_languages)

In [41]:
print(df["user_languages"].unique())

['English' 'English, Spanish' 'Spanish' 'Chinese, Mandarin' 'Korean'
 'ASL American Sign Language, English, Spanish' 'Chinese, English'
 'English, Farsi' 'Bulgarian, English, French, German, Italian, Spanish'
 'English, Korean' 'Chinese, Hindi, Nepali, Swahili' 'French, Spanish'
 'Albanian, English, Italian, Russian' 'English, Tagalog' 'Bengali'
 'German, Portuguese, Spanish' 'English, Swedish'
 'English, Portuguese, Spanish' 'French' 'Arabic, English, French'
 'English, Gujarati, Hindi, Urdu'
 'Bosnian, Chinese, English, Russian, Spanish'
 'Chinese, English, Japanese, Mandarin']


In [43]:
drop_cols = [
    
    "for_sale_price.last_listing_date", 

    # MLS-level administrative junk (too sparse / internal only)
    "mls_primary_abbrev", "mls_abbrevs_joined", "mls_licenses_joined",
    "mls_history_joined", "mls_history_last_inactivated_abbrev",
    "mls_history_last_inactivated_date",

    # Repetitive office license & phone fields (display-level noise)
    "office_phone_primary_type", "office_phone_numbers_joined",
    "office_licenses_joined", "office_license_states_joined",

    # Personal contact duplicates
    "phone_numbers_joined", "phone_display_joined",

    # Technical or merge markers
     "_merge"
]
df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)

In [44]:
df.shape

(8100, 46)

In [45]:
df.isnull().sum()

advertiser_id                             0
review_id                               479
review_rating                           479
review_comment                         1581
reviewer_type                           481
reviewer_location                       493
responsiveness                         2540
negotiation_skills                     2540
professionalism_communication          2540
market_expertise                       2540
full_name                                 0
state                                     0
usps                                      0
first_year                                2
first_month                               2
role                                      0
agent_type                              870
is_realtor                                0
review_count                              0
agent_rating                              0
recommendations_count                     0
for_sale_price.min                        0
for_sale_price.max              

In [47]:
df[["first_year", "first_month"]].head(40)

Unnamed: 0,first_year,first_month
0,2018.0,1.0
1,2009.0,6.0
2,2017.0,1.0
3,2017.0,1.0
4,2018.0,9.0
5,1999.0,0.0
6,1999.0,0.0
7,2023.0,1.0
8,6.0,0.0
9,2005.0,9.0


In [49]:
import numpy as np
# ensure numeric
fy = pd.to_numeric(df["first_year"], errors="coerce")
fm = pd.to_numeric(df["first_month"], errors="coerce")

# current year for sanity checks
curr_year = pd.Timestamp.today().year

# validity rules
valid_year  = fy.between(1900, curr_year)          # e.g., 6 or 0 => invalid
valid_month = fm.between(1, 12)                     # 0 or >12 => invalid
not_zerozero = ~((fy.fillna(0) == 0) & (fm.fillna(0) == 0))

valid_mask = valid_year & valid_month & not_zerozero

# build start dates; invalid -> NaT
start_dates = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")
start_dates.loc[valid_mask] = pd.to_datetime(
    {
        "year":  fy.loc[valid_mask].astype(int),
        "month": fm.loc[valid_mask].astype(int),
        "day":   1
    },
    errors="coerce"
)

# compute years of experience (negative/future -> NaN)
today = pd.Timestamp.today().normalize()
exp_years = (today - start_dates) / np.timedelta64(1, "D") / 365.25
exp_years = exp_years.mask(exp_years < 0)  # future start dates -> NaN

# round to 1 decimal (optional)
df["experience_years"] = exp_years.round(2)

In [50]:
df["experience_years"].head(40)

0      7.77
1     16.36
2      8.77
3      8.77
4      7.11
5       NaN
6       NaN
7      2.77
8       NaN
9     20.11
10    14.19
11     3.53
12     3.53
13     3.53
14     3.53
15      NaN
16     4.86
17     4.86
18     4.86
19     4.86
20     4.86
21     4.86
22     4.86
23     4.86
24     4.86
25     4.86
26     4.86
27     4.86
28     4.86
29     4.86
30     4.86
31     4.86
32     4.86
33     4.86
34     4.86
35     4.86
36     4.86
37     4.86
38     4.86
39    20.77
Name: experience_years, dtype: float64

In [51]:
# Drop unnecessary date/source columns
df = df.drop(columns=[
    "first_year",
    "first_month",
])


In [52]:
for col in df.columns:
    print(col)

advertiser_id
review_id
review_rating
review_comment
reviewer_type
reviewer_location
responsiveness
negotiation_skills
professionalism_communication
market_expertise
full_name
state
usps
role
agent_type
is_realtor
review_count
agent_rating
recommendations_count
for_sale_price.min
for_sale_price.max
for_sale_price.count
recently_sold.min
recently_sold.max
recently_sold.count
recently_sold.last_sold_date
specializations_joined
designations_joined
user_languages
served_areas_as_name_state
marketing_area_cities_as_city_state
zips
office.name
office.fulfillment_id
office_address_joined
office_phone_primary_number
broker.name
broker.fulfillment_id
phone_primary_number
phone_primary_type
web_url
photo.href
href
review_date
experience_years


In [53]:
df.shape

(8100, 45)

In [54]:
df = df.drop_duplicates()

In [55]:
df.shape

(8100, 45)

In [56]:
df = df.drop_duplicates(subset=["review_id"], keep="first")

In [57]:
df.shape

(7622, 45)

In [58]:
# Drop duplicate reviews based on review_id
df_clean = df.drop_duplicates(subset=["review_id"], keep="first")

# Save cleaned dataset
output_path = "agents_reviews_merged_clean.csv"
df_clean.to_csv(output_path, index=False, encoding="utf-8")

print(f"✅ Cleaned DataFrame saved successfully as: {output_path}")
print("Final shape:", df_clean.shape)


✅ Cleaned DataFrame saved successfully as: agents_reviews_merged_clean.csv
Final shape: (7622, 45)
