# Merge the 500 labeled rows to our entire dataset

In [511]:
import pandas as pd
df_labeled_500 = pd.read_csv("../datasets/filtered_data_with_main_address_per_user.csv")
df_all_rows_with_gemini_predictions = pd.read_csv("../datasets/filtered_data_with_main_address_per_user_gemini_all.csv")

In [512]:
df_labeled_500.shape

(2153, 30)

In [513]:
df_all_rows_with_gemini_predictions.shape

(158717, 26)

In [514]:
df_all_rows_with_gemini_predictions.head()

Unnamed: 0.1,Unnamed: 0,caid,quarter,cluster,total_pings,unique_days,unique_hours,zipcode,centroid_latitude,centroid_longitude,log_total_pings,total_days_in_quarter,consistency_score,day_consistency_score,evening_consistency_score,night_consistency_score,time_window_coverage,weekend_focus_score,dominance_score,hour_entropy,max_consecutive_hours,median_income_household_2023,matches_known_address,matched_address,is_main_address_gemini_top5,main_address_justification_gemini_top5
0,0,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,0,32,3,3,90020,34.065744,-118.29635,3.496508,4,0.75,1.0,0.5,,0.666667,,0.666667,0.900256,253,55832,1,"450 S. Kenmore Avenue #112, Los Angeles, CA, 9...",1,Cluster 0 has the highest dominance score (0.6...
1,1,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,1,9,1,1,90002,33.959281,-118.253437,2.302585,4,0.25,,0.5,,0.333333,,0.1875,,1,56158,0,,0,
2,2,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,2,7,1,1,91606,34.182663,-118.383647,2.079442,4,0.25,,0.5,,0.333333,,0.145833,,1,66884,0,,0,
3,3,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,5,0,106,2,12,90020,34.065744,-118.29635,4.672829,3,0.666667,1.0,,0.666667,0.666667,,0.946429,2.257982,87,55832,1,"450 S. Kenmore Avenue #112, Los Angeles, CA, 9...",1,Cluster 0 is the better choice for the main ho...
4,4,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,5,3,6,1,2,90020,34.065623,-118.2925,1.94591,3,0.333333,,,0.333333,0.333333,,0.053571,0.450561,4,55832,0,,0,


## Remove is_main_address and main_address_justification, is_residential since they contain is_residential

In [515]:
df_labeled_500.drop(columns=["is_main_address", "main_address_justification","is_residential"], inplace=True)

# Merge datasets

In [516]:
# Step 1: Drop the first column from each dataframe (by position)
df_gemini = df_all_rows_with_gemini_predictions.iloc[:, 1:]
df_labeled = df_labeled_500.iloc[:, 2:]

# Step 2: Remove overlapping columns (except keys)
overlapping_cols = [
    col for col in df_labeled.columns 
    if col in df_gemini.columns and col not in ["caid", "quarter","cluster"]
]
df_labeled_clean = df_labeled.drop(columns=overlapping_cols)

# Step 3: Merge cleanly on caid + quarter
df_final = pd.merge(df_gemini, df_labeled_clean, on=["caid", "quarter","cluster"], how="left")
df_final = df_final.sort_values(["caid", "quarter", "dominance_score"], ascending=[True, True, False])



In [517]:
df_final.head()

Unnamed: 0,caid,quarter,cluster,total_pings,unique_days,unique_hours,zipcode,centroid_latitude,centroid_longitude,log_total_pings,total_days_in_quarter,consistency_score,day_consistency_score,evening_consistency_score,night_consistency_score,time_window_coverage,weekend_focus_score,dominance_score,hour_entropy,max_consecutive_hours,median_income_household_2023,matches_known_address,matched_address,is_main_address_gemini_top5,main_address_justification_gemini_top5,address,search_results,is_main_address_no_residential,main_address_justification_no_residential
0,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,0,32,3,3,90020,34.065744,-118.29635,3.496508,4,0.75,1.0,0.5,,0.666667,,0.666667,0.900256,253,55832,1,"450 S. Kenmore Avenue #112, Los Angeles, CA, 9...",1,Cluster 0 has the highest dominance score (0.6...,"450 S Kenmore Ave #104, Los Angeles, CA 90020,...","Title: Apartment 450 S Kenmore Ave Apt 412, Lo...",1.0,Cluster 0 was selected as the main home addres...
1,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,1,9,1,1,90002,33.959281,-118.253437,2.302585,4,0.25,,0.5,,0.333333,,0.1875,,1,56158,0,,0,,"1214 1/2 E 87th St, Los Angeles, CA 90002, USA","Title: 1232 E 87th Pl, Los Angeles, CA 90002 |...",0.0,
2,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,2,7,1,1,91606,34.182663,-118.383647,2.079442,4,0.25,,0.5,,0.333333,,0.145833,,1,66884,0,,0,,"6137 Beck Ave, North Hollywood, CA 91606, USA",Title: National Directory of Mental Health Tre...,0.0,
3,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,5,0,106,2,12,90020,34.065744,-118.29635,4.672829,3,0.666667,1.0,,0.666667,0.666667,,0.946429,2.257982,87,55832,1,"450 S. Kenmore Avenue #112, Los Angeles, CA, 9...",1,Cluster 0 is the better choice for the main ho...,,,,
4,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,5,3,6,1,2,90020,34.065623,-118.2925,1.94591,3,0.333333,,,0.333333,0.333333,,0.053571,0.450561,4,55832,0,,0,,,,,


In [518]:
df_final.shape

(158717, 29)

# MULTICLASS

# Pivot dataset

In [446]:
import pandas as pd
import numpy as np

# ---------------------------------------
# 1. Load and prepare data
# ---------------------------------------
df = df_final.copy()

# Features to use per cluster
features = [
    'day_consistency_score', 'evening_consistency_score', 'night_consistency_score',
    'log_total_pings', 'consistency_score', 'unique_hours', 'max_consecutive_hours',
    'dominance_score', 'time_window_coverage','hour_entropy',
]

# ---------------------------------------
# 2. Sort and assign cluster index per caid+quarter
# ---------------------------------------
df = df.sort_values(["caid", "quarter", "dominance_score"], ascending=[True, True, False])
df["cluster_idx"] = df.groupby(["caid", "quarter"]).cumcount()

# ---------------------------------------
# 3. Pivot to wide format per caid + quarter
# ---------------------------------------
pivot_df = df.pivot(index=["caid", "quarter"], columns="cluster_idx", values=features)
pivot_df.columns = [f"c{col[1]}_{col[0]}" for col in pivot_df.columns]
pivot_df.reset_index(inplace=True)

# ---------------------------------------
# 4. Create label from trusted addresses (no_residential flag)
# ---------------------------------------
labels = df[df["is_main_address_no_residential"] == 1][["caid", "quarter", "cluster_idx"]]
labels = labels.rename(columns={"cluster_idx": "label"})

# ---------------------------------------
# 5. Merge pivoted features with labels
# ---------------------------------------
df_merge = pivot_df.merge(labels, on=["caid", "quarter"], how="left")

print("✅ Done: Pivoted by (caid, quarter) and created trusted labels.")



✅ Done: Pivoted by (caid, quarter) and created trusted labels.


In [447]:
df_merge.head()

Unnamed: 0,caid,quarter,c0_day_consistency_score,c1_day_consistency_score,c2_day_consistency_score,c3_day_consistency_score,c4_day_consistency_score,c0_evening_consistency_score,c1_evening_consistency_score,c2_evening_consistency_score,...,c1_time_window_coverage,c2_time_window_coverage,c3_time_window_coverage,c4_time_window_coverage,c0_hour_entropy,c1_hour_entropy,c2_hour_entropy,c3_hour_entropy,c4_hour_entropy,label
0,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,1.0,,,,,0.5,0.5,0.5,...,0.333333,0.333333,,,0.900256,,,,,0.0
1,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,5,1.0,,,,,,,,...,0.333333,,,,2.257982,0.450561,,,,
2,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,8,1.0,,,,,0.5,0.5,,...,0.333333,,,,2.639338,,,,,
3,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,11,0.75,0.25,,,,,,,...,0.333333,,,,2.40792,,,,,
4,000c95192187099854c0562841a02cf1185e5dfffd4f6a...,2,0.5,0.75,0.5,0.5,0.25,,,,...,0.333333,0.333333,0.333333,0.333333,0.679193,0.636514,0.304636,0.529706,,


# Predict

In [448]:
import lightgbm as lgb

# ---------------------------------------
# 6. Define expected input features (same order used during training!)
# ---------------------------------------
max_clusters = df["cluster_idx"].max()
cluster_indices = list(range(max_clusters + 1))

# You only use `features` for prediction, not all of df_final
predict_features = features
expected_feature_cols = [f"c{i}_{f}" for i in cluster_indices for f in predict_features]

# Fill any missing columns with NaN (to match model expectations)
for col in expected_feature_cols:
    if col not in df_merge.columns:
        df_merge[col] = np.nan

# Ensure numeric types
df_merge[expected_feature_cols] = df_merge[expected_feature_cols].apply(pd.to_numeric, errors="coerce")

# ---------------------------------------
# 7. Predict for rows without label (i.e., unlabeled caid+quarter)
# ---------------------------------------
df_unlabeled = df_merge[df_merge["label"].isna()].copy()
X_unlabeled = df_unlabeled[expected_feature_cols]

# Load pretrained model
booster = lgb.Booster(model_file="lgbm_multiclass_main_address_model.txt")

# Predict class probabilities
pred_probs = booster.predict(X_unlabeled)
df_unlabeled["predicted_label"] = np.argmax(pred_probs, axis=1)

# ---------------------------------------
# 8. Merge predictions back
# ---------------------------------------
df_pred = df_unlabeled[["caid", "quarter", "predicted_label"]]
df_merge = df_merge.merge(df_pred, on=["caid", "quarter"], how="left")

# Final label: use original label if available, else predicted
df_merge["final_label"] = df_merge["label"]
df_merge.loc[df_merge["final_label"].isna(), "final_label"] = df_merge["predicted_label"]
df_merge["final_label"] = df_merge["final_label"].astype(int)

df_merge.drop(columns=["predicted_label","label"], inplace=True)
print("✅ Prediction complete. Final main cluster index per caid+quarter is in 'final_label'")


✅ Prediction complete. Final main cluster index per caid+quarter is in 'final_label'


In [449]:
df_merge.head()

Unnamed: 0,caid,quarter,c0_day_consistency_score,c1_day_consistency_score,c2_day_consistency_score,c3_day_consistency_score,c4_day_consistency_score,c0_evening_consistency_score,c1_evening_consistency_score,c2_evening_consistency_score,...,c1_time_window_coverage,c2_time_window_coverage,c3_time_window_coverage,c4_time_window_coverage,c0_hour_entropy,c1_hour_entropy,c2_hour_entropy,c3_hour_entropy,c4_hour_entropy,final_label
0,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,1.0,,,,,0.5,0.5,0.5,...,0.333333,0.333333,,,0.900256,,,,,0
1,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,5,1.0,,,,,,,,...,0.333333,,,,2.257982,0.450561,,,,3
2,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,8,1.0,,,,,0.5,0.5,,...,0.333333,,,,2.639338,,,,,3
3,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,11,0.75,0.25,,,,,,,...,0.333333,,,,2.40792,,,,,3
4,000c95192187099854c0562841a02cf1185e5dfffd4f6a...,2,0.5,0.75,0.5,0.5,0.25,,,,...,0.333333,0.333333,0.333333,0.333333,0.679193,0.636514,0.304636,0.529706,,3


In [450]:
import pandas as pd

# Load Gemini predictions
df_gemini = df_final.copy()

# Sort and assign cluster_idx per caid+quarter
df_gemini = df_gemini.sort_values(["caid", "quarter", "dominance_score"], ascending=[True, True, False])
df_gemini["cluster_idx"] = df_gemini.groupby(["caid", "quarter"]).cumcount()

# Extract Gemini-selected clusters
df_gemini_main = df_gemini[df_gemini["is_main_address_gemini_top5"] == 1][["caid", "quarter", "cluster", "cluster_idx"]]
df_gemini_main = df_gemini_main.rename(columns={
    "cluster_idx": "gemini_cluster_idx",
    "cluster": "gemini_cluster"
})

# Merge into df_merge
df_merge = df_merge.merge(df_gemini_main, on=["caid", "quarter"], how="left")

# Move Gemini columns to the end, but right next to each other
cols = df_merge.columns.tolist()

# Remove if already present
cols.remove("gemini_cluster_idx")
cols.remove("gemini_cluster")

# Reinsert right before the last few columns
insertion_point = len(cols)  # or adjust if needed
cols.insert(insertion_point, "gemini_cluster_idx")
cols.insert(insertion_point + 1, "gemini_cluster")

df_merge = df_merge[cols]

print("✅ Gemini cluster columns added and reordered to appear side-by-side at the end.")


✅ Gemini cluster columns added and reordered to appear side-by-side at the end.


In [458]:
df_merge.head()

Unnamed: 0,caid,quarter,c0_day_consistency_score,c1_day_consistency_score,c2_day_consistency_score,c3_day_consistency_score,c4_day_consistency_score,c0_evening_consistency_score,c1_evening_consistency_score,c2_evening_consistency_score,c3_evening_consistency_score,c4_evening_consistency_score,c0_night_consistency_score,c1_night_consistency_score,c2_night_consistency_score,c3_night_consistency_score,c4_night_consistency_score,c0_log_total_pings,c1_log_total_pings,c2_log_total_pings,c3_log_total_pings,c4_log_total_pings,c0_consistency_score,c1_consistency_score,c2_consistency_score,c3_consistency_score,c4_consistency_score,c0_unique_hours,c1_unique_hours,c2_unique_hours,c3_unique_hours,c4_unique_hours,c0_max_consecutive_hours,c1_max_consecutive_hours,c2_max_consecutive_hours,c3_max_consecutive_hours,c4_max_consecutive_hours,c0_dominance_score,c1_dominance_score,c2_dominance_score,c3_dominance_score,c4_dominance_score,c0_time_window_coverage,c1_time_window_coverage,c2_time_window_coverage,c3_time_window_coverage,c4_time_window_coverage,c0_hour_entropy,c1_hour_entropy,c2_hour_entropy,c3_hour_entropy,c4_hour_entropy,final_label,gemini_cluster_idx,gemini_cluster
0,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,1.0,,,,,0.5,0.5,0.5,,,,,,,,3.496508,2.302585,2.079442,,,0.75,0.25,0.25,,,3.0,1.0,1.0,,,253.0,1.0,1.0,,,0.666667,0.1875,0.145833,,,0.666667,0.333333,0.333333,,,0.900256,,,,,0,0,0
1,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,5,1.0,,,,,,,,,,0.666667,0.333333,,,,4.672829,1.94591,,,,0.666667,0.333333,,,,12.0,2.0,,,,87.0,4.0,,,,0.946429,0.053571,,,,0.666667,0.333333,,,,2.257982,0.450561,,,,3,0,0
2,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,8,1.0,,,,,0.5,0.5,,,,1.0,,,,,5.075174,1.94591,,,,0.75,0.25,,,,18.0,1.0,,,,229.0,1.0,,,,0.963636,0.036364,,,,1.0,0.333333,,,,2.639338,,,,,3,0,0
3,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,11,0.75,0.25,,,,,,,,,1.0,,,,,4.804021,1.94591,,,,0.75,0.25,,,,15.0,1.0,,,,198.0,1.0,,,,0.952756,0.047244,,,,0.666667,0.333333,,,,2.40792,,,,,3,0,0
4,000c95192187099854c0562841a02cf1185e5dfffd4f6a...,2,0.5,0.75,0.5,0.5,0.25,,,,,,,,,,,2.564949,2.564949,2.484907,2.302585,2.079442,0.5,0.75,0.5,0.5,0.25,2.0,2.0,2.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.206897,0.206897,0.189655,0.155172,0.12069,0.333333,0.333333,0.333333,0.333333,0.333333,0.679193,0.636514,0.304636,0.529706,,3,1,2


In [459]:
# Only consider rows where Gemini made a prediction
df_valid = df_merge[df_merge["gemini_cluster_idx"].notna()].copy()

# Compare final_label with gemini_cluster_idx
df_valid["disagree"] = df_valid["final_label"] != df_valid["gemini_cluster_idx"]

# Compute percentage disagreement
disagreement_rate = df_valid["disagree"].mean() * 100

print(f"❌ Disagreement between model and Gemini: {disagreement_rate:.2f}%")


❌ Disagreement between model and Gemini: 97.42%


In [460]:
df_final

Unnamed: 0,caid,quarter,cluster,total_pings,unique_days,unique_hours,zipcode,centroid_latitude,centroid_longitude,log_total_pings,total_days_in_quarter,consistency_score,day_consistency_score,evening_consistency_score,night_consistency_score,time_window_coverage,weekend_focus_score,dominance_score,hour_entropy,max_consecutive_hours,median_income_household_2023,matches_known_address,matched_address,is_main_address_gemini_top5,main_address_justification_gemini_top5,address,search_results,is_main_address_no_residential,main_address_justification_no_residential
0,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,0,32,3,3,90020,34.065744,-118.296350,3.496508,4,0.750000,1.0,0.5,,0.666667,,0.666667,0.900256,253,55832,1,"450 S. Kenmore Avenue #112, Los Angeles, CA, 9...",1,Cluster 0 has the highest dominance score (0.6...,"450 S Kenmore Ave #104, Los Angeles, CA 90020,...","Title: Apartment 450 S Kenmore Ave Apt 412, Lo...",1.0,Cluster 0 was selected as the main home addres...
1,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,1,9,1,1,90002,33.959281,-118.253437,2.302585,4,0.250000,,0.5,,0.333333,,0.187500,,1,56158,0,,0,,"1214 1/2 E 87th St, Los Angeles, CA 90002, USA","Title: 1232 E 87th Pl, Los Angeles, CA 90002 |...",0.0,
2,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,2,7,1,1,91606,34.182663,-118.383647,2.079442,4,0.250000,,0.5,,0.333333,,0.145833,,1,66884,0,,0,,"6137 Beck Ave, North Hollywood, CA 91606, USA",Title: National Directory of Mental Health Tre...,0.0,
3,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,5,0,106,2,12,90020,34.065744,-118.296350,4.672829,3,0.666667,1.0,,0.666667,0.666667,,0.946429,2.257982,87,55832,1,"450 S. Kenmore Avenue #112, Los Angeles, CA, 9...",1,Cluster 0 is the better choice for the main ho...,,,,
4,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,5,3,6,1,2,90020,34.065623,-118.292500,1.945910,3,0.333333,,,0.333333,0.333333,,0.053571,0.450561,4,55832,0,,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158712,fffc5273eff908ea2b2f0575cba9456bfb8a297f250d97...,5,8,1,1,1,90068,34.105830,-118.324080,0.693147,7,0.142857,,1.0,,0.333333,,0.011111,,1,101720,0,,0,,,,,
158713,fffc5273eff908ea2b2f0575cba9456bfb8a297f250d97...,8,0,6,3,3,90013,34.047980,-118.247367,1.945910,4,0.750000,1.0,,,0.333333,0.666664,0.750000,1.011404,150,32848,1,"400 South Main Street #503, Los Angeles, CA, 9...",1,Cluster 0 is the most likely main address. It ...,,,,
158714,fffc5273eff908ea2b2f0575cba9456bfb8a297f250d97...,8,7,1,1,1,90017,34.050070,-118.268740,0.693147,4,0.250000,,,1.000000,0.333333,,0.125000,,1,51317,0,,0,,,,,
158715,fffc5273eff908ea2b2f0575cba9456bfb8a297f250d97...,8,8,1,1,1,90068,34.105830,-118.324080,0.693147,4,0.250000,,1.0,,0.333333,,0.125000,,1,101720,0,,0,,,,,


# BINARY

In [519]:
import pandas as pd
import numpy as np
import lightgbm as lgb

# ----------------------------------
# 1. Load data and pretrained model
# ----------------------------------
df = df_final.copy()
model = lgb.Booster(model_file="lgbm_bayes_main_address_model.txt")

# ----------------------------------
# 2. Define features used for training
# ----------------------------------
features = [
    'day_consistency_score', 'evening_consistency_score', 'night_consistency_score',
    'log_total_pings', 'consistency_score', 'unique_hours', 'max_consecutive_hours',
    'dominance_score', 'time_window_coverage', 'hour_entropy'
]

# ----------------------------------
# 3. Select rows to predict (no trusted label)
# ----------------------------------
df_pred = df[df["is_main_address_no_residential"].isna()].copy()
X_pred = df_pred[features].apply(pd.to_numeric, errors="coerce")

# ----------------------------------
# 4. Predict probability of being main address
# ----------------------------------
df_pred["pred_score"] = model.predict(X_pred)

# ----------------------------------
# 5. Determine top-scoring cluster per CAID + quarter
# ----------------------------------
df_pred["pred_rank"] = df_pred.groupby(["caid", "quarter"])["pred_score"] \
                              .rank(method="dense", ascending=False)
df_pred["pred_is_main"] = (df_pred["pred_rank"] == 1).astype(int)


# ----------------------------------
# 6. Merge predictions back into df_final
# ----------------------------------
df_final = df_final.merge(
    df_pred[["caid", "quarter", "cluster", "pred_score", "pred_rank", "pred_is_main"]],
    on=["caid", "quarter", "cluster"],
    how="left"
)

# Combine trusted labels and model predictions
df_final["final_is_main_address"] = df_final["is_main_address_no_residential"]

# Fill NaNs with predicted labels
df_final.loc[df_final["final_is_main_address"].isna(), "final_is_main_address"] = df_final["pred_is_main"]

# Ensure integer type (0 or 1)
df_final["final_is_main_address"] = df_final["final_is_main_address"].astype(int)

print("✅ Created 'final_is_main_address' combining trusted and predicted labels.")


print("✅ Added pred_score, pred_rank, and pred_is_main to df_final for rows without trusted labels.")


✅ Created 'final_is_main_address' combining trusted and predicted labels.
✅ Added pred_score, pred_rank, and pred_is_main to df_final for rows without trusted labels.


In [524]:
df_final

Unnamed: 0,caid,quarter,cluster,total_pings,unique_days,unique_hours,zipcode,centroid_latitude,centroid_longitude,log_total_pings,total_days_in_quarter,consistency_score,day_consistency_score,evening_consistency_score,night_consistency_score,time_window_coverage,weekend_focus_score,dominance_score,hour_entropy,max_consecutive_hours,median_income_household_2023,matches_known_address,matched_address,is_main_address_gemini_top5,main_address_justification_gemini_top5,is_main_address_no_residential,main_address_justification_no_residential,pred_score,pred_rank,pred_is_main,final_is_main_address
0,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,0,32,3,3,90020,34.065744,-118.296350,3.496508,4,0.750000,1.0,0.5,,0.666667,,0.666667,0.900256,253,55832,1,"450 S. Kenmore Avenue #112, Los Angeles, CA, 9...",1,Cluster 0 has the highest dominance score (0.6...,1.0,Cluster 0 was selected as the main home addres...,,,,1
1,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,1,9,1,1,90002,33.959281,-118.253437,2.302585,4,0.250000,,0.5,,0.333333,,0.187500,,1,56158,0,,0,,0.0,,,,,0
2,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,2,2,7,1,1,91606,34.182663,-118.383647,2.079442,4,0.250000,,0.5,,0.333333,,0.145833,,1,66884,0,,0,,0.0,,,,,0
3,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,5,0,106,2,12,90020,34.065744,-118.296350,4.672829,3,0.666667,1.0,,0.666667,0.666667,,0.946429,2.257982,87,55832,1,"450 S. Kenmore Avenue #112, Los Angeles, CA, 9...",1,Cluster 0 is the better choice for the main ho...,,,0.995084,1.0,1.0,1
4,000c2d116598ea942c398285b59f0e8ee465d200810bfa...,5,3,6,1,2,90020,34.065623,-118.292500,1.945910,3,0.333333,,,0.333333,0.333333,,0.053571,0.450561,4,55832,0,,0,,,,0.054602,2.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158712,fffc5273eff908ea2b2f0575cba9456bfb8a297f250d97...,5,8,1,1,1,90068,34.105830,-118.324080,0.693147,7,0.142857,,1.0,,0.333333,,0.011111,,1,101720,0,,0,,,,0.009122,4.0,0.0,0
158713,fffc5273eff908ea2b2f0575cba9456bfb8a297f250d97...,8,0,6,3,3,90013,34.047980,-118.247367,1.945910,4,0.750000,1.0,,,0.333333,0.666664,0.750000,1.011404,150,32848,1,"400 South Main Street #503, Los Angeles, CA, 9...",1,Cluster 0 is the most likely main address. It ...,,,0.980089,1.0,1.0,1
158714,fffc5273eff908ea2b2f0575cba9456bfb8a297f250d97...,8,7,1,1,1,90017,34.050070,-118.268740,0.693147,4,0.250000,,,1.000000,0.333333,,0.125000,,1,51317,0,,0,,,,0.102261,2.0,0.0,0
158715,fffc5273eff908ea2b2f0575cba9456bfb8a297f250d97...,8,8,1,1,1,90068,34.105830,-118.324080,0.693147,4,0.250000,,1.0,,0.333333,,0.125000,,1,101720,0,,0,,,,0.019133,3.0,0.0,0


In [527]:
# Ensure both columns are integer type for comparison
df_final["is_main_address_gemini_top5"] = df_final["is_main_address_gemini_top5"].fillna(0).astype(int)
df_final["final_is_main_address"] = df_final["final_is_main_address"].astype(int)

# Total rows where either column is not null
valid_rows = df_final[["is_main_address_gemini_top5", "final_is_main_address"]].dropna()

# Compute disagreement
mismatch = (valid_rows["is_main_address_gemini_top5"] != valid_rows["final_is_main_address"]).sum()
total = len(valid_rows)

# Percentage mismatch
percent_mismatch = 100 * mismatch / total

print(f"🔍 Mismatch between Gemini and Final labels: {mismatch} out of {total} rows")
print(f"❌ Percentage mismatch: {percent_mismatch:.2f}%")


🔍 Mismatch between Gemini and Final labels: 5633 out of 158717 rows
❌ Percentage mismatch: 3.55%


In [523]:
df_final.drop(columns=["address","search_results"], inplace=True)

In [528]:
df_final.to_csv("../datasets/predicted_data_with_main_address_per_user.csv", index=False)

In [529]:
df_final.to_excel("../datasets/predicted_data_with_main_address_per_user.xlsx", index=False)