In [22]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score

# Load the filtered CSV; the path is relative to the notebook's working directory.
vehicle_csv_path = "../data/filteredvehpub.csv"   # adjust path as needed
df = pd.read_csv(vehicle_csv_path)
df.head()   # not the cell's last expression, so this preview is never rendered
df.shape    # (n_rows, n_columns) -- the value the cell actually displays

(256115, 13)

In [24]:
import numpy as np
# Sentinel codes treated as missing, listed in both numeric and string form
# (including a padded "XX " variant).
invalid_values = [
    -9, -8, -7, -88, 99,
    "99", "XX", "xx", "XX ",
    "-88", "-9", "-8", "-7",
]

# Blank out every sentinel in every column, then keep only rows with no NaN left.
# NOTE(review): the replacement is applied to all columns, so a legitimate value
# equal to one of these codes (e.g. 99) in any column also removes the row -- confirm.
df = df.replace(to_replace=invalid_values, value=np.nan).dropna()

In [26]:
# NOTE(review): disabled step. If re-enabled it would turn VEHTYPE codes 5 and 6
# into NaN and then drop every row containing a NaN. As written this cell does
# nothing; either delete it or restore it deliberately.
#df["VEHTYPE"] = df["VEHTYPE"].replace([5, 6], np.nan)
#df = df.dropna()

In [28]:
# Rows per distinct MAKE value (most frequent first), computed before MAKE is
# normalised to integer codes; the cell displays summary statistics of those counts.
counts = df["MAKE"].value_counts(sort=True, ascending=False)   # defaults spelled out
counts.describe()

count       53.000000
mean      4619.924528
std       8184.317383
min         59.000000
25%        286.000000
50%       1584.000000
75%       4649.000000
max      34870.000000
Name: count, dtype: float64

In [30]:
# Makes observed on fewer rows than this are pooled into a single bucket.
MIN_MAKE_COUNT = 1000
# Code written in place of every pooled (rare) make.
# NOTE(review): rows that already carry this code in the raw data end up in the
# same bucket -- confirm that is intended.
OTHER_MAKE_CODE = 98

# Normalise MAKE to integer codes. Going through str + strip handles mixed
# int/str cells and whitespace-padded strings. pd.to_numeric is used instead of
# a direct str -> int cast because a float-typed column stringifies as "12.0",
# which int() rejects.
make_codes = pd.to_numeric(df["MAKE"].astype(str).str.strip())
if (make_codes % 1 != 0).any():
    # Keep failing loudly on anything that is not a whole number (NaN included),
    # rather than silently truncating it in the integer cast below.
    raise ValueError("MAKE contains values that are not integer codes")
df["MAKE"] = make_codes.astype(int)

# Pool every make with fewer than MIN_MAKE_COUNT rows into OTHER_MAKE_CODE.
counts = df["MAKE"].value_counts()
rare_makes = counts[counts < MIN_MAKE_COUNT].index

df["MAKE"] = df["MAKE"].where(~df["MAKE"].isin(rare_makes), OTHER_MAKE_CODE)
df["MAKE"].value_counts()

MAKE
12    34870
49    33679
20    31287
37    23895
7     12576
35    11706
98     8243
2      7309
48     6945
23     6915
55     6773
59     4980
18     4867
6      4713
63     4649
30     4622
34     4545
41     4340
42     3651
72     3094
19     2667
22     2665
54     2540
51     1832
14     1800
13     1789
24     1584
32     1495
58     1420
52     1240
53     1161
76     1004
Name: count, dtype: int64

In [32]:
def _unique_sorted_makes(makes):
    """Return the distinct values of one household's MAKE entries, ascending."""
    return sorted(set(makes))


# One row per HOUSEID with the list of distinct makes seen for that household.
makes_by_household = df.groupby("HOUSEID")["MAKE"]
hh_makes = makes_by_household.apply(_unique_sorted_makes).reset_index(name="MAKE_LIST")

hh_makes.head()

Unnamed: 0,HOUSEID,MAKE_LIST
0,30000007,"[19, 20, 49]"
1,30000008,[20]
2,30000012,"[12, 58]"
3,30000019,"[37, 98]"
4,30000029,"[20, 49]"


In [35]:
# Household key followed by the household-level columns carried into the model.
household_feature_cols = [
    "HOUSEID",
    "HHSIZE", "HHFAMINC", "LIF_CYC", "CENSUS_R", "HH_RACE",
    "HOMEOWN", "WRKCOUNT", "URBAN", "URBANSIZE", "DRVRCNT",
]

# Collapse to one row per household; the first row seen for each HOUSEID wins.
# NOTE(review): assumes these columns are constant within a household -- confirm.
hh_feat = df.loc[:, household_feature_cols].drop_duplicates(subset="HOUSEID")

# Attach each household's MAKE_LIST (default inner join on the key).
hh = pd.merge(hh_feat, hh_makes, on="HOUSEID")
hh.head()

Unnamed: 0,HOUSEID,HHSIZE,HHFAMINC,LIF_CYC,CENSUS_R,HH_RACE,HOMEOWN,WRKCOUNT,URBAN,URBANSIZE,DRVRCNT,MAKE_LIST
0,30000007,3,7.0,10.0,3,2.0,1.0,1,1,1,3,"[19, 20, 49]"
1,30000008,2,8.0,2.0,2,1.0,1.0,2,4,6,2,[20]
2,30000012,1,10.0,1.0,1,1.0,1.0,1,1,3,1,"[12, 58]"
3,30000019,2,3.0,2.0,3,1.0,1.0,0,1,1,2,"[37, 98]"
4,30000029,2,5.0,10.0,2,1.0,1.0,0,1,2,2,"[20, 49]"


In [37]:
# Model inputs are the household columns minus the HOUSEID key. Deriving the list
# from household_feature_cols (same order) keeps the two from drifting apart.
feature_cols = [col for col in household_feature_cols if col != "HOUSEID"]

X = hh[feature_cols]
y_list = hh["MAKE_LIST"]   # one sorted list of integer make codes per household

In [39]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the label space from every household's make list, then encode each list
# as a 0/1 indicator row with one column per entry of mlb.classes_.
mlb = MultiLabelBinarizer()
mlb.fit(y_list)
Y = mlb.transform(y_list)

print("Classes:", mlb.classes_)
print("Y shape:", Y.shape)  # (n_households, n_makes)

Classes: [ 2  6  7 12 13 14 18 19 20 22 23 24 30 32 34 35 37 41 42 48 49 51 52 53
 54 55 58 59 63 72 76 98]
Y shape: (118653, 32)


In [41]:
# Hold out 20% of households for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [43]:
# Columns that are standardised in the preprocessing step.
# NOTE(review): HHFAMINC looks like a coded bracket rather than a raw amount --
# confirm that treating it as numeric is intended.
numeric_features = [
    "HHSIZE",
    "HHFAMINC",
    "WRKCOUNT",
    "DRVRCNT",
]

# Columns that are one-hot encoded in the preprocessing step.
categorical_features = [
    "LIF_CYC",
    "CENSUS_R",
    "HH_RACE",
    "HOMEOWN",
    "URBAN",
    "URBANSIZE",
]

In [45]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Column-wise preprocessing: standardise the numeric columns and one-hot encode
# the categorical ones; categories unseen at fit time encode as all zeros.
preprocess = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])

# Settings shared by every per-make forest.
rf_params = dict(
    n_estimators=300,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
base_rf = RandomForestClassifier(**rf_params)

# One independent binary forest per make.
# NOTE(review): n_jobs=-1 is requested both here and inside each forest; check
# that this nested parallelism does not oversubscribe the machine.
multi_rf = OneVsRestClassifier(estimator=base_rf, n_jobs=-1)

pipe = Pipeline(steps=[("preprocess", preprocess), ("clf", multi_rf)])

In [47]:
# Fit the preprocessing step and the one-vs-rest forests on the training split.
# As the cell's last expression, the fitted pipeline is also what gets rendered.
pipe.fit(X_train, Y_train)

0,1,2
,steps,"[('preprocess', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimator,RandomForestC...ndom_state=42)
,n_jobs,-1
,verbose,0

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,4
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [51]:
# Pull the fitted one-vs-rest classifier out of the pipeline and push the test
# features through the fitted preprocessing step by hand.
clf_step = pipe.named_steps["clf"]
X_test_trans = pipe.named_steps["preprocess"].transform(X_test)

# Each per-make estimator exposes predict_proba; column 1 is taken as
# P(household has this make). One column per estimator, in estimators_ order.
positive_probs = []
for est in clf_step.estimators_:
    positive_probs.append(est.predict_proba(X_test_trans)[:, 1])
probs_per_class = np.column_stack(positive_probs)

probs_per_class.shape  # (n_samples, n_classes)

(23731, 32)

In [53]:
# Column index of the highest-probability make for every test row ...
top1_idx = probs_per_class.argmax(axis=1)
# ... and the corresponding make codes, looked up in the binarizer's class order.
top1_makes = mlb.classes_[top1_idx]

top1_makes[:10]

array([12, 49, 49, 49, 20, 12, 12, 35, 20, 12])

In [55]:
# For every test row, look up the indicator at (row, predicted column): it is 1
# exactly when the top-ranked make is one of the makes recorded for that household.
# Integer-array indexing does this for all rows at once instead of a Python loop.
correct_flags = Y_test[np.arange(Y_test.shape[0]), top1_idx] == 1

# Share of test households whose top-ranked make is in their make set.
top1_in_set_accuracy = np.mean(correct_flags)
print("Top-1-in-set accuracy:", top1_in_set_accuracy)

Top-1-in-set accuracy: 0.2567527706375627


In [59]:
# Every make code across all households (train and test alike). Kept for
# reference; it is deliberately not used to choose the baseline below.
all_makes_flat = [m for makes in y_list for m in makes]

# Choose the baseline make from the training split only, so the "always predict
# the most common make" rule never sees the test labels it is scored on.
# Column sums of the indicator matrix give the number of training households per make.
train_make_counts = Y_train.sum(axis=0)
baseline_idx = int(np.argmax(train_make_counts))
most_common_make = mlb.classes_[baseline_idx]
print("Most common make:", most_common_make)
print("Index in ML-binarizer:", baseline_idx)

# A test household counts as correct when it has the baseline make.
correct_flags = Y_test[:, baseline_idx] == 1

baseline_top1_in_set = np.mean(correct_flags)
print("Baseline Top-1-in-set Accuracy:", baseline_top1_in_set)

Most common make: 12
Index in ML-binarizer: 3
Baseline Top-1-in-set Accuracy: 0.23450339218743416
