Data Cleaning

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [4]:
data_path = Path('census-bureau.data')
col_path  = Path('census-bureau.columns')

cols = [line.strip() for line in col_path.read_text(errors='ignore').splitlines() if line.strip()]

cols_clean = (pd.Series(cols).str.strip().str.lower().str.replace(r"[^a-z0-9]+", "_", regex=True).tolist())


df = pd.read_csv(data_path,header=None,names=cols_clean,skipinitialspace=True) 

if 'label' in df.columns:
    df['label'] = df['label'].astype(str).str.strip().str.rstrip('.')

print(df.shape)
df.head()

(199523, 42)


Unnamed: 0,age,class_of_worker,detailed_industry_recode,detailed_occupation_recode,education,wage_per_hour,enroll_in_edu_inst_last_wk,marital_stat,major_industry_code,major_occupation_code,...,country_of_birth_father,country_of_birth_mother,country_of_birth_self,citizenship,own_business_or_self_employed,fill_inc_questionnaire_for_veteran_s_admin,veterans_benefits,weeks_worked_in_year,year,label
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000


In [5]:
df.columns

Index(['age', 'class_of_worker', 'detailed_industry_recode',
       'detailed_occupation_recode', 'education', 'wage_per_hour',
       'enroll_in_edu_inst_last_wk', 'marital_stat', 'major_industry_code',
       'major_occupation_code', 'race', 'hispanic_origin', 'sex',
       'member_of_a_labor_union', 'reason_for_unemployment',
       'full_or_part_time_employment_stat', 'capital_gains', 'capital_losses',
       'dividends_from_stocks', 'tax_filer_stat',
       'region_of_previous_residence', 'state_of_previous_residence',
       'detailed_household_and_family_stat',
       'detailed_household_summary_in_household', 'weight',
       'migration_code_change_in_msa', 'migration_code_change_in_reg',
       'migration_code_move_within_reg', 'live_in_this_house_1_year_ago',
       'migration_prev_res_in_sunbelt', 'num_persons_worked_for_employer',
       'family_members_under_18', 'country_of_birth_father',
       'country_of_birth_mother', 'country_of_birth_self', 'citizenship',
       'ow

Data Processing - Missing value handle

In [6]:
# convert label to 1/0
df['label'] = (df['label'] == '50000+').astype(int)
df.replace("?", np.nan, inplace=True)

na_cols = df.columns[df.isna().any()].tolist()

print('Column with missing values',na_cols)

Column with missing values ['hispanic_origin', 'state_of_previous_residence', 'migration_code_change_in_msa', 'migration_code_change_in_reg', 'migration_code_move_within_reg', 'migration_prev_res_in_sunbelt', 'country_of_birth_father', 'country_of_birth_mother', 'country_of_birth_self']


In [7]:
df[na_cols] = df[na_cols].fillna('unknown')

In [10]:
(df[na_cols] == "unknown").sum().sum()

416591

Data Preprocessing for baseline model

In [7]:
# use the year to split the data - 94(train) and 95(test)
from sklearn.model_selection import train_test_split

y = df['label']
w = df['weight']

X = df.drop(columns=['label','weight'])

# build the test set
test_mask = X['year'] == 95

X_test = X.loc[test_mask].drop(columns=['year'])
y_test = y.loc[test_mask]
w_test = w.loc[test_mask]

# build the train test
trainval_mask = X['year'] == 94

X_94 = X.loc[trainval_mask].drop(columns=["year"])
y_94 = y.loc[trainval_mask]
w_94 = w.loc[trainval_mask]

X_train, X_val, y_train, y_val, w_train, w_val = train_test_split(X_94, y_94, w_94,test_size=0.2,
                                                                  random_state=42,stratify=y_94)

print('Train:',X_train.shape, 'Val:',X_val.shape,'Test(95)', X_test.shape)
print('Train y%', y_train.value_counts(normalize=True).round(4))
print('Val y%', y_val.value_counts(normalize=True).round(4))
print('Test y%', y_test.value_counts(normalize=True).round(4))


Train: (79861, 39) Val: (19966, 39) Test(95) (99696, 39)
Train y% label
0    0.9415
1    0.0585
Name: proportion, dtype: float64
Val y% label
0    0.9415
1    0.0585
Name: proportion, dtype: float64
Test y% label
0    0.9344
1    0.0656
Name: proportion, dtype: float64


Baseline Model

In [9]:
numerical_col = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_col = [col for col in X_train.columns if col not in numerical_col]

numerical_tran = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('sclar', StandardScaler(with_mean=False))
])

cate_tran = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore',  sparse_output=True))
])

preprocess = ColumnTransformer(transformers=[
    ('num',numerical_tran,numerical_col),
    ('cat',cate_tran,categorical_col),],
    remainder='drop')

preprocess.fit(X_train)
n_features = preprocess.transform(X_train.iloc[:5]).shape[1]
print('Fianl feature count', n_features)

log_reg = LogisticRegression()

baseline_model = Pipeline(steps=[
    ('preporcess',preprocess),
    ('logreg',log_reg)
])

baseline_model.fit(X_train,y_train,logreg__sample_weight = w_train)


proba_val = baseline_model.predict_proba(X_val)[:, 1]
pred_val = (proba_val >= 0.5).astype(int)

print("\nWeighted validation metrics (LogReg baseline):")
print("accuracy :", accuracy_score(y_val, pred_val, sample_weight=w_val))
print("precision:", precision_score(y_val, pred_val, sample_weight=w_val))
print("recall   :", recall_score(y_val, pred_val, sample_weight=w_val))
print("f1       :", f1_score(y_val, pred_val, sample_weight=w_val))
print("roc_auc  :", roc_auc_score(y_val, proba_val, sample_weight=w_val))

cm = confusion_matrix(y_val, pred_val, sample_weight=w_val)
print("\nWeighted confusion matrix [[TN, FP],[FN, TP]]:")
print(cm)

Fianl feature count 389

Weighted validation metrics (LogReg baseline):
accuracy : 0.9548147958276519
precision: 0.7283222226373169
recall   : 0.4084099405205401
f1       : 0.5233493786271743
roc_auc  : 0.9442860360723193

Weighted confusion matrix [[TN, FP],[FN, TP]]:
[[32035699.94000005   318739.25      ]
 [ 1237740.37         854486.08      ]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Try to tune the decision to max out threshold for F1