In [1]:
# !pip install snorkel
# !pip install pandas

In [2]:
import pandas as pd

train_df = pd.read_pickle("./train_fake_jobs.pkl")
test_df = pd.read_pickle("./test_fake_jobs.pkl")

In [3]:
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis

In [4]:
FAKE = 1
REAL = 0
ABSTAIN = -1

In [5]:
FAKE_COMPANIES = [
    "Aker Solutions",
    "Aptitude Staffing Solutions",
    "Gary Cartwright",
    "Edison International and Refined Resources",
    "Le Meridien",
]

In [6]:
@labeling_function()
def no_requirements(x: pd.Series):

    return FAKE if x.requirements == "" else ABSTAIN


@labeling_function()
def requirements_less_than_10(x: pd.Series):
    num_words = len(x.requirements.split(" "))
    return FAKE if num_words <= 10 else ABSTAIN


@labeling_function()
def requirements_less_than_20(x: pd.Series):
    num_words = len(x.requirements.split(" "))
    return FAKE if num_words <= 20 else ABSTAIN


@labeling_function()
def no_company_profile(x: pd.Series):
    return FAKE if x.company_profile == "" else ABSTAIN


@labeling_function()
def no_company_logo(x: pd.Series):
    return FAKE if x.has_company_logo == 0 else ABSTAIN


@labeling_function()
def suspicious_company(x: pd.Series):
    return (
        FAKE
        if any(
            company.lower() in x.company_profile.lower() for company in FAKE_COMPANIES
        )
        else ABSTAIN
    )


@labeling_function()
def has_background_check(x: pd.Series):
    return REAL if "background check" in x.requirements else ABSTAIN


@labeling_function()
def required_experience(x: pd.Series):
    return REAL if pd.notna(x.required_experience) else ABSTAIN


@labeling_function()
def required_education(x: pd.Series):
    return REAL if pd.notna(x.required_education) else ABSTAIN

In [7]:
lfs = [
    no_company_profile,
    suspicious_company,
    no_company_logo,
    has_background_check,
    required_experience,
    required_education,
]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=train_df)

100%|██████████| 13410/13410 [00:02<00:00, 5811.07it/s]


In [13]:
# 이거 왜 에러 걸리지?
# LFAnalysis(L=L_train, lfs=lfs).lf_summary(Y=train_df.fraudulent.values)

In [14]:
from snorkel.analysis import get_label_buckets

buckets = get_label_buckets(L_train[:, 2], L_train[:, 4])

res = train_df.iloc[buckets[(FAKE, REAL)]].sample(10, random_state=1)[
    ["has_company_logo", "required_experience", "fraudulent"]
]

res

Unnamed: 0,has_company_logo,required_experience,fraudulent
16877,0,Mid-Senior level,0
17068,0,Mid-Senior level,0
16816,0,Mid-Senior level,0
12186,0,Not Applicable,0
16808,0,Mid-Senior level,0
678,0,Not Applicable,0
3706,0,Entry level,0
14150,0,Entry level,0
2199,0,Entry level,0
4411,0,Entry level,0


In [15]:
from snorkel.labeling.model import MajorityLabelVoter

majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

In [16]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=1)

In [17]:
L_test = applier.apply(df=test_df)

100%|██████████| 4470/4470 [00:00<00:00, 5838.99it/s]


In [18]:
Y_train = train_df["fraudulent"]
Y_test = test_df["fraudulent"]

In [19]:
majority_acc = majority_model.score(
    L=L_test,
    Y=Y_test,
)["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")



Majority Vote Accuracy:   80.8%


In [20]:
label_model_acc = label_model.score(L=L_test, Y=Y_test)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")



Label Model Accuracy:     71.9%
