# Snorkel on IMDb

In [1]:
%pip install snorkel datasets scikit-learn pandas

Collecting snorkel
  Downloading snorkel-0.10.0-py3-none-any.whl.metadata (9.5 kB)
Collecting munkres>=1.0.6 (from snorkel)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Downloading snorkel-0.10.0-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: munkres, snorkel
Successfully installed munkres-1.1.4 snorkel-0.10.0


# 1. Load and Explore the IMDb Dataset

In [2]:
from datasets import load_dataset
import pandas as pd

# Load 2000 training and 500 test examples for speed​

imdb = load_dataset("imdb")
train = pd.DataFrame(imdb["train"].select(range(2000)))
test = pd.DataFrame(imdb["test"].select(range(500)))
print("Train size:", len(train), "Test size:", len(test))
train.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Train size: 2000 Test size: 500


Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


#2. Preprocess Text

In [3]:
import re
def clean_text(text):
    text = re.sub(r"<br\s*/?>", " ", text)
    text = re.sub(r"[^\w\s']", "", text)
    return text.lower()

train["text"] = train["text"].apply(clean_text)
test["text"] = test["text"].apply(clean_text)


#3. Define Labeling Functions (LFs)

Create simple heuristics:

  -  LF_positive: labels text as positive if it contains strong positive words.​​
   - LF_negative: labels text as negative if it contains strong negative words
  -  LF_exclaim: positive if contains “!” more than 2 times



In [4]:
from snorkel.labeling import labeling_function, LFAnalysis
# from snorkel.labeling import LabelModel
from snorkel.labeling.model.label_model import LabelModel

ABSTAIN, NEG, POS = -1, 0, 1
positive_words = {"great","excellent","amazing","wonderful","best","fantastic"}
negative_words = {"bad","terrible","awful","worst","boring","poor"}

@labeling_function()
def lf_positive(x):
    return POS if any(w in x.text.split() for w in positive_words) else ABSTAIN

@labeling_function()
def lf_negative(x):
    return NEG if any(w in x.text.split() for w in negative_words) else ABSTAIN

@labeling_function()
def lf_exclaim(x):
    return POS if x.text.count("!") > 2 else ABSTAIN
lfs = [lf_positive, lf_negative, lf_exclaim]

# Analyze LF Coverage & Conflicts

In [5]:
from snorkel.labeling import PandasLFApplier
applier = PandasLFApplier(lfs)
L_train = applier.apply(train)
LFAnalysis(L_train, lfs).lf_summary()

100%|██████████| 2000/2000 [00:00<00:00, 3421.70it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_positive,0,[1],0.321,0.1795,0.1795
lf_negative,1,[0],0.5635,0.1795,0.1795
lf_exclaim,2,[],0.0,0.0,0.0


#4. Train the LabelModel

In [6]:
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=42)

# Get probabilistic labels​
train_probs = label_model.predict_proba(L_train)
train_preds = label_model.predict(L_train)

100%|██████████| 500/500 [00:00<00:00, 857.51epoch/s]



#5. Train an End-to-End Classifier

Use a simple logistic regression on TF-IDF features:


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Vectorize​

vectorizer = TfidfVectorizer(max_features=5_000)
X_train = vectorizer.fit_transform(train["text"])
y_train = train_preds

# Fit classifier​

clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Evaluate on test set​

X_test = vectorizer.transform(test["text"])
y_test = test["label"]
preds = clf.predict(X_test)
print(classification_report(y_test, preds, target_names=["neg","pos","neu"]))

              precision    recall  f1-score   support

         neg       0.00      0.00      0.00         0
         pos       1.00      0.81      0.90       500
         neu       0.00      0.00      0.00         0

    accuracy                           0.81       500
   macro avg       0.33      0.27      0.30       500
weighted avg       1.00      0.81      0.90       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
