In [1]:
from snorkel.labeling import labeling_function 
import pandas as pd
import pickle

In [2]:
from sklearn.model_selection import train_test_split

# In this case X are the lines and y is the output (A heading = 1, No heading = 0)
df = pd.read_csv('csvs/total_df.csv', index_col=0)
df = df.dropna()
X_train, X_test, y_train, y_test = train_test_split(df['parsed_line'], df['heading_bool'], test_size=0.2, random_state=42)
df_train_splitted, df_test_splitted = train_test_split(df, test_size=0.2)


In [3]:
# For clarity, we define constants to represent the class labels for spam, ham, and abstaining.
TEXT = 0
HEADER = 1
# TODO: Later use? 
EQUATION = 2
NOTABLE = -1
@labeling_function()
def contains_abstract(x):
    # print(x)
    try:
        return HEADER if "abstract" in x.text.lower() else TEXT
    except AttributeError:
        return NOTABLE

In [4]:
#Apply the defined labeling functions to our train data
from snorkel.labeling import PandasLFApplier

# Add labeling functions to list
lfs = [contains_abstract]

applier = PandasLFApplier(lfs=lfs)
df_train = df_train_splitted.rename(columns={'parsed_line': 'text'})
# df_train = pd.DataFrame()
# df_train['text'] = X_train
# df_train['HeadingBool'] = y_train
df_train['text'].astype("string")
df_train = df_train.dropna()
L_train = applier.apply(df=df_train)
df_train

100%|██████████| 4560/4560 [00:00<00:00, 61523.01it/s]


Unnamed: 0,text,heading_bool,source_paper,math_bool,latin_numbered_bool,numberic_numbered_bool
13916,n=1exp−6∗6n\n,False,Language_Modeling_With_Dynamic_Bayesian_Networ...,False,False,True
16478,better substitution generation than PPDB and S...,False,LSBert A Simple Framework for Lexical,False,False,True
20737,diﬀerent conﬁgurations and customer needs. It ...,False,sbbd_shp_07,False,False,True
1210,The field of artificial intelligence (AI) in c...,False,29020-59012-1-PB,False,False,True
14582,nine of the subsets should be combined to form...,False,lfw_paper,False,False,False
...,...,...,...,...,...,...
21422,.\n,False,Systematic Design of a Transimpedance Amplifier,False,False,False
3114,with a CMRR that is not affected by inequaliti...,False,A_Broadband_High_Common_Mode_Rejection_Ratio_I...,False,False,False
5618,effectiveness of the proposed method on sentim...,False,Context-Based_Feature_Technique_for_Sarcasm_Id...,False,False,False
12839,such as AI have become useful in identifying c...,False,ijerph-18-05686-v2,False,False,True


In [5]:
coverage_abstract = (L_train).mean(axis=0)
coverage_abstract
# print(f"check coverage: {coverage_abstract * 100:.1f}%")
# L_train

array([0.00328947])

In [6]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
contains_abstract,0,"[0, 1]",1.0,0.0,0.0


In [7]:
#Show some examples where a labeling function decides its a spam comment
# After seeing this we should change the function to if the sentence starts with the word "abstract"
# df_found_headers = df_train.iloc[L_train == NOTABLE]
# correct = len(df_found_headers[df_found_headers['HeadingBool'] == 1])
# all = len(df_found_headers)
# print(correct/all)  
# df_found_headers

### Try to improve it:

In [23]:
from snorkel.preprocess import preprocessor
from textblob import TextBlob
import re

@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    x.subjectivity = scores.sentiment.subjectivity
    return x

# TODO: Check if usefull
# @labeling_function(pre=[textblob_sentiment])
# def textblob_polarity(x):
#     return SPAM if x.polarity > 0.9 else ABSTAIN
  
# @labeling_function(pre=[textblob_sentiment])
# def textblob_subjectivity(x):
#     return HAM if x.subjectivity >= 0.5 else ABSTAIN

@labeling_function()
def starts_with_int_and_dot(x):
    """Biggest labeling functions, gets out most of the headers that start with numberic numbering'"""
    def has_numbers(inputString):
        return bool(re.search(r'\d', inputString))
    for i in range(1, 12, 1):
        startwith_str = str(i) + ". "
        if x.text.lower().startswith(startwith_str) and "," not in x.text.lower():
            if has_numbers(x.text.lower().split(startwith_str)[1]):
                return TEXT
            else:
                return HEADER
    return TEXT 

@labeling_function()
def starts_with_latinint_and_dot(x):
    """Do the same for romain/latin numbering'"""
    def has_numbers(inputString):
        return bool(re.search(r'\d', inputString))
    for latin_int_str in ['I.', 'II.', 'III.', 'IV.', 'V.', 'VI.', 'VII.', 'VIII.', 'IX.', 'X.', 'XI.']:
        startwith_str = latin_int_str
        if x.text.startswith(startwith_str) and "," not in x.text:
            return HEADER
    return TEXT

@labeling_function()
def starts_with_latinint(x):
    """Do the same for romain/latin numbering without .'"""
    def has_numbers(inputString):
        return bool(re.search(r'\d', inputString))
    for latin_int_str in ['I ', 'II ', 'III ', 'IV ', 'V ', 'VI ', 'VII ', 'VIII ', 'IX ', 'X ', 'XI ']:
        startwith_str = latin_int_str
        if x.text.startswith(startwith_str) and "," not in x.text:
            return HEADER
    return TEXT

@labeling_function()
def start_int_look_for_words(x):
    header_words = ['introduction', 'discussion', 'conclusion', 'i ntroduction', 'd iscussion', 'c onclusion']
    """Biggest labeling functions, gets out most of the headers that start with numberic numbering'"""
    def has_numbers(inputString):
        return bool(re.search(r'\d', inputString))
    for i in range(1, 12, 1):
        startwith_str = str(i) + " "
        if x.text.lower().startswith(startwith_str) and "," not in x.text.lower():
            for header_word in header_words:
                if header_word in x.text.lower().split(startwith_str)[1]:
                    return HEADER
    return TEXT 
    

# @labeling_function()
# def starts_with_romainInt_and_dot(x):
#     """Ham comments are often short, such as 'cool video!'"""
#     try:
#         return HEADER if x.text.lower().startswith("I. ") else TEXT
#     except AttributeError:
#         return NOTABLE

@labeling_function()
def start_abstract(x):
    try:
        return HEADER if x.text.lower().startswith("abstract\n") or x.text.lower().startswith("abstract ") or x.text.lower().startswith("abstract-") or x.text.lower().startswith("abstract.") else TEXT
    except AttributeError:
        return NOTABLE

# @labeling_function()
# def start_introduction(x):
#     try:
#         return HEADER if x.text.lower().startswith("introduction") else TEXT
#     except AttributeError:
#         return NOTABLE

# @labeling_function() 
# def please_comment(x):
#     return SPAM if any(word in x.text.lower() for word in ['please', 'pls', 'plz']) else ABSTAIN

In [24]:
df_test = df_test_splitted.rename(columns={'parsed_line' : 'text'})
# df_test['text'] = X_test
# df_test['HeadingBool'] = y_test
df_test['text'].astype("string")
df_test = df_test.dropna()
# lfs = [starts_with_int_and_dot, starts_with_latinint_and_dot, starts_with_romainInt_and_dot, start_abstract, start_introduction]
lfs = [starts_with_int_and_dot, starts_with_latinint_and_dot, start_abstract, start_int_look_for_words, starts_with_latinint]
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)
L_test = applier.apply(df=df_test)

100%|██████████| 4560/4560 [00:01<00:00, 2919.14it/s]
100%|██████████| 4560/4560 [00:01<00:00, 2987.50it/s]


In [30]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()
df_found_headers = df_train.iloc[L_train == HEADER]
# # df_found_headers
print(len(df_train[df_train['heading_bool'] == 1]))
print(len(df_found_headers[df_found_headers['heading_bool'] == 1]))
# df_found_headers[df_found_headers['heading_bool'] == 0]
df_train[df_train['heading_bool'] == 1].sample(10)
df_found_headers[df_found_headers['heading_bool'] == 0].sample(10)
# len(df[df['heading_bool'] == 0])
# df[df['heading_bool'] == 0]
len(df_train[df_train['heading_bool'] == 1])

85
40


85

In [11]:
#Train the Snorkel Label Model (Noisy Labels as Input)
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=8.402]
  8%|▊         | 38/500 [00:00<00:01, 375.22epoch/s]INFO:root:[100 epochs]: TRAIN:[loss=0.005]
 31%|███       | 155/500 [00:00<00:00, 835.40epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.002]
INFO:root:[300 epochs]: TRAIN:[loss=0.001]
 60%|██████    | 301/500 [00:00<00:00, 1043.63epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.001]
100%|██████████| 500/500 [00:00<00:00, 1037.23epoch/s]
INFO:root:Finished Training


In [12]:
#Evaluate Label Model Performance
label_model_acc = label_model.score(L=L_test, Y=y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

Label Model Accuracy:     98.2%


In [158]:
#Preprocessing for model training
from snorkel.labeling import filter_unlabeled_dataframe
from sklearn.feature_extraction.text import CountVectorizer
from snorkel.utils import probs_to_preds

probs_train = label_model.predict_proba(L=L_train)

#Use only training data for which labels have been created
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)

vectorizer = CountVectorizer(ngram_range=(1, 5))
X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
X_test = vectorizer.transform(df_test.text.tolist())

preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

In [159]:
print(f"Number of train instances: {X_train.shape[0]}")

Number of train instances: 66732


In [160]:
#Train a simple regression model

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier().fit(X=X_train, y=preds_train_filtered)


ValueError: y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.

In [None]:
print(f"Test Accuracy: {gb_model.score(X=X_test, y=y_test) * 100:.1f}%")

Test Accuracy: 98.2%


In [None]:
preds_train_filtered

array([0, 0, 0, ..., 0, 0, 0])