In [1]:
import srsly
import os
os.chdir('..')

ModuleNotFoundError: No module named 'srsly'

In [2]:
import spacy
nlp = spacy.blank('en')
ruler = nlp.add_pipe('entity_ruler')
ruler.from_disk('assets/patterns.jsonl')

ModuleNotFoundError: No module named 'spacy'

In [3]:
from collections import Counter
from tqdm.notebook import tqdm
from pathlib import Path
from typing import List, Dict
import pandas as pd


def load_data(input_path: Path) -> List[Dict]:
    data = []
    items = srsly.read_jsonl(input_path)
    text_items = ((item["text"], item) for item in items)
    for doc, item in tqdm(nlp.pipe(text_items, as_tuples=True)):
        example = {}
        example["text"] = item["text"]
        example["y"] = item["label"]
        example.update(Counter([ent.label_ for ent in doc.ents]))
        data.append(example)

    df = pd.DataFrame(data)

    df["N_ENTS"] = df[["WIKI", "ENVO", "LINKEDSDG", "INDUSTRY", "SUBSTANCE"]].sum(
        axis=1
    )
    df["LEN"] = df.text.str.len()
    df["ENT/CHAR"] = df["N_ENTS"] / df["LEN"]

    return df


In [4]:
df = load_data('corpus/sentences_train.jsonl')
df.sort_values('ENT/CHAR')

0it [00:00, ?it/s]

Unnamed: 0,text,y,WIKI,LINKEDSDG,ENVO,INDUSTRY,SUBSTANCE,N_ENTS,LEN,ENT/CHAR
4288,PoliticsThe Biden PresidencyFacts FirstBusines...,,3.0,,,,,3.0,774,0.003876
5821,Find Us on FacebookWohltätige Spenden„Spenden“...,,5.0,,,,,5.0,928,0.005388
451,CatalogMenuThis page in:EnglishEspañolFrançais...,,,2.0,,,,2.0,298,0.006711
5146,Lecker dank Frost Wunderlauch Haselnüsse samme...,,1.0,,1.0,,,2.0,237,0.008439
5380,BBC HomepageSkip to contentAccessibility HelpY...,,3.0,,,,1.0,4.0,458,0.008734
...,...,...,...,...,...,...,...,...,...,...
729,Hosting a Blood Drive,,2.0,,1.0,,,3.0,21,0.142857
3827,Leave a Reply,,2.0,,,,,2.0,13,0.153846
2590,A decade of rising poverty,,3.0,1.0,,,,4.0,26,0.153846
732,Hosting a Blood Drive FAQ,,3.0,,1.0,,,4.0,25,0.160000


In [5]:
df_dev = load_data('corpus/sentences_dev.jsonl')

0it [00:00, ?it/s]

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

In [7]:
def build_input(df):
    X = df[['WIKI', 'ENVO', 'LINKEDSDG', 'INDUSTRY', 'SUBSTANCE', 'N_ENTS', 'ENT/CHAR', 'LEN']].fillna(0)
    y = (df.y == 'INTERESTING').astype(int)
    return X, y

import numpy as np
X, y = build_input(df)
class_weights = dict(zip([0, 1], len(y) / (2 * np.bincount(y))))
clf = make_pipeline(StandardScaler(), SVC(gamma='auto', class_weight=class_weights))
clf.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc',
                 SVC(class_weight={0: 0.5313115047414564, 1: 8.484285714285715},
                     gamma='auto'))])

In [8]:
from sklearn.metrics import precision_recall_fscore_support
X_dev, y_dev = build_input(df_dev)
precision_recall_fscore_support(clf.predict(X_dev), y_dev)

(array([0.43658434, 0.66442953]),
 array([0.95425435, 0.06851211]),
 array([0.59908099, 0.12421581]),
 array([1093, 1445]))