In [1]:
import os
import time
import json
import utils
import parse
import fasttext
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

suitable_path = "/home/peterr/macocu/task5_webgenres/data/original/dataset/dataset.json"
nonsuitable_path = "/home/peterr/macocu/task5_webgenres/data/original/dataset/not_suitable_dataset.json"

train_path = "/home/peterr/macocu/task5_webgenres/data/interim/train_1d.csv"
test_path = "/home/peterr/macocu/task5_webgenres/data/interim/test_1d.csv"
dev_path = "/home/peterr/macocu/task5_webgenres/data/interim/dev_1d.csv"

train_ids = pd.read_csv(train_path)["id"].unique().tolist()
test_ids = pd.read_csv(test_path)["id"].unique().tolist()
dev_ids = pd.read_csv(dev_path)["id"].unique().tolist()


with open(suitable_path) as f:
    suitable_content = json.load(f)
with open(nonsuitable_path) as f:
    nonsuitable_content = json.load(f)

def first_downcasting(s: str)->str:
    first = {"Script/Drama":"Other", "Lyrical":"Other","FAQ":"Other"}
    return first.get(s, s)
def second_downcasting(s: str)->str:
    second = {"Recipe":"Instruction", "Research Article":"Information/Explanation", "Review":"Opinion/Argumentation", "Promotion of Services":"Promotion", "Promotion of a Product":"Promotion", "Invitation":"Promotion", "Correspondence":"Other", "Prose":"Other", "Call":"Other"}
    return second.get(s, s)
def get_domain(url:str) -> str:
    pattern = "{protocol}://{domain}/{rest}"
    p = parse.compile(pattern)
    modified_pattern = "{protocol}://{domain}/"
    mp = parse.compile(modified_pattern)

    try:
        parse_result = p.parse(url)
        domain = parse_result["domain"]
    except TypeError:
        parse_result = mp.parse(url)
        domain = parse_result["domain"]
    return domain

def assign_split(doc_id: str) -> str:
    if doc_id in train_ids:
        return "train"
    elif doc_id in dev_ids:
        return "dev"
    elif doc_id in test_ids:
        return "test"
    else:
        raise ValueError(f"ID {doc_id} was not found in train, dev, or test split!")



In [3]:
counter = 0
for i in suitable_content:
    if i["primary"] == "List of Summaries/Excerpts":
        if i["secondary"] != "":
            counter += 1
print(counter)

57


# Suitable DS

In [4]:
typo_dict = {
    "Promotion of services": "Promotion of Services",
    "Opinionated news": "Opinionated News",
    "Research article": "Research Article",
    "Promotion of a product": "Promotion of a Product"
}
for doc in suitable_content:
    # Manual intervention as agreed 2021-11-07:
    if doc["primary"] == "List of Summaries/Excerpts":
        doc["secondary"] = ""
    # Adding downcasted labels
    for label in "primary,secondary,tertiary".split(","):
        p = doc[label]
        p = typo_dict.get(p, p)
        doc[f"{label}_level_1"] = p
        doc[f"{label}_level_2"] = first_downcasting(p)
        doc[f"{label}_level_3"] = second_downcasting(first_downcasting(p))
        del doc[label]
    # Adding split
    doc["split"] = assign_split(doc["id"])
    # Adding domain:
    doc["domain"] = get_domain(doc["url"])
    
    # Validation
    assert len(doc["paragraphs"]) > 0, "Paragraphs is an empty list."
    assert (doc["primary_level_2"] in utils.list_of_categories_matrix) or (doc["primary_level_2"] == ""), f"Weird: {doc['primary_level_2']}"
    assert doc["url"] != "" and isinstance(doc["url"], str)





Let's check the first element:

In [5]:
suitable_content[0]

{'id': '3949',
 'url': 'http://www.pomurje.si/aktualno/sport/zimska-liga-malega-nogometa/',
 'crawled': '2014',
 'hard': False,
 'paragraphs': [{'text': 'Šport', 'duplicate': False, 'keep': True},
  {'text': 'Zimska liga malega nogometa sobota, 12.02.2011',
   'duplicate': False,
   'keep': True},
  {'text': 'avtor: Tonček Gider', 'duplicate': False, 'keep': True},
  {'text': "V 7. krogu zimske lige v malem nogometu v Križevcih pri Ljutomeru je v prvi ligi vodilni 100 plus iz Križevec izgubil s tretjo ekipo na lestvici Rock'n roll iz Križevec z rezultatom 1:2, druga na lestvici Top Finedika iz Križevec je bila poražena z ekipo Bar Milene iz Ključarovec z rezultatom 7:8. V drugi križevski ligi je vodilni Cafe del Mar iz Vučje vasi premagal Montažo Vrbnjak iz Stare Nove vasi z rezultatom 3:2.",
   'duplicate': False,
   'keep': True},
  {'text': 'oglasno sporočilo', 'duplicate': False, 'keep': True},
  {'text': 'Ocena', 'duplicate': False, 'keep': True},
  {'text': 'Komentiraj Za komenti

Creating the directory for the dataset

In [6]:
!mkdir /home/peterr/macocu/task5_webgenres/data/finished_dataset

mkdir: cannot create directory ‘/home/peterr/macocu/task5_webgenres/data/finished_dataset’: File exists


Writing the results to a file:

In [7]:
with open("/home/peterr/macocu/task5_webgenres/data/finished_dataset/suitable.json", "w") as f:
    json.dump(suitable_content, f)

# Nonsuitable

First we need to perform a train, dev, test split. We will perform a train:devtest split and then dev:test split. We will stratify the splitting by labels

In [49]:
df = pd.DataFrame(nonsuitable_content)
df.head()

Unnamed: 0,id,url,crawled,primary,secondary,paragraphs
0,1237,http://www.sport-tv.si/d164779/Nogomet/Repreze...,2014,Encoding Issues,,"[{'text': 'Reprezentance', 'duplicate': False}..."
1,3280,http://revija-prijatelj.rkc.si/032002.htm,2014,Too Long,,"[{'text': '"" Na načelih si boš samo zobe polom..."
2,3274,http://www.sport-tv.si/contentList/Posts/13/to...,2014,Encoding Issues,,[{'text': 'V zadnji tekmi 10. kola italijanske...
3,2992,http://www.telos.si/viewcast/niagara/gostream.htm,2014,Machine Translation,,[{'text': 'Niagara GoStream - zajem in strujan...
4,3367,http://www.pcplus.si/periferija-in-dodatki/dod...,2014,Encoding Issues,,"[{'text': 'Opis izdelka:', 'duplicate': False}..."


In [53]:
from sklearn.model_selection import train_test_split

train, devtest = train_test_split(df, test_size=0.4, random_state=44, stratify=df.primary)
test, dev = train_test_split(devtest, test_size=0.5, random_state=100, stratify=devtest.primary)

train["split"] = "train"
dev["split"] = "dev"
test["split"] = "test"

df = pd.concat([train, dev, test], ignore_index=True)
df.head()


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

## Solution: put the unsplittable labels in TEST (as agreed in Skype meeting 2021-11-25T13:57:08).

In [65]:
dtgb = devtest.groupby("primary").count()
dtgb

Unnamed: 0_level_0,id,url,crawled,secondary,paragraphs
primary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Boilerplate,1,1,1,1,1
Encoding Issues,3,3,3,3,3
Generated Text,1,1,1,1,1
HTML Source Code,1,1,1,1,1
Machine Translation,9,9,9,9,9
Multiple Texts,3,3,3,3,3
Non-textual,14,14,14,14,14
Not Slovene,7,7,7,7,7
Too Long,2,2,2,2,2
Too Short/Incoherent,9,9,9,9,9


Let's determine which labels are unsplittable:

In [66]:
dtgb = devtest.groupby("primary").count()
dtgb.loc[dtgb.url == 1].index

Index(['Boilerplate', 'Generated Text', 'HTML Source Code'], dtype='object', name='primary')

In [69]:
labels_to_put_in_test = ['Boilerplate', 'Generated Text', 'HTML Source Code']

buffer = devtest.loc[devtest.primary.isin(labels_to_put_in_test),:].copy()
devtest = devtest.loc[~devtest.primary.isin(labels_to_put_in_test),:].copy().reset_index(drop=True)
test, dev = train_test_split(devtest, test_size=0.5, random_state=100, stratify=devtest.primary)


test = pd.concat([test, buffer], ignore_index=True)

train["split"] = "train"
dev["split"] = "dev"
test["split"] = "test"

df = pd.concat([train, dev, test], ignore_index=True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["split"] = "train"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dev["split"] = "dev"


Unnamed: 0,id,url,crawled,primary,secondary,paragraphs,split
0,1237,http://www.sport-tv.si/d164779/Nogomet/Repreze...,2014,Encoding Issues,,"[{'text': 'Reprezentance', 'duplicate': False}...",train
1,486846,https://trgovina.troika.si/brother-toner?p=2,2021,Non-textual,,[{'text': 'Brezplačna dostava za vsa naročila....,train
2,32960728,http://sl.zd-grosuplje.si/dejavnosti-in-sluzbe...,2021,Non-textual,,"[{'text': 'Tinkara TRKAJ, dr. med., spec. pedi...",train
3,10710263,http://fonti.gifmania.si/L-Fonti/Loveness.htm,2021,Machine Translation,,"[{'text': 'Loveness', 'duplicate': False}, {'t...",train
4,3776987,http://locutio.si/avtorji.php?ID=145,2021,Non-textual,,"[{'text': 'Dušan Ludvik', 'duplicate': False},...",train


In [70]:
df.shape

(123, 7)

In [84]:
cols = df.columns
nonsuitable_content = list()
for row in df.iterrows():
    doc = dict()
    for col in cols:
        doc[col] = row[1][col]
    nonsuitable_content.append(doc)

In [85]:
nonsuitable_content[0]

{'id': '1237',
 'url': 'http://www.sport-tv.si/d164779/Nogomet/Reprezentance/Gliha_Za_uspeh_bo_treba_umreti_in....html',
 'crawled': '2014',
 'primary': 'Encoding Issues',
 'secondary': '',
 'paragraphs': [{'text': 'Reprezentance', 'duplicate': False},
  {'text': 'Gliha: Za uspeh bo treba " umreti " in...', 'duplicate': False},
  {'text': 'Mladinska slovenska nogometna reprezentanca bo danes odigrala Å ¡ e zadnje sreÄanje v uvodnih kvalifikacijah za evropsko prvenstvo leta 2013, ki bo v Litvi. Äeta Primoža Glihe je v skupini 10 trenutno sicer na zadnjem mestu, toda ob primernem razpletu na obeh dvobojih tretjega kola (beri: zmagah Slovenije in Rusije) si Å ¡ e lahko zagotovi drugo mesto ter s tem napredovanje v zakljuÄni del bojev za sedem mest, ki bodo naposled vodila na turnir stare celine selekcij do 19 let.',
   'duplicate': False},
  {'text': '15. oktober 2012. ob 01:38 | zadnja sprememba: 15. oktober 2012. ob 01:56',
   'duplicate': False},
  {'text': 'V dosedanjem poteku turnirj

## Putting in together

We are finally ready to do the same preprocessing as before (note that we do not have a tertiary label here.)

In [86]:
for doc in nonsuitable_content:
    # Adding downcasted labels
    for label in "primary,secondary".split(","):
        p = doc[label]
        doc[f"{label}_level_1"] = p

        # We do not downcast unsuitable labels:

        #doc[f"{label}_level_2"] = first_downcasting(p)
        #doc[f"{label}_level_3"] = second_downcasting(first_downcasting(p))
        del doc[label]
    # Adding split
    # We already have the split information.
    #doc["split"] = assign_split(doc["id"])
    # Adding domain:
    doc["domain"] = get_domain(doc["url"])
    
    # Validation
    assert len(doc["paragraphs"]) > 0, "Paragraphs is an empty list."
    #assert (doc["primary_level_2"] in utils.list_of_categories_matrix) or (doc["primary_level_2"] == ""), f"Weird: {doc['primary_level_2']}"
    assert doc["url"] != "" and isinstance(doc["url"], str)

with open("/home/peterr/macocu/task5_webgenres/data/finished_dataset/nonsuitable.json", "w") as f:
    json.dump(nonsuitable_content, f)

In [87]:
nonsuitable_content[0]

{'id': '1237',
 'url': 'http://www.sport-tv.si/d164779/Nogomet/Reprezentance/Gliha_Za_uspeh_bo_treba_umreti_in....html',
 'crawled': '2014',
 'paragraphs': [{'text': 'Reprezentance', 'duplicate': False},
  {'text': 'Gliha: Za uspeh bo treba " umreti " in...', 'duplicate': False},
  {'text': 'Mladinska slovenska nogometna reprezentanca bo danes odigrala Å ¡ e zadnje sreÄanje v uvodnih kvalifikacijah za evropsko prvenstvo leta 2013, ki bo v Litvi. Äeta Primoža Glihe je v skupini 10 trenutno sicer na zadnjem mestu, toda ob primernem razpletu na obeh dvobojih tretjega kola (beri: zmagah Slovenije in Rusije) si Å ¡ e lahko zagotovi drugo mesto ter s tem napredovanje v zakljuÄni del bojev za sedem mest, ki bodo naposled vodila na turnir stare celine selekcij do 19 let.',
   'duplicate': False},
  {'text': '15. oktober 2012. ob 01:38 | zadnja sprememba: 15. oktober 2012. ob 01:56',
   'duplicate': False},
  {'text': 'V dosedanjem poteku turnirja, ki se odvija na naših tleh, bolje reÄeno v Bak