#Model training for recognising press article titles
*Author: Nada Lasri*

This notebook aims at train models in several languages (mainly French) for recognising article titles and links

In [1]:
import requests 
import re
from bs4 import BeautifulSoup
import datetime
import urllib
import requests
import lxml.html
import json
import csv
import sys
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
import torch
from transformers import CamembertModel
from transformers import CamembertTokenizer
from transformers import glue_convert_examples_to_features, InputExample

import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
#Add any needed library for installation
!pip install sentencepiece
!pip install hydra-core
!pip install omegaconf
!pip install simpletransformers
!pip install bs4
!pip install lxml
!pip install tensorflow-gpu
!pip install tensorflow==2.4.1
!pip install tqdm==4.41.1
!pip install tokenizers==0.9.4

Collecting tqdm>=4.47.0
  Downloading tqdm-4.60.0-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 8.0 MB/s  eta 0:00:01
  Using cached tqdm-4.49.0-py2.py3-none-any.whl (69 kB)




Installing collected packages: tqdm
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.41.1
    Uninstalling tqdm-4.41.1:
      Successfully uninstalled tqdm-4.41.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fast-bert 1.9.7 requires tokenizers==0.8.1.rc1, but you have tokenizers 0.10.1 which is incompatible.
fast-bert 1.9.7 requires transformers==3.0.2, but you have transformers 4.4.2 which is incompatible.[0m
Successfully installed tqdm-4.49.0


Collecting tqdm==4.41.1
  Using cached tqdm-4.41.1-py2.py3-none-any.whl (56 kB)
Installing collected packages: tqdm
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.49.0
    Uninstalling tqdm-4.49.0:
      Successfully uninstalled tqdm-4.49.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
simpletransformers 0.61.4 requires tqdm>=4.47.0, but you have tqdm 4.41.1 which is incompatible.
fast-bert 1.9.7 requires tokenizers==0.8.1.rc1, but you have tokenizers 0.10.1 which is incompatible.
fast-bert 1.9.7 requires transformers==3.0.2, but you have transformers 4.4.2 which is incompatible.[0m
Successfully installed tqdm-4.41.1
Collecting tokenizers==0.9.4
  Using cached tokenizers-0.9.4-cp38-cp38-manylinux2010_x86_64.whl (2.9 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tok

In [1]:
#verify library versions
! pip list | grep "tensorflow"   # Check tensorflow==2.0.0, tensorflow-gpu==2.0.0
! pip list | grep "transformers" # Check transformers>=2.0.0

tensorflow              2.4.1
tensorflow-estimator    2.4.0
transformers            4.3.3


Before training our model, we scrap the main pages of a selection of newspapers in the same language and save all the links found in those pages and belong to the same website

In [4]:
internal_urls = {}

def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_all_website_links(url):
    """
    Returns all URLs that is found on `url` in which it belongs to the same website
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            continue
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        
        if not is_valid(href):
        # not a valid URL
            continue
            
        if href in internal_urls:
            # already in the set
            continue
        
        if domain_name not in href:
            # external link
            continue
            
        urls.add(href)
        title = a_tag.getText().strip().lstrip()       
        title = title.replace(';','')
        title = title.replace('""','')
        title = title.replace("\n", "")
        title = title.replace("\t", "")
        if title == "" or title is None:
            continue
        internal_urls[title] = href
        print(title + ";" + href + ";")

    return internal_urls

def write_in_csv(url_dict):
    with open('url_prenote.csv', mode='a', encoding="utf-8", newline='') as employee_file:
        employee_writer = csv.writer(employee_file, delimiter=';')

        for url in url_dict:
            employee_writer.writerow([url, url_dict[url]])

In [None]:
#Choose the newspapers you want to work on
url_inputs = {
  "20 Minutes": "https://www.20minutes.fr/"
}


In [None]:
for url in url_inputs:
    write_in_csv(get_all_website_links(url_inputs.get(url)))

##Corpus annotation
Before training the BERT model, we must have an annotated corpus. The annotation task is to be done manually. However, some heuristics might help shorten the labelling process.
For instance, we might consider any four-word-long or longer title to be indeed an article title, whereas shorter articles are not. This is of course to be checked and corrected manually. 

In [None]:
import pandas as pd
df_annote=pd.read_csv('url_prenote.csv', sep=';',header=None)

df_annote = df_annote.replace('\n','', regex=True) 
df_annote = df_annote.fillna(0)

df_annote[2]=0

In [None]:
df_annote

In [None]:
df_annote.columns = ['title', 'link', 'prediction']
df_annote['prediction'] = df_annote['title'].apply(lambda x: 1 if len(x.split()) > 4 else 0)
df_annote.loc[df_annote['prediction'] ==1]

In [None]:
df_annote

In [None]:
df_annote.to_csv('url_noted.csv', index=False, sep=';')

##Annotated corpus import and quick check

In [5]:
import pandas as pd
df=pd.read_csv('url_noted.csv', sep=';')

df = df.replace('\n','', regex=True) 
df = df.fillna(0)

In [6]:
df

Unnamed: 0,title,link,prediction
0,Aller au contenu,https://www.lefigaro.fr/,0
1,Politique,https://www.lefigaro.fr/politique,0
2,International,https://www.lefigaro.fr/international,0
3,Société,https://www.lefigaro.fr/actualite-france,0
4,Vox,https://www.lefigaro.fr/vox,0
...,...,...,...
2535,Conditions générales et particulières,https://www.lesechos.fr/terms,0
2536,Charte éthique,https://www.lesechos.fr/ethical-code,0
2537,En direct,https://www.lesechos.fr/direct,0
2538,Thema,https://www.lesechos.fr/thema,0


In [None]:
df.columns = ['title', 'link', 'prediction']
df['title']

In [None]:
from os import walk, path

In [None]:
datasets = {}

directory_path = "/home/cedric/Projet_LIVRONS/psat-elod-add_param_intput_output_interval/PSAT-master/PSAT-master/test_scrapper_outputs/aggs_annotated_fr"
_, _, filenames = next(walk(directory_path))

n_sources = 7
sources_count = 0

dfs = []
for filename in filenames:
    
    if filename in ['aggs_output_11_Mediapart.csv', 'aggs_output_5_FranceInter.csv']:
        continue
        
    sources_count+=1
    print(filename)
    file_path = path.join(directory_path, filename)
    df = pd.read_csv(file_path, sep=";").iloc[2:].rename(columns={'Unnamed: 0': 'link'})
    
    renamed_df = df[['title','link', 'annotated_class']].rename(columns={'annotated_class':'prediction'})
    dfs.append(renamed_df)
                                                                
    if sources_count == n_sources:
        break
        
    
                                                                
df = pd.concat(dfs)
df['prediction']=df['prediction'].astype('float')

In [None]:
df

#Model training

In [5]:
tokenizer = DistilBertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

NameError: name 'DistilBertTokenizerFast' is not defined

In [12]:
train, valid = train_test_split(df, test_size=0.2)
train = train.dropna()
valid = valid.dropna()
training_sentences = train.iloc[:, 0]
validation_sentences = valid.iloc[:, 0]
training_labels = train.iloc[:, 2]
validation_labels = valid.iloc[:, 2]

train_encodings = tokenizer(training_sentences.tolist(),truncation=True,padding=True)
val_encodings = tokenizer(validation_sentences.tolist(),truncation=True,padding=True)

train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings),training_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings),validation_labels))


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [3]:
train

NameError: name 'train' is not defined

In [None]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased')


In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
model.fit(train_dataset.shuffle(50).batch(16), 
          epochs=3, 
          batch_size=16, 
          validation_data=val_dataset.shuffle(50).batch(16))


In [2]:
#Model backup
model.save_pretrained("./french_model")

NameError: name 'model' is not defined

In [None]:
loaded_model = TFDistilBertForSequenceClassification.from_pretrained("./BERT_classifier_fr")

##Quick test of the model

In [None]:
test_sentence_yes_0 = "Une année américaine, un numéro spécial indispensable pour comprendre les États-Unis"
test_sentence_yes_1 = "Le scandale Sanofi continue : incapable de sortir un vaccin anti-Covid, le labo saborde encore sa recherche"
test_sentence_no_0 = "France"
test_sentence_no_1 = "Sommaire"
test_sentence_no_2 = "Société"

test = list()
test.append(test_sentence_yes_0)
test.append(test_sentence_yes_1)
test.append(test_sentence_no_0)
test.append(test_sentence_no_1)
test.append(test_sentence_no_2)
test.append("Lyon. Derrière la rénovation du quartier de la Duchère, le cauchemar des habitants")
test.append("Article réservé aux abonn")

output = list()
for test_unit in test:
  predict_input = tokenizer.encode(test_unit,
                                  truncation=True,
                                  padding=True,
                                  return_tensors="tf")
  tf_output = loaded_model.predict(predict_input)[0]
  output.append(tf_output)


In [None]:
for output_unit in output:
  tf_prediction = tf.nn.softmax(output_unit, axis=1).numpy()[0]
  print(tf_prediction[1])


In [None]:
output