# Imports

In [2]:
import json
import os
import bz2
import io
from bz2 import BZ2File
import pandas as pd
import numpy as np

# Read dataset

In [2]:
input_dir = "data/" # update with your path 

In [3]:
# a helper function to get the lines from am archive
def read_jsonlines(bz2_file):
    text = bz2_file.read().decode('utf-8')
    for line in text.split('\n'):
        if line != '':
            yield line

In [4]:
archives = os.listdir(input_dir)[:4] + os.listdir(input_dir)[5:]

# Extract articles

In [5]:
id_, journal_, date_, page_, text_ = [], [], [], [], []

for archive in archives:
    
    # take only the transformed archives
    # open the archive
    f = BZ2File(os.path.join(input_dir, archive), 'r')
        
    # get the list of articles it contains (= a json object on each line)
    articles = list(read_jsonlines(f))
    
    print(archive, ':', len(articles), 'articles à extraire')
        
    # load the first 100 articles as json and access their attributes    
    for a in articles:
         
        # decode the json string into an object (dict)
        json_article = json.loads(a)
        journal_.append(str(json_article["id"])[:3])
        date_.append(str(json_article["id"])[4:14])
        page_.append(str(json_article["pp"])[1:-1])
        text_.append(str(json_article["ft"]))


JDG-1978.jsonl.bz2 : 26712 articles à extraire
GDL-1981.jsonl.bz2 : 31463 articles à extraire
JDG-1975.jsonl.bz2 : 24134 articles à extraire
GDL-1976.jsonl.bz2 : 20457 articles à extraire
JDG-1973.jsonl.bz2 : 25514 articles à extraire
JDG-1979.jsonl.bz2 : 25891 articles à extraire
GDL-1971.jsonl.bz2 : 29228 articles à extraire
GDL-1980.jsonl.bz2 : 29077 articles à extraire
JDG-1974.jsonl.bz2 : 24927 articles à extraire
GDL-1977.jsonl.bz2 : 27887 articles à extraire
JDG-1972.jsonl.bz2 : 23188 articles à extraire
GDL-1972.jsonl.bz2 : 25406 articles à extraire
JDG-1977.jsonl.bz2 : 25989 articles à extraire
JDG-1980.jsonl.bz2 : 27425 articles à extraire
GDL-1974.jsonl.bz2 : 27028 articles à extraire
GDL-1979.jsonl.bz2 : 28014 articles à extraire
JDG-1971.jsonl.bz2 : 24524 articles à extraire
GDL-1973.jsonl.bz2 : 27885 articles à extraire
JDG-1976.jsonl.bz2 : 25187 articles à extraire
JDG-1981.jsonl.bz2 : 27082 articles à extraire
GDL-1975.jsonl.bz2 : 25420 articles à extraire
GDL-1978.json

# Create a pandas DataFrame

In [6]:
df = pd.DataFrame.from_dict(
    {
        'journal': journal_,
        'date': date_,
        'page': page_,
        'text': text_
    })

In [7]:
df.head()

Unnamed: 0,journal,date,page,text
0,JDG,1978-12-29,1,Algérie : l'arbitrage sans arbitre Pour un Eta...
1,JDG,1978-12-29,3,ALORS QUE LA FOULE ENTERRE SES DERNIERES VICTI...
2,JDG,1978-12-29,4,NEW COTÉES EN SUISSE 27 28 aboott Labor aetnft...
3,JDG,1978-12-29,5,"Hôtellerie suisse : baisse des nuitées Berne, ..."
4,JDG,1978-12-29,6,Les prix ne monteront pas en 1979 Un Suisse su...


In [64]:
len(df)

454898

# Clean and enrich the DataFrame 

In [17]:
def measure_articles(df):
    
    lengths = []
    
    for ind, row in df.iterrows():
        lengths.append(len(row['text']))
        
    return lengths

df['length'] = measure_articles(df)

In [23]:
def handle_multiple_pages(df):
    
    page, ppage = [], []
    
    for ind, row in df.iterrows():
        
        found = re.findall('([0-9]+)', row['page'])
        if len(found) > 1:
            page.append(found[0])
            ppage.append(found[1])
        else:
            page.append(row['page'])
            ppage.append(np.nan)
        
    return page, ppage
    
page, ppage = handle_multiple_pages(df)
df['page'] = page
df['ppage'] = ppage

In [24]:
df.head()

Unnamed: 0,journal,date,page,text,length,ppage
0,JDG,1978-12-29,1,Algérie : l'arbitrage sans arbitre Pour un Eta...,3777,
1,JDG,1978-12-29,3,ALORS QUE LA FOULE ENTERRE SES DERNIERES VICTI...,3401,
2,JDG,1978-12-29,4,NEW COTÉES EN SUISSE 27 28 aboott Labor aetnft...,36446,
3,JDG,1978-12-29,5,"Hôtellerie suisse : baisse des nuitées Berne, ...",807,
4,JDG,1978-12-29,6,Les prix ne monteront pas en 1979 Un Suisse su...,2934,


In [36]:
# Jeter les articles vides ou ne contenant que quelques caractères (p.ex titre des rubriques)
df = df[df['length'] > 50]

In [37]:
# Sauvegarder l'index
df['id'] = df.index

In [59]:
# Formater les types
df['date'] = pd.to_datetime(df['date'])
df['journal'] = df['journal'].astype('category')
df['page'] = df['page'].astype('float')
df['ppage'] = df['ppage'].astype('float')
df['text'] = df['text'].astype('str')

In [67]:
df = df[['id', 'journal', 'date', 'page', 'ppage', 'text', 'length']]

In [69]:
df.to_json('cleaned.json.bz2', compression = 'bz2')

In [71]:
df_lengths = df['length'].value_counts()

In [None]:
def check_for_duplicates(df):
    
    identical_texts = []
    
    for ind1, row1 in df.iterrows():
        for ind2, row2 in df.iterrows():
            
            if row1['length'] == row2['length']:

In [37]:
def compare_texts(txt1, txt2):
    
    identical = False
    recover = 0
        
    ### First step is a rough assessment ###
    
    text1 = txt1.split(' ')
    text2 = txt2.split(' ')
    
    if (len(text1) > 40) and (len(text2) > 40):
        text1, text2 = text1[20:40], text2[20:40]
        
    elif (len(text1) > 20) and (len(text2) > 20):   
        text1, text2 = text1[:20], text2[:20]
        
    else:
        min_ = min(len(text1), len(text2))
        text1, text2 = text1[:min_], text2[:min_]
        
    if set(text1) == set(text2):
        return True
    
    differentials = np.nan
    grade = 0
    
    # estimate identity 
    for i in range(len(text1)):
        for j in range(len(text2)):
            if text1[i] == text2[j]:
                grade += 1
                differentials.append = (j - i)
    
    _ = np.unique(differentials, return_counts = True)
    
    if len(_) == 0:
        return False
    
    differential = _[0].tolist()[np.argmax(_[1].tolist())]
    
    ### Second step is a finer assessment ###
    if (grade >= 10) and (np.argmax(_[1].tolist()) >= 10): 
        return True
    
    else:
        return False
    
    