## Preprocess

In [None]:
import spacy
from pathlib import Path
from tqdm import tqdm
from spacy import displacy
import uuid
from tqdm.auto import tqdm

In [None]:
# Preprocessing configuration.
INPUT_PATH = "/lium/home/tprouteau/git/sinr_embeddings/notebooks/afp_5000_firstlines.txt"
OUTPUT_PATH = "./processed/"
N_JOBS = 20  # Passed to nlp.pipe as n_process
REGISTER = "web"  # Genre of data (metadata, use whatever you want)
LANGUAGE = "fr"  # Language of the corpus

nlp = spacy.load("fr_core_news_lg")  # Load the model into SpaCy
_ = nlp.add_pipe("merge_entities", after="ner")  # Merge Named-Entities

# parents=True so that a nested OUTPUT_PATH can be created in one go.
Path(OUTPUT_PATH).mkdir(parents=True, exist_ok=True)
# The corpus file is named after the input file, with a .vrt extension.
corpus_output_path = Path(OUTPUT_PATH) / f"{Path(INPUT_PATH).stem}.vrt"

# Read the input (one document per line) BEFORE opening the output file, so
# that an unreadable input does not leave a truncated corpus file behind.
# The context manager closes the handle even if reading fails.
with Path(INPUT_PATH).open("r") as data_file:
    data = data_file.read().splitlines()
print(len(data))

with corpus_output_path.open("w") as file:
    id_corpus = str(uuid.uuid4())  # Generate a random corpus id
    # Corpus header carrying the metadata.
    # NOTE(review): no closing </text> tag is ever written; confirm whether
    # downstream consumers of the file need one.
    file.write(f'<text id="{id_corpus}" filename="{Path(INPUT_PATH).as_posix()}" register="{REGISTER}" language="{LANGUAGE}">\n' )
    for doc in tqdm(nlp.pipe(data, n_process=N_JOBS), total=len(data)):
        for sent in doc.sents:  # Sentence border detection
            file.write("<s>\n")  # Sentence start
            for token in sent:
                if token.ent_type_ == '':  # Not a Named-Entity
                    ent_type = None  # Serialized below as the string "None"
                    text = token.text
                    lemma = token.lemma_
                else:
                    ent_type = token.ent_type_
                    # Entities are merged with a space by default; we want
                    # them joined with "_" (a no-op for one-word entities).
                    text = token.text.replace(' ', '_')
                    # Entities keep their surface form as lemma.
                    lemma = text
                # One token per line, 10 tab-separated columns.
                content = "\t".join([
                    text,
                    lemma,
                    token.pos_,
                    token.ent_iob_,
                    str(ent_type),
                    str(token.is_punct),
                    str(token.is_stop),
                    str(token.is_alpha),
                    str(token.is_digit),
                    str(token.like_num),
                ])
                file.write(f'{content}\n')  # Write the token info
            file.write("</s>\n")  # Sentence end
print(f"VRT-style file written in {corpus_output_path}")

## Loading

In [None]:
import re
import tabulate
from collections import Counter
def extract_text(text, lemmatize=True, stop_words=False, lower_words=True, number=False, punct=False, exclude_pos=None, en=True, min_freq=10, alpha=True, exclude_en=None, min_length_word=3):
    """Extract filtered, tokenized sentences from the lines of a VRT corpus.

    Token lines are expected to hold 10 tab-separated columns:
    token, lemma, pos, ent_iob, ent_type, is_punct, is_stop, is_alpha,
    is_digit, like_num. Boolean columns are compared as the strings
    "True"/"False", and a token is a named entity when its ent_type column
    is not "None". Lines with a different number of columns are ignored.

    Parameters:
    text (list(str)): Lines of the VRT corpus (without line endings).
    lemmatize (bool): Return lemmas instead of surface forms (default: True).
    stop_words (bool): Value the is_stop column must have. False drops
        stop-words; True keeps ONLY stop-words (default: False).
    lower_words (bool): Lowercase the words; named entities are never
        lowercased (default: True).
    number (bool): Value both the is_digit and like_num columns must have.
        False drops numbers; True keeps ONLY numbers (default: False).
    punct (bool): Value the is_punct column must have. False drops
        punctuation; True keeps ONLY punctuation (default: False).
    exclude_pos (list): Part-of-speech tags to exclude (default: None,
        i.e. nothing excluded).
    en (bool): Keep named entities (default: True).
    min_freq (int): Minimum number of occurrences to keep a word; only
        applied when greater than 1 (default: 10).
    alpha (bool): Value the is_alpha column must have. When lemmatizing,
        named entities are exempt from this check (default: True).
    exclude_en (list): Named-entity types to exclude (default: None,
        i.e. nothing excluded).
    min_length_word (int): Non-entity words are kept only when strictly
        longer than this (default: 3).

    Return:
    text (list(list(str))): A list of sentences containing words. Only
    sentences with more than 2 words (before frequency filtering) are kept.
    """
    # Imported locally so the function does not depend on another notebook
    # cell having been run; the progress bar is optional.
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    exclude_pos = () if exclude_pos is None else exclude_pos
    exclude_en = () if exclude_en is None else exclude_en
    text_tag = re.compile(r"<text[^<>]*\"\>{1}")
    # The flags are compared against the textual booleans stored in the file.
    # NOTE(review): these are exact-match filters, so passing True keeps ONLY
    # the matching tokens rather than "also" keeping them -- confirm intent.
    stop_words, number, punct, alpha = str(stop_words), str(number), str(punct), str(alpha)

    out = []
    sentence = []
    for line in progress(text, total=len(text)):
        if line.startswith("<s>"):
            sentence = []
            continue
        if line.startswith("</s>"):
            if len(sentence) > 2:
                out.append(sentence)
            continue
        if text_tag.search(line):  # Corpus header line
            continue
        fields = line.split("\t")
        if len(fields) != 10:
            continue
        token, lemma, pos, _ent_iob, ent_type, is_punct, is_stop, is_alpha, is_digit, like_num = fields
        is_entity = ent_type != "None"

        if is_stop != stop_words or is_punct != punct or is_digit != number or like_num != number:
            continue
        if pos in exclude_pos or ent_type in exclude_en:
            continue
        # NOTE(review): entities bypass the alpha check only when
        # lemmatizing (kept as in the previous implementation) -- confirm.
        if is_alpha != alpha and not (lemmatize and is_entity):
            continue
        if is_entity and not en:
            continue

        word = lemma if lemmatize else token
        if lower_words:
            if is_entity and len(word) > 1:
                sentence.append(token)  # Entities keep their original case
            elif len(word) > min_length_word:
                sentence.append(word.lower())
        elif is_entity:
            sentence.append(token)
        elif len(word) > min_length_word:
            sentence.append(word)

    if min_freq > 1:
        # Drop rare words; sentences are kept even if they end up shorter.
        counts = Counter(word for sent in out for word in sent)
        accepted_tokens = {word for word, count in counts.items() if count >= min_freq}
        out = [[word for word in sent if word in accepted_tokens] for sent in out]
    return out
    
def open_corpus(corpus_path):
    """Open a corpus file for reading in text mode.

    Parameters:
    corpus_path (str|pathlib.Path): Path to the corpus file.

    Return:
    The file object opened in mode "r". The caller is responsible for
    closing it (e.g. by using it in a ``with`` statement).

    Raises:
    TypeError: If corpus_path is neither a str nor a pathlib.Path.
    """
    if not isinstance(corpus_path, (str, Path)):
        # type(...).__name__ instead of the non-existent `.type` attribute,
        # which turned this error into an AttributeError.
        raise TypeError(f"Path to corpus : corpus_path must be of type str or pathlib.Path. Provided corpus_path type is {type(corpus_path).__name__}")
    return Path(corpus_path).open("r")

In [None]:
# Load the vrt lines; the context manager closes the corpus file once read.
with open_corpus("/lium/raid01_c/sguillot/sinr_exps/smalldiach_years_trueprocessed/fatnews.vrt") as corpus_file:
    vrt = corpus_file.read().splitlines()
# Extract the text needed: lowercased lemmas occurring at least 5 times.
text = extract_text(vrt, lemmatize=True, lower_words=True, min_freq=5)
# Preview the first five sentences.
for sent in text[:5]:
    print(sent)