Get posts from Facebook groups, and filter the text to get clean sentences in the target language.

In [None]:
!pip install facebook-scraper

In [None]:
!python -m spacy download en_core_web_sm

In [39]:
import json
from facebook_scraper import get_posts
from tqdm.notebook import tqdm
import re
import string
import numpy as np
import pprint
import spacy_fastlang

Define some functions to help with tidying up and standardising this text.

- Not all of it is in the target language, so make a heuristic guess about language ID based on vocabulary.
- Tidy up tags (e.g. #Bukedde)
- Standardise the case of the first word (e.g. for text such as "OMUBAKA wa Budaaki")

In [89]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("language_detector")

def get_social_posts(page_id,
                     num_pages_to_request = 100000,
                     get_comments = False):
    num_pages_to_request = 100000
    posts_iterator = get_posts(
        page_id, pages=num_pages_to_request,
        options={"comments": get_comments})
    posts_from_page = []
    try:
        for post in tqdm(posts_iterator):
            posts_from_page.append(post)
    # Return the current progress if interrupted,
    # rather than losing everything.
    finally:
        return posts_from_page

def file_to_list(path):
    with open(path) as file:
        lines = file.readlines()
        lines = [line.rstrip() for line in lines]
        return lines
    
def vocabulary(language_code):
    vocab = file_to_list('v7-dataset/v7.0/supervised/'
                         f'en-{language_code}/train.{language_code}')
    vocab = ' '.join(vocab)
    vocab = vocab.lower()
    vocab = vocab.split()
    vocab = [re.sub('[\W_]+', '', a) for a in vocab]
    vocab = [a for a in vocab if len(a)>3]
    vocab = set(vocab)
    return vocab

def is_target_language(text, vocab, print_stats = False,
                       min_recognised_words = 3, min_ratio = 0.3):
    '''Heuristic language id based on number of recognised words.'''
    words = [re.sub('[\W_]+', '', a) for a in text.lower().split()]
    in_vocab = [w in vocab for w in words]
    num_words_in_vocab = np.sum(in_vocab)
    ratio = (num_words_in_vocab + .01) / (len(words) + .01)
    if print_stats:
        print(in_vocab)
        print(ratio, num_words_in_vocab)

    return num_words_in_vocab >= min_recognised_words and ratio > min_ratio 

def first_word_title_case(text):
    words = text.split(' ')
    words[0] = words[0].title()
    return ' '.join(words)

def standardise_social_text(text):
    '''Remove tags and put first word in title case.'''
    strings_to_remove = [
        '… More', ' |monitor', '#dailymonitor', '#Bukedde',
        ' https://bit', ' http://bit', 'https://www', 'http://www'
    ]
    for s in strings_to_remove:
        text = text.replace(s, '')
    text = first_word_title_case(text)
    return text

def split_into_sentences(text):
    alphabets= "([A-Za-z])"
    prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
    suffixes = "(Inc|Ltd|Jr|Sr|Co)"
    starters = ("(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|"
                "We\s|But\s|However\s|That\s|This\s|Wherever)")
    acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    websites = "[.](com|net|org|io|gov)"
    
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

def clean_sentences_from_posts(
    posts,
    vocab = None,
    remove_english = False,
    min_words = 3,
    print_wrong_language_sentences = False,
    print_stats = False,
):
    '''Filter for lines in the target language (if vocab provided), and tidy up.'''
    
    text = '\n'.join([p['text'] or '' for p in posts])
    lines = text.split('\n')
    
    
    stats = {'total': 0, 'ok': 0, 'wrong_language': 0, 'too_short': 0, 'not_text': 0}

    # De-deuplicate
    clean = []
    lines = list(set(lines))

    for line in lines:
        stats['total'] += 1
        words = line.split()
        too_short = len(words) <= min_words
        is_code = 'loadEventSupported' in line or 'requireLazy' in line
        is_date = (len(words) > 4 and
                   words[1] in ['January', 'February', 'March', 'April',
                                'May', 'June', 'July', 'August',
                                'September', 'October', 'November', 'December'])

        not_target_language = False
        
        if remove_english:
            not_target_language = nlp(line)._.language == 'en'
            
        if vocab:
            not_target_language = not is_target_language(line, vocab) 
            
        if print_wrong_language_sentences:
            if not_target_language and not (too_short or is_code or is_date):
                print(line)

        if not (too_short or is_code or is_date or not_target_language):
            line = standardise_social_text(line)
            clean.append(line)
            stats['ok'] += 1

        if not_target_language:
            stats['wrong_language'] += 1

        if too_short:
            stats['too_short'] += 1

        if is_code or is_date:
            stats['not_text'] += 1
            
    sentences = []
    for c in clean:
        current_sentences = split_into_sentences(c)
        for s in current_sentences:
            if len(s.split()) > 3:
                sentences.append(s)
    
    sentences = list(set(sentences))
    stats['num_sentences'] = len(sentences)
    
    if print_stats:
        pp = pprint.PrettyPrinter()
        pp.pprint(stats)
            
    return sentences



In [62]:
posts = get_social_posts('AICERIT')
sentences = clean_sentences_from_posts(posts, print_stats = True)
with open("back_translation/teo/aicerit.txt", "w") as f:
    f.writelines('\n'.join(sentences))

0it [00:00, ?it/s]

{'not_text': 266,
 'num_sentences': 1064,
 'ok': 877,
 'too_short': 342,
 'total': 1485,
 'wrong_language': 0}


In [49]:
posts = get_social_posts('bukedde.ug')
sentences = clean_sentences_from_posts(posts, print_stats=True)
with open("back_translation/lug/bukedde.txt", "w") as f:
    f.writelines('\n'.join(sentences))

{'not_text': 3953,
 'num_sentences': 26688,
 'ok': 10390,
 'too_short': 7702,
 'total': 22045,
 'wrong_language': 0}


In [88]:
posts = get_social_posts('rupiny.newspaper.7')
sentences = clean_sentences_from_posts(posts, print_stats = True,
                                       print_wrong_language_sentences=False,
                                       vocab = vocabulary('ach'))
with open("back_translation/ach/rupiny.txt", "w") as f:
    f.writelines('\n'.join(sentences))

{'not_text': 117,
 'num_sentences': 159,
 'ok': 166,
 'too_short': 263,
 'total': 733,
 'wrong_language': 566}


In [90]:
posts = get_social_posts('DailyMonitor')
sentences = clean_sentences_from_posts(posts, print_stats = True)
with open("back_translation/eng/daily-monitor.txt", "w") as f:
    f.writelines('\n'.join(sentences))

0it [00:00, ?it/s]

  date_obj = stz.localize(date_obj)


{'not_text': 9436,
 'num_sentences': 22739,
 'ok': 45437,
 'too_short': 15118,
 'total': 69991,
 'wrong_language': 0}
