First, pull down posts and messages from the Facebook group `rupiny.newspaper.7`. These are mostly news headlines and a few replies in Acholi, with some English in there too which has to be filtered out.

In [None]:
!pip install facebook-scraper

In [95]:
import json
from facebook_scraper import get_posts
from tqdm.notebook import tqdm
import re, string
import numpy as np

In [20]:
num_pages_to_request = 100000
posts_iterator = get_posts('rupiny.newspaper.7', pages=num_pages_to_request, options={"comments": True})

Took about 1.5 hours for a test run of 10k pages (back to ~2013)

In [21]:
posts = []
for post in tqdm(posts_iterator, total=num_pages_to_request):
    posts.append(post)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [42]:
text = '\n'.join([p['text'] or '' for p in posts])
lines = text.split('\n')

Define some functions to make a heuristic guess whether some text is in the target language.

In [231]:
def is_target_language(text, vocab, print_stats = False):
    '''Heuristic language id based on number of recognised words.'''
    words = [re.sub('[\W_]+', '', a) for a in text.lower().split()]
    in_vocab = [w in vocab for w in words]

    num_words_in_vocab = np.sum(in_vocab)
    ratio = (num_words_in_vocab + .01) / (len(words) + .01)
    if print_stats:
        print(in_vocab)
        print(ratio, num_words_in_vocab)
    return num_words_in_vocab >= 3 and ratio > 0.3  

def file_to_list(path):
    with open(path) as file:
        lines = file.readlines()
        lines = [line.rstrip() for line in lines]
        return lines

vocab = file_to_list('translation/v7-dataset/v7.0/supervised/en-ach/train.ach')
vocab = ' '.join(vocab)
vocab = vocab.lower()
vocab = vocab.split()
vocab = [re.sub('[\W_]+', '', a) for a in vocab]
vocab = [a for a in vocab if len(a)>3]
vocab = set(vocab)

Strip out the lines of text that are in the wrong language, or are a date or too short, or are some html/javascript code.

In [233]:
print_wrong_language_sentences = False

clean = []
for line in lines:
    words = line.split()
    too_short = len(words) <= 3
    is_code = 'loadEventSupported' in line or 'requireLazy' in line
    is_date = (len(words) > 4 and
               words[1] in ['January', 'February', 'March', 'April',
                            'May', 'June', 'July', 'August',
                            'September', 'October', 'November', 'December'])

    not_target_language = not is_target_language(line, vocab) 
    if print_wrong_language_sentences
        if not_target_language and not (too_short or is_code or is_date):
            print(line)
    if not (too_short or is_code or is_date or not_target_language):
        clean.append(line)
        
clean = list(set(clean))

Now break this down into sentences and save as plain text.

In [236]:
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [242]:
sentences = []
for c in clean:
    current_sentences = split_into_sentences(c)
    for s in current_sentences:
        if len(s.split()) > 3:
            sentences.append(s)

In [246]:
len(sentences)

2537

In [245]:
sentences[-100:]

['Gurup ame onwongo otye apwonyo ni oya i lobo me f Iraq kede Syria,man en dul ame obedo bolo aranyi i lobo me United States.',
 'Sanchez tye abango mucara me ciling £500,000- cabit kede cabit ingee weko arsenal I dwe me acel I mwaka 2018.',
 'Ka ingeyo lapwony mo ma tio ki diro ne weng me neno ni lutino aniang jami ma en pwonyo wek kwo gi obed maber ikare me anyim, cwali wa nyinge kede kan ma en pwonyo iye oyot-oyot wek omi lwak nge lok ikome.',
 'Teceng: Gitumu nye mo bi butu ki maro i poto gwana!',
 'Kumu dok i Gulu inge motoka goyo lapwony me Gulu University Ballam Nyeko wa itoo.',
 'Nyeko obedo omin lacoc ma megwa Dennis Ojwee matidi.',
 'Kabedo tye pi otino amito wot kwano S1 i Wairaka College kede Jinja College.',
 'Bin icoye pi admission i opici me Rupiny/New Vision Lira.',
 'Iromo dang goyo cim namba:0776944990 pi ngec okene.',
 '• Ka onongo itamo ni dano ma I jela weng cwiny gi rac ci tam odoco: Mabuc 37 ocone me doko lupwonye dini.',
 'Lira University (Uganda Government fund

In [253]:
with open("acholi-rupiny.txt", "w") as f:
    f.writelines('\n'.join(sentences))