In [1]:
# resources available at : https://www.sadilar.org/

# language   - corpora : Language codes

# isiNdebele - nr  : https://repo.sadilar.org/handle/20.500.12185/308
# isiXhosa   - xh  : https://repo.sadilar.org/handle/20.500.12185/314
# isiZulu    - zu  : https://repo.sadilar.org/handle/20.500.12185/321
# sePedi     - nso : https://repo.sadilar.org/handle/20.500.12185/330
# seSotho    - st  : https://repo.sadilar.org/handle/20.500.12185/336
# seTswana   - tn  : https://repo.sadilar.org/handle/20.500.12185/343
# siSwati    - ss  : https://repo.sadilar.org/handle/20.500.12185/348
# tshiVenda  - ve  : https://repo.sadilar.org/handle/20.500.12185/357
# xiTsonga   - ts  : https://repo.sadilar.org/handle/20.500.12185/364
# Afikaans   - af  : https://repo.sadilar.org/handle/20.500.12185/293
# English    - en  : https://repo.sadilar.org/handle/20.500.12185/301

In [2]:
import os
import zipfile


## Preparation

In [69]:
file_listing = os.listdir('../data/corpora')

language_codes = [i.replace("corpora.nchlt.","").replace(".zip","") for i in file_listing]
language_codes

['af', 'en', 'nr', 'nso', 'ss', 'st', 'tn', 'ts', 've', 'xh', 'zu']

In [82]:
filename.format(code)

'en/1.Corpus/CORP.NCHLT.eng.CLEAN.1.0.0.txt'

In [88]:
zipped, language_codes[idx], code, zfilename

('corpora.nchlt.af.zip',
 'af',
 'af',
 'af/1.Corpus/CORP.NCHLT.{}.CLEAN.1.0.0.txt')

In [95]:
prepped_documents = []

for idx, zipped in enumerate(file_listing):
    if language_codes[idx] == 'en':
        zfilename = '{}/1.Corpus/CORP.NCHLT.{}.CLEAN.1.0.0.txt'.format(language_codes[idx],"{}")
    else:
        zfilename = '{}/2.Corpora/CORP.NCHLT.{}.CLEAN.2.0.txt'.format(language_codes[idx],"{}")
    
    
    with zipfile.ZipFile('../data/corpora/{}'.format(zipped)) as z:
        language_codes[idx] if True else 'no'
        code = 'eng' if language_codes[idx] == 'en' else language_codes[idx]
        
        with z.open(zfilename.format(code)) as f:
            lines = f.readlines()

    text = "".join([str(i) for i in lines[::11]])
    documents = text.replace("\\n","").split("<fn>")
    
    for document in documents:
        document_split = document.split("</fn>")
        if len(document_split) > 1:
            title, body = document_split
            prepped_documents.append({'title': title, 'body': body, 'class': language_codes[idx]})

## A quick look at the data

In [97]:
import pandas as pd

In [105]:
documents_df = pd.DataFrame(prepped_documents)
documents_df.to_csv('../data/training_data.csv')

In [125]:
documents_df.head()

Unnamed: 0,body,class,title
0,"\r'b""Werkende aansoekers: Dui asseblief jou ba...",af,(1124200875659 AM) GEMS_APP(AFRIKAANS).txt
1,\r'b'Uitstekende gesondheidsorgvoordele teen b...,af,(3102010124331 PM) FINAL MB AFRIKAANS.txt
2,\r'b'Uitstekende gesondheidsorgvoordele teen b...,af,(3172010120044 PM) FINAL MB AFRIKAANS.txt
3,\r'b'Om alle staatsdienswerknemers van gelyke ...,af,(32201090225 AM) REO Afrikaans met benefit sch...
4,"\r'b""Sou u na 1 Januarie van 'n jaar aansluit,...",af,(42200942018 PM) NIMAS_Appli. FormAfr_PD7023.txt


In [107]:
# document counts

documents_df['class'].value_counts()

af     436
en     355
xh     240
st      98
nr      93
tn      90
zu      81
nso     72
ss      69
ve      63
ts      53
Name: class, dtype: int64

In [126]:
# sentence counts - beautifully balanced

documents_df['body'].apply(lambda x: len(x.split(". "))).dot(pd.get_dummies(documents_df['class'])) 

af     4645
en     9736
nr     2520
nso    3591
ss     3148
st     3189
tn     2394
ts     2635
ve     2053
xh     3274
zu     3371
Name: body, dtype: int64

In [139]:
sentences = documents_df.apply(lambda x: [{'class': x['class'], 'body': b} for b in x['body'].split(". ")], 
                              axis=1).values.tolist()

In [145]:
sum([len(i) for i in sentences])

40556

In [150]:
import json
with open('../data/sentences.json','w') as datafile:
    json.dump(sentences, datafile)

In [127]:
# word counts

documents_df['body'].apply(lambda x: len(x.split())).dot(pd.get_dummies(documents_df['class'])) 

af     194856
en     487449
nr      67973
nso    187641
ss      69755
st     159820
tn     110827
ts     115381
ve      86720
xh      98303
zu     128647
Name: body, dtype: int64

In [124]:
# character counts - beautifully balanced

documents_df['body'].apply(lambda x: len(list(x))).dot(pd.get_dummies(documents_df['class'])) 

af     1333795
en     3178123
nr      693335
nso    1277737
ss      698910
st      922541
tn      652337
ts      715758
ve      589877
xh      953090
zu     1278794
Name: body, dtype: int64

In [121]:
list("""\r'b"Sou u na 1 Januarie van 'n jaar aansluit,""".replace("""\r\'b\"""",""))

['S',
 'o',
 'u',
 ' ',
 'u',
 ' ',
 'n',
 'a',
 ' ',
 '1',
 ' ',
 'J',
 'a',
 'n',
 'u',
 'a',
 'r',
 'i',
 'e',
 ' ',
 'v',
 'a',
 'n',
 ' ',
 "'",
 'n',
 ' ',
 'j',
 'a',
 'a',
 'r',
 ' ',
 'a',
 'a',
 'n',
 's',
 'l',
 'u',
 'i',
 't',
 ',']