In [1]:
import itertools
import nltk
from nltk.probability import FreqDist
from nltk import sent_tokenize, word_tokenize
import pandas as pd
import pygsheets
import re
import string 

In [2]:
df=pd.read_csv('sw_string.txt', header = None)

In [3]:
text = df.iloc[0][0]

In [4]:
def tokenizer(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha()]

    with open('sw_corpus_tokenized.txt', 'w+') as f:
        for word in tokens:
            f.write("%s " % word)
    
    return tokens

In [5]:
def ngrams(corpus, n):
    
    output=[]
    
    for i in range(len(corpus)-n+1):
        output.append(' '.join(corpus[i:i+n]))
        
    return output

In [6]:
data = tokenizer(text)

sw_one_word = {k:v for (k,v) in FreqDist(data).items()}
sw_two_word = {k:v for (k,v) in FreqDist(ngrams(data, 2)).items()}
sw_three_word = {k:v for (k,v) in FreqDist(ngrams(data, 3)).items()}
sw_four_word = {k:v for (k,v) in FreqDist(ngrams(data, 4)).items()}

#concatenate frequencies into one dictionary
# ** is the dictionary unpacking operator
sw_frequency = {**sw_one_word, **sw_two_word, **sw_three_word, **sw_four_word}

sw_frequency_frame = pd.DataFrame.from_dict(sw_frequency, orient='index', columns=["Frequency"])  
sw_frequency_frame.sort_values(by=['Frequency'], ascending=False).to_csv(r'/Users/kaigiomi/Desktop/Senior Year Sem 2/Uliza/sw_frequency.csv', header=True)

In [7]:
#Lists of easily tagged prepositions, pronouns , determiners, and some adjectives

PREPOSITIONS = ["kwa", "mwa", "wa", "la", "cha", "vya", "ya", "za", "pa", "na", "nami", "nawe", "nasi", "nanyi", "naye", 
                "nao", "nayo", "nalo", "nacho", "navyo", "nazo", "nako", "napo", "namu"]
  
PRONOUNS = ["mimi", "wewe", "sisi", "ninyi", "nyinyi", "yeye", "wao"]

DETERMINERS = ["huyu", "hawa", "huu", "hii", "hilo", "haya", "hiki", "hivi", "hii", "hizi", "huku", "hapa",
               "humu", "yule", "wale", "ule", "ile", "lile", "yale", "kile", "vile", "zile", "ule", "kule", 
               "pale", "mle", "huyo", "hao", "huo", "hiyo", "hilo", "hayo", "hicho", "hivyo", "hiyo", "hizo", 
               "huko", "hapo", "humu"]

RELATIVE_ADJECTIVES_PREFIXES = ["ninaye", "niliye", "nitakaye", "nisiye", "unaye", "uliye", "utakaye", "usiye", 
                                "tunao", "tulio", "tutakao", "tusio", "mnao", "mlio", "mtakao", "msio", "anaye", 
                                "aliye", "atakaye", "asiye", "wanao", "walio", "watakao", "wasio", "unao", "ulio", 
                                "utakao", "usio", "inayo", "iliyo", "itakayo", "isiyo", "linalo", "lililo", 
                                "litakalo", "lisilo", "yanayo", "yaliyo", "yatakayo", "yasiyo", "kinacho", 
                                "kilicho", "kitakacho", "kisicho", "vinavyo", "vilivyo", "vitakavyo", "visivyo", 
                                "inayo", "iliyo", "itakayo", "isiyo", "zinazo", "zilizo", "zitakazo", "zisizo", 
                                "unao", "ulio", "utakao", "usio", "unao", "ulio", "utakao", "usio", "kunako", 
                                "kuliko", "kutakako", "kusiko", "panapo", "palipo", "patakapo", "pasipo", "kunako", 
                                "kuliko", "kutakako", "kusiko", "mnamo", "mlimo", "mtakamo", "msimo"]




In [8]:
#Lists for verb prefix combinations

KNOWN_VERB_SUFFIX = ["ana", "isha", "lisha", "esha", "lesha"]

HAVE_VERB = ["nina", "sina", "una", "huna", "tuna", "hatuna", "mna", "hamna", "ana", "hana", "wana", "hawana", 
        "una", "hauna", "ina", "haina", "lina", "halina", "yana", "hayana", "kina", "hakina", "vina", 
        "havina", "ina", "haina", "zina", "hazina", "una", "hauna", "kuna", "hakuna", "pana", "hapana",
        "kuna", "hakuna", "mna", "hamna"]

CONINUATIVE_VERB = ["ningali", "ungali", "tungali", "mngali", "angali", "wangali", "ungali", "ingali", 
                    "lingali", "yangali", "kingali", "vingali", "ingali", "zingali", "ungali", "kungali", 
                    "pangali", "mngali"]

RELATIVE_VERB = ["niliye", "nisiye", "uliye", "usiye", "tulio", "tusio", "mlio", "msio", "aliye", "asiye", 
                 "walio", "wasio", "ulio", "usio", "iliyo", "isiyo", "lililo", "lisilo", "yaliyo", "yasiyo", 
                 "kilicho", "kisicho", "vilivyo", "visivyo", "iliyo", "isiyo", "zilizo", "zisizo", "ulio", 
                 "usio", "kuliko", "kusiko", "palipo", "pasipo", "mlimo", "msimo"]

LOCATION_VERB = ["nipo", "niko", "nimo", "sipo", "siko", "simo", "upo", "uko", "umo", "hupo", "huko", "humo",
                 "tupo", "tuko", "tumo", "hatupo", "hatuko", "hatumo", "mpo", "mko", "mmo", "hampo", "hamko", 
                 "hammo", "yupo", "yuko", "yumo", "hayupo", "hayuko", "hayumo", "wapo", "wako", "wamo", "hawapo", 
                 "hawako", "hawamo", "upo", "uko", "umo", "haupo", "hauko", "haumo", "ipo", "iko", "imo", "haipo", 
                 "haiko", "haimo", "lipo", "liko", "limo", "halipo", "haliko", "halimo", "yapo", "yako", "yamo", 
                 "hayapo", "hayako", "hayamo", "kipo", "kiko", "kimo", "hakipo", "hakiko", "hakimo", "vipo", 
                 "viko", "vimo", "havipo", "haviko", "havimo", "ipo", "iko", "imo", "haipo", "haiko", "haimo",
                 "zipo", "ziko", "zimo", "hazipo", "haziko", "hazimo", "upo", "uko", "umo", "haupo", "hauko", 
                 "haumo", "upo", "uko", "umo", "haupo", "hauko", "haumo", "kupo", "kuko", "kumo", "hakupo", 
                 "hakuko", "hakumo", "papo", "pako", "pamo", "hapapo", "hapako", "hapamo", "kupo", "kuko", 
                 "kumo", "hakupo", "hakuko", "hakumo", "mpo", "mko", "mmo", "hampo", "hamko", "hammo"]

COPULA_VERB = ["ni", "si", "ndimi", "ndiye", "simi", "siye", "ndiwe", "ndiye", "siwe", "siye", "ndisi", "ndio", 
               "sio", "ndinyi", "ndio", "sinyi", "sio", "ndiye", "siye", "ndio", "sio", "ndiyo", "siyo", "ndilo", 
               "silo", "ndiyo", "siyo", "ndicho", "sicho", "ndivyo", "sivyo", "ndiyo", "siyo", "ndizo", "sizo", 
               "ndio", "sio", "ndio", "sio", "ndiko", "siko", "ndipo", "sipo", "ndiko", "siko", "ndimo", "simo"]

VERB_EQUALITY = HAVE_VERB + CONINUATIVE_VERB + RELATIVE_VERB + LOCATION_VERB + COPULA_VERB

IRREALIS_VERB = ["nge", "singe", "ngali", "singali"]

POS_SUBJECT_NO_A = ["ni", "u", "tu", "m", "mw", "a", "yu", "wa", "i", "li", "ya", "ki", "vi", "zi", "ku", "pa", "mu"]

POS_SUBJECT_WITH_A = ["n", "w", "tw", "mw", "y", "l", "ch", "vy", "z", "kw", "p", "mw"]

NEG_SUBJECT = ["si", "hu", "hatu", "ham", "ha", "hayu", "hawa", "hau", "hai", "hali", "haya", "haki", "havi", "hazi", 
               "haku", "hapa", "hamu"]

POS_OBJECT = ["ni", "ku", "tu", "m", "mw", "wa", "u", "i", "li", "ya", "ki", "vi", "zi", "ku", "pa", "mu", "ji"]


RELATIVE = ["ye", "o", "yo", "lo", "cho", "vyo", "zo", "ko", "po", "mo"]

OBJ_REL = []

INDICITIVE_PAST = []

for rel in RELATIVE:
    for obj in POS_OBJECT:
        OBJ_REL.append(rel + obj)
        
for subj in POS_SUBJECT_NO_A:
    for ob in OBJ_REL:
        INDICITIVE_PAST.append(subj + "li" + ob)

INDICITIVE_PAST_NEG = []

for neg in NEG_SUBJECT:
    for obj in POS_OBJECT:
        INDICITIVE_PAST_NEG.append(neg + "ku" + obj)
        
INDICITIVE_PERFECT = []

for subj in POS_SUBJECT_NO_A:
    for obj in POS_OBJECT:
        INDICITIVE_PERFECT.append(subj + "me" + obj)
        
INDICITIVE_PERFECT_NEG = []

for neg in NEG_SUBJECT:
    for obj in POS_OBJECT:
        INDICITIVE_PERFECT_NEG.append(subj + "ja" + obj)
        
INDICITIVE_PRESENT_DEF = []

for subj in POS_SUBJECT_NO_A:
    for ob in OBJ_REL:
        INDICITIVE_PRESENT_DEF.append(subj + "na" + ob)

INDICITIVE_PRESENT_INDEF = []

for subj in POS_SUBJECT_WITH_A:
    for obj in POS_OBJECT:
        INDICITIVE_PRESENT_INDEF.append(subj + "a" + obj)
        
INDICITIVE_PRESENT_HAB = []

for obj in POS_OBJECT:
    INDICITIVE_PRESENT_HAB.append("hu" + obj)
    
INDICITIVE_PRESENT_NEGATIVE = []

for neg in NEG_SUBJECT:
    for obj in POS_OBJECT:
        INDICITIVE_PRESENT_NEGATIVE.append(neg + obj)
        
INDICITIVE_FUTURE = []

for subj in POS_SUBJECT_NO_A:
    for ob in OBJ_REL:
        INDICITIVE_FUTURE.append(subj + "taka" + ob)
        
INDICITIVE_FUTURE_NEGATIVE = []

for neg in NEG_SUBJECT:
    for obj in POS_OBJECT:
        INDICITIVE_FUTURE_NEGATIVE.append(neg + "ta" + obj)
        INDICITIVE_FUTURE_NEGATIVE.append(neg + "to" + obj)
        

INDICITIVE = INDICITIVE_PAST + INDICITIVE_PAST_NEG + INDICITIVE_PERFECT + INDICITIVE_PERFECT_NEG + INDICITIVE_PRESENT_DEF

INDICITIVE = INDICITIVE + INDICITIVE_PRESENT_INDEF + INDICITIVE_PRESENT_HAB + INDICITIVE_PRESENT_NEGATIVE + INDICITIVE_FUTURE + INDICITIVE_FUTURE_NEGATIVE

TENSELESS_RELATIVE = []

TENSELESS_REL = []

for subj in POS_SUBJECT_NO_A:
    for obj in POS_OBJECT:
        TENSELESS_REL.append(subj + obj)
        
TENSELESS_REL_NEG = []

for subj in POS_SUBJECT_NO_A:
    for ob in OBJ_REL:
        TENSELESS_REL_NEG.append(subj + "si" + ob)
        
TENSELESS_RELATIVE = TENSELESS_REL + TENSELESS_REL_NEG

CONTEXTUAL = []

SITUATIONAL = []

for subj in POS_SUBJECT_NO_A:
    for obj in POS_OBJECT:
        SITUATIONAL.append(subj + "ki" + obj)
    
CONSECUTIVE = []

for subj in POS_SUBJECT_NO_A:
    for obj in POS_OBJECT:
        CONSECUTIVE.append(subj + "ka" + obj)
    
CONTEXTUAL = SITUATIONAL + CONSECUTIVE

IMPERATIVE = []

SUBJUNCTIVE = []

for obj in POS_OBJECT:
    IMPERATIVE.append((obj, "e"))
    IMPERATIVE.append((obj, "eni"))
    IMPERATIVE.append(("ka" + obj, "e"))
    IMPERATIVE.append(("ka" + obj, "eni"))

for subj in POS_SUBJECT_NO_A:
    for obj in POS_OBJECT:
        SUBJUNCTIVE.append((subj + obj, "e"))
        SUBJUNCTIVE.append((subj + "si" + obj, "e"))
        SUBJUNCTIVE.append((subj + "ka" + obj, "e"))
        
IMPER_SUBJ = IMPERATIVE + SUBJUNCTIVE

In [9]:
#list for noun and adjective pairs where adjectives take on the prefix of the noun

NOUN_PREFIXES = {1: ("mw", "m"), 2: ("wa", "w"), 3: ("mw", "m"), 4: ("mi", "my"), 5: ("ji", "j"), 6: ("ma"), 
                7: ("ki", "ch"), 8: ("vi", "vy"), 9: ( "ny", "n"), 10: ("ny", "n"), 11: ("uw", "w", "u"), 
                    14: ("uw", "w", "u"), 15: ("ku", "kw")}

ADJECTIVE_PREFIXES = {1: ("mwe", "mwi", "m"), 2: ("wa", "we"), 3: ("mwe", "mwi", "m"),  4: ("mi", "mye"), 
                      5: ("ji", "je"), 6: ("ma", "me"), 7: ("ki", "che"), 8: ("vi", "vye"), 
                      9: ("ny", "nye", "mb", "nd", "ng", "nj", "mv", "mb", "nz"),   
                      10: ("nye", "ny", "mb", "nd", "ng", "nj", "mv", "mb", "nz"), 11: ("mwi", "mwe", "m"), 
                      14: ("mwi", "mwe", "m"), 15: ("kwi", "kwe", "ku")}

SHARED_PREFIX = {}

for i in range(1, 16):
    if i == 12 or i == 13:
        continue
    SHARED_PREFIX[i] = []
    for n_p in NOUN_PREFIXES[i]:
        for a_p in ADJECTIVE_PREFIXES[i]:
            SHARED_PREFIX[i].append((n_p, a_p))

In [10]:
# manually tagged most frequent words left over

MANUAL = {}

MANUAL["katika"] = "PREP"
MANUAL["kuwa"] = "VERB"
MANUAL["kama"] = "PREP"
MANUAL["yake"] = "PREP"
MANUAL["wake"] = "PREP"
MANUAL["baada"] = "PREP"
MANUAL["kutoka"] = "PREP"
MANUAL["wakati"] = "ADV"
MANUAL["nchini"] = "NOUN"
MANUAL["sasa"] = "ADV"
MANUAL["pia"] = "ADV"
MANUAL["au"] = "PREP"
MANUAL["kuhusu"] = "PREP"
MANUAL["serikali"] = "NOUN"
MANUAL["kila"] = "ADJ"
MANUAL["eneo"] = "NOUN"
MANUAL["moja"] = "NOUN"
MANUAL["sababu"] = "NOUN"
MANUAL["kazi"] = "NOUN"
MANUAL["yao"] = "PREP"
MANUAL["wao"] = "PREP"
MANUAL["kati"] = "PREP"
MANUAL["mara"] = "NOUN"
MANUAL["kufanya"] = "VERB"
MANUAL["hadi"] = "PREP"
MANUAL["kupitia"] = "PREP"
MANUAL["kisiasa"] = "ADJ"
MANUAL["nafasi"] = "NOUN"
MANUAL["nchi"] = "NOUN"
MANUAL["akasema"] = "VERB"
MANUAL["viongozi"] = "NOUN"
MANUAL["wengi"] = "ADJ"
MANUAL["uchaguzi"] = "NOUN"
MANUAL["kubwa"] = "ADJ"
MANUAL["mbalimbali"] = "ADJ"
MANUAL["dhidi"] = "ADV"
MANUAL["kabla"] = "ADV"
MANUAL["kaunti"] = "NOUN"
MANUAL["bila"] = "ADV"
MANUAL["tu"] = "ADV"
MANUAL["mtu"] = "NOUN"
MANUAL["shule"] = "NOUN"
MANUAL["wengine"] = "ADJ"
MANUAL["kutoa"] = "NOUN"
MANUAL["maeneo"] = "NOUN"
MANUAL["fedha"] = "NOUN"
MANUAL["zao"] = "PREP"
MANUAL["watoto"] = "NOUN"
MANUAL["njia"] = "NOUN"
MANUAL["wananchi"] = "NOUN"
MANUAL["huduma"] = "NOUN"
MANUAL["jamii"] = "NOUN"
MANUAL["taarifa"] = "NOUN"
MANUAL["dawa"] = "NOUN"
MANUAL["mkono"] = "NOUN"
MANUAL["juu"] = "PREP"
MANUAL["siasa"] = "NOUN"
MANUAL["chanjo"] = "NOUN"
MANUAL["kiongozi"] = "NOUN"
MANUAL["sheria"] = "NOUN"
MANUAL["habari"] = "NOUN"
MANUAL["mbali"] = "ADV"
MANUAL["virusi"] = "NOUN"
MANUAL["wetu"] = "PREP"
MANUAL["polisi"] = "NOUN"
MANUAL["tayari"] = "ADV"
MANUAL["mbili"] = "NOUN"
MANUAL["timu"] = "NOUN"
MANUAL["pesa"] = "NOUN"
MANUAL["chini"] = "ADV"
MANUAL["anasema"] = "VERB"
MANUAL["shughuli"] = "NOUN"
MANUAL["mchezo"] = "NOUN"
MANUAL["zake"] = "PREP"
MANUAL["bado"] = "ADV"
MANUAL["mechi"] = "NOUN"
MANUAL["nyingine"] = "ADJ"
MANUAL["leo"] = "ADV"
MANUAL["afya"] = "NOUN"
MANUAL["asilimia"] = "NOUN"
MANUAL["wiki"] = "NOUN"
MANUAL["kipindi"] = "NOUN"
MANUAL["wanafunzi"] = "NOUN"
MANUAL["ujenzi"] = "NOUN"
MANUAL["alama"] = "NOUN"
MANUAL["taifa"] = "NOUN"
MANUAL["kesi"] = "NOUN"
MANUAL["wanasiasa"] = "NOUN"
MANUAL["maafisa"] = "NOUN"
MANUAL["pili"] = "ADJ"
MANUAL["lugha"] = "NOUN"
MANUAL["nyumbani"] = "NOUN"
MANUAL["maendeleo"] = "NOUN"
MANUAL["urais"] = "NOUN"
MANUAL["kampuni"] = "NOUN"
MANUAL["mambo"] = "NOUN"
MANUAL["pekee"] = "ADJ"
MANUAL["tangu"] = "ADV"
MANUAL["jinsi"] = "NOUN"
MANUAL["wangu"] = "PREP"
MANUAL["ripoti"] = "NOUN"
MANUAL["mpya"] = "ADJ"
MANUAL["nyingi"] = "ADJ"
MANUAL["uwezo"] = "NOUN"
MANUAL["yetu"] = "PREP"
MANUAL["vizuri"] = "ADV"
MANUAL["wakazi"] = "NOUN"
MANUAL["hasa"] = "ADJ"
MANUAL["elimu"] = "NOUN"
MANUAL["mtoto"] = "NOUN"
MANUAL["masuala"] = "NOUN"
MANUAL["biashara"] = "NOUN"
MANUAL["msimu"] = "NOUN"
MANUAL["tena"] = "ADV"
MANUAL["tatu"] = "NOUN"
MANUAL["wawili"] = "NOUN"
MANUAL["yangu"] = "PREP"
MANUAL["jijini"] = "NOUN"
MANUAL["jambo"] = "NOUN"
MANUAL["bidhaa"] = "NOUN"
MANUAL["mswada"] = "NOUN"
MANUAL["nyumba"] = "NOUN"
MANUAL["kawaida"] = "ADV"
MANUAL["duniani"] = "NOUN"
MANUAL["changamoto"] = "NOUN"
MANUAL["ardhi"] = "NOUN"
MANUAL["saa"] = "NOUN"
MANUAL["mradi"] = "NOUN"
MANUAL["uhusiano"] = "NOUN"
MANUAL["kuu"] = "ADJ"
MANUAL["ushindi"] = "NOUN"
MANUAL["kadhaa"] = "ADJ"
MANUAL["fursa"] = "NOUN"
MANUAL["karibu"] = "ADV"
MANUAL["mfano"] = "NOUN"
MANUAL["vifaa"] = "NOUN"
MANUAL["kuanza"] = "VERB"
MANUAL["mahakama"] = "NOUN"
MANUAL["hatari"] = "NOUN"
MANUAL["familia"] = "NOUN"
MANUAL["kiasi"] = "NOUN"
MANUAL["mwezi"] = "NOUN"
MANUAL["matokeo"] = "NOUN"
MANUAL["dakika"] = "NOUN"
MANUAL["matumizi"] = "NOUN"
MANUAL["wanasema"] = "VERB"
MANUAL["sawa"] = "ADJ"
MANUAL["usiku"] = "NOUN"
MANUAL["kitaifa"] = "ADV"
MANUAL["kufikia"] = "VERB"
MANUAL["kwake"] = "PREP"
MANUAL["rasmi"] = "ADJ"
MANUAL["mdogo"] = "NOUN"
MANUAL["lengo"] = "NOUN"
MANUAL["usalama"] = "NOUN"
MANUAL["mpenzi"] = "NOUN"
MANUAL["rais"] = "NOUN"
MANUAL["idadi"] = "NOUN"
MANUAL["ugonjwa"] = "NOUN"
MANUAL["mazingira"] = "NOUN"
MANUAL["asema"] = "VERB"
MANUAL["mdogo"] = "NOUN"
MANUAL["lengo"] = "NOUN"
MANUAL["usalama"] = "NOUN"
MANUAL["mpenzi"] = "NOUN"
MANUAL["rais"] = "NOUN"
MANUAL["idadi"] = "NOUN"
MANUAL["ugonjwa"] = "NOUN"
MANUAL["mazingira"] = "NOUN"
MANUAL["asema"] = "VERB"
MANUAL["wazazi"] = "NOUN"
MANUAL["teknolojia"] = "NOUN"
MANUAL["sekta"] = "NOUN"
MANUAL["nini"] = "PREP"
MANUAL["janga"] = "NOUN"
MANUAL["maambukizi"] = "NOUN"
MANUAL["ndoa"] = "NOUN"
MANUAL["kuendelea"] = "VERB"
MANUAL["badala"] = "ADV"
MANUAL["mfumo"] = "NOUN"
MANUAL["uongozi"] = "NOUN"
MANUAL["bilioni"] = "NOUN"
MANUAL["sita"] = "NOUN"
MANUAL["moyo"] = "NOUN"
MANUAL["madai"] = "NOUN"
MANUAL["raia"] = "NOUN"
MANUAL["mama"] = "NOUN"
MANUAL["mke"] = "NOUN"
MANUAL["jina"] = "NOUN"
MANUAL["maji"] = "NOUN"
MANUAL["kushiriki"] = "VERB"
MANUAL["klabu"] = "NOUN"
MANUAL["soka"] = "NOUN"
MANUAL["mengi"] = "ADJ"
MANUAL["lake"] = "PREP"
MANUAL["kwenda"] = "VERB"
MANUAL["uchumi"] = "NOUN"
MANUAL["nguvu"] = "NOUN"
MANUAL["yeye"] = "PREP"
MANUAL["mashtaka"] = "NOUN"
MANUAL["msingi"] = "NOUN"
MANUAL["kikosi"] = "NOUN"
MANUAL["kuweka"] = "VERB"
MANUAL["wala"] = "PREP"
MANUAL["juhudi"] = "NOUN"
MANUAL["dunia"] = "NOUN"
MANUAL["mtandao"] = "NOUN"
MANUAL["chake"] = "PREP"
MANUAL["nyuma"] = "NOUN"

In [11]:
#function that iterates through every two words in a given set

def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)


In [12]:
#create pandas dataframe that holds tokens, with punctuation and numbers included
#
tokens = re.findall(r"[\w']+|[.,!?;]", text)
df = pd.DataFrame()
df['Word'] = tokens

In [13]:
#takes in the text corpus and tags data, returning a dictionary formatted as 'token': 'tag'
def posTagger(text):
    tokens = re.findall(r"[\w']+|[.,!?;]", text)
    starting_size = len(tokens)
    print("Starting Text Corpus Size:", starting_size, "tokens")
    
    tags = {}

    #tags proper nouns
    for x, y in pairwise(tokens):
        if x not in string.punctuation and y[0].isupper():
            if y not in tags:
                tags[y] = "PROPN"
    
    #decreases size of token list for future loops by removing tagged data after tagging some data
    tokens = [t for t in tokens if t not in tags.keys()]
    proper = starting_size - len(tokens)
    print("Total Proper Nouns Marked:", len(tags))
            
    #all punctuation other than periods taking out with regular language when setting 'tokens' variable    
    tags["."] = "PUNCT"
    punct = len(tokens)
    tokens = [t for t in tokens if t not in tags.keys()]
    punct -= len(tokens)
    print("Total Periods Marked:", punct)

    #numbers represented by digits, not with words
    for token in tokens:
        if token.isdigit():
            if token not in tags:
                tags[token] = "NUMBER"
                
    numbers = len(tokens)
    tokens = [t for t in tokens if t not in tags.keys()]
    numbers -= len(tokens)
    print("Total Numbers Marked:", numbers)

    #prepositions and pronouns
    for token in tokens:
        if token.endswith("enye"):
            if token not in tags:
                tags[token] = "PREP"
            continue
        for prep in PREPOSITIONS:
            if token == prep:
                if token not in tags:
                    tags[token] = "PREP"
                break
    
    preps = len(tokens)
    tokens = [t for t in tokens if t not in tags.keys()]
    preps -= len(tokens)
    print("Total Prepositions/Pronouns Marked:", preps)

    #determiners
    for token in tokens:
        if token.endswith("ote") or token.endswith("pi") or token.endswith("enyewe"):
            if token not in tags:
                tags[token] = "DETER"
            continue
        for d in DETERMINERS:
            if token == d:
                if token not in tags:
                    tags[token] = "DETER"
                break
            
    deter = len(tokens)
    tokens = [t for t in tokens if t not in tags.keys()]
    deter -= len(tokens)
    print("Total Determiners Marked:", deter)

    #relative adjectives
    for token in tokens:
        for r_a in RELATIVE_ADJECTIVES_PREFIXES:
            if token.startswith(r_a):
                if token not in tags:
                    tags[token] = "ADJ"
                break
            
    rel_adj = len(tokens)
    tokens = [t for t in tokens if t not in tags.keys()]
    rel_adj -= len(tokens)
    print("Total Relative Adjectives Marked:", rel_adj)

    #verbs with known suffixes
    for token in tokens:
        for k in KNOWN_VERB_SUFFIX:
            if token.endswith(k):
                if token not in tags:
                    tags[token] = "VERB"
                break

    tokens = [t for t in tokens if t not in tags.keys()]
    
    #irrealis verbs and other known verbs
    for token in tokens:
        for i in IRREALIS_VERB:
            if i in token:
                if token not in tags:
                    tags[token] = "VERB"
                break
            
            
        for v in VERB_EQUALITY:
            if token == v:
                if token not in tags:
                    tags[token] = "VERB"
                break  
            
    tokens = [t for t in tokens if t not in tags.keys()]

    #indicitive verbs
    for token in tokens:            
        for i in INDICITIVE:
            if token.startswith(i):
                if token not in tags:
                    tags[token] = "VERB"
                break
            
    tokens = [t for t in tokens if t not in tags.keys()]
            
    #relative and contextual verbs
    for token in tokens:
        temp = TENSELESS_RELATIVE + CONTEXTUAL
        for i in temp:
            if token.startswith(i) and not (token == i):
                if token not in tags:
                    tags[token] = "VERB"
                break
            
    tokens = [t for t in tokens if t not in tags.keys()]
        
    #imperative and subjunctive verbs
    for token in tokens:
        for i in IMPER_SUBJ:
            if token.startswith(i[0]) and token.endswith(i[1]) and len(token) > 4:
                if token not in tags:
                    tags[token] = "VERB"
                break


    verbs = len(tokens)
    tokens = [t for t in tokens if t not in tags.keys()]
    verbs -= len(tokens)
    print("Total Verbs Marked:", verbs)

    #known concord pairs, where adjectives take on nouns
    marked = False
    for x, y in pairwise(tokens):
        if marked == True:
            marked = False
            continue
        for i in range(1, 16):
            if i == 12 or i == 13:
                continue
            if x.startswith(SHARED_PREFIX[i][0]) and y.startswith(SHARED_PREFIX[i][1]):
                if x not in tags:
                    tags[x] = "NOUN"
                if x not in tags:
                    tags[x] = "ADJ"
                marked = True
            break
        
    concord = len(tokens)
    tokens = [t for t in tokens if t not in tags.keys()]
    concord -= len(tokens)
    print("Total Concord Noun-Adjectives Marked:", concord)

    #manually tagged data
    for token in tokens:
        if token in MANUAL:
            tags[token] = MANUAL[token]
        
    manual = len(tokens)
    tokens = [t for t in tokens if t not in tags.keys()]
    manual -= len(tokens)
    print("Total Words Manually Marked:", manual)

    print("Total Tokens Tagged:", starting_size - len(tokens), "tokens")
    return tags
    
    

In [14]:
#store tags in pandas dataframe as a new column called "PoS"
def createTraining():
    pos = []
    i = 0
    for token in tokens:
        pos.append("NaN")
        if token in TAGS:
            pos[i] = TAGS[token]
        i += 1
    df['PoS'] = pos
    with open('sw_training.txt', 'w+') as f:
        for word in TAGS:
            f.write("{word}\{tag} ".format(word = word, tag = TAGS[word]))
    print("sw_training.txt file created!")

    
    

In [15]:
TAGS = posTagger(text)

Starting Text Corpus Size: 543774 tokens
Total Proper Nouns Marked: 11127
Total Periods Marked: 17699
Total Numbers Marked: 7073
Total Prepositions/Pronouns Marked: 96208
Total Determiners Marked: 18683
Total Relative Adjectives Marked: 9266
Total Verbs Marked: 4962
Total Concord Noun-Adjectives Marked: 180
Total Words Manually Marked: 85888
Total Tokens Tagged: 434834 tokens


In [16]:
createTraining()

sw_training.txt file created!


In [17]:
print("Unique Swahili words tagged:", len(TAGS))
print("Lexical Diversity:", len(set(tokens))/len(tokens))

Unique Swahili words tagged: 31043
Lexical Diversity: 0.08383997763776863
