In [1]:
from nltk import *

###### Import CategorizedPlaintextCorpusReader

In [2]:
from nltk.corpus import CategorizedPlaintextCorpusReader

In [3]:
#Read every .txt file under ./SetswanaCorpus2. The category of a file is the part of its
#path captured by cat_pattern (Pol, Bus, Rel, Ref, Gra or Sci).
#Both patterns are raw strings so that '\.' reaches the regex engine as an escaped dot
#instead of being an invalid string escape sequence.
ctr = CategorizedPlaintextCorpusReader(
    './SetswanaCorpus2',
    r'.*\.txt',
    cat_pattern=r'(Pol|Bus|Rel|Ref|Gra|Sci)/.*',
    encoding='UTF-16',
)

In [4]:
ctr.categories()

['Bus', 'Gra', 'Pol', 'Ref', 'Rel', 'Sci']

In [5]:
#Tokens of every file in the 'Ref' category (used as the reference corpus in later cells)
ref_wrds = ctr.words(categories='Ref')

In [6]:
#Tokens of every file in the 'Pol' category (used as the politics/target corpus in later cells)
pol_wrds = ctr.words(categories='Pol')

In [7]:
#Total number of tokens in the reference corpus (used as a row total in the contingency tables)
rsize = len(ref_wrds)

In [8]:
#Total number of tokens in the politics corpus (used as a row total in the contingency tables)
psize = len(pol_wrds)

In [9]:
#Frequency of each token in the reference corpus
rfd = FreqDist(ref_wrds)

In [10]:
#Frequency of each token in the politics corpus
pfd = FreqDist(pol_wrds)

In [11]:
#make a set with every key that appears in at least one of the two freq_dists (set union)
cmn_voc = set(rfd) | set(pfd)

In [12]:
#vocabulary (distinct tokens) of the reference corpus
rset = set(rfd)

In [13]:
#vocabulary (distinct tokens) of the politics corpus
pset = set(pfd)

In [14]:
#tokens that occur in both vocabularies
rpset = rset & pset

### What stats?

In [15]:
import numpy as np
import scipy as sp
import matplotlib as mpl

In [16]:
import statsmodels

In [17]:
import statsmodels.api as sm

One set of tools for analyzing categorical data comes from the `statsmodels` package. 

[contingency tables](http://www.statsmodels.org/stable/contingency_tables.html)

The most used method is to construct a `Table` object from a simple array of data, and then use the methods defined for `Table` to do the statistics. E.g. `test_nominal_association()` will test using the chi-square statistic.

We can also get the odds ratio, which is a measure of association (the chi2 statistic is a measure of independence). For the odds ratio, a value of 1 signals no association (equal odds), but departures in either direction indicate association. The log odds removes the asymmetry, and gives 0 as a measure of no association.

The package has many other functions, including calculation of residuals. 

The other package is `scipy.stats`, which contains functions for tables: `chi2_contingency()` and `fisher_exact`.

The functions take an array-like object, and some other arguments, and return the statistics. By default, the `chi2_contingency()` function applies the Yates correction and uses Pearson's chi-square statistic. These parameters can be changed (`correction=False` to run without Yates; `lambda_='pearson'`/`'log-likelihood'`/`'cressie-read'` to choose the statistic). 

fisher_exact reports the odds ratio, but not the log odds.

+ To find the **p-value**, use `sp.stats.chi2_contingency(data, correction=False, lambda_='log-likelihood')`
+ To find the **log odds**, first build a Table with `mytable = sm.stats.Table(data)`, then do `mytable.local_log_oddsratios` (or `.cumulative_log_oddsratios`)

In [52]:
#Run a chi-square test to see if there is a significant difference
#NOTE(review): table_m is not created in any of the cells shown here - confirm where it is built
rslt_m = table_m.test_nominal_association()

In [53]:
#Get the local log odds ratios of the same table (a value of 0 means no association)
rslt_lo = table_m.local_log_oddsratios

In [55]:
#A negative log_odds indicates an inverse association, i.e. more of the keyword in the
#religion corpus
print(rslt_lo[0][0])

-2.78004683648006


In [56]:
print(rslt_m)

df          1
pvalue      0.0003304788641766976
statistic   12.889227189414756


In [42]:
#create a 2x2 array with the frequencies of a key word in each sub-corpus:
#rows = politics, reference; columns = count of the key word, count of all other tokens
key = 'kafa'
ttable=[[pfd[key],psize-pfd[key]],[rfd[key],rsize-rfd[key]]]
#wrap the array in a statsmodels Table to get the association test and the log odds
table_n = sm.stats.Table(ttable)
rslt_na = table_n.test_nominal_association()
rslt_lo = table_n.local_log_oddsratios
#print the table, the test result and the single log odds value of the 2x2 table
print(table_n, rslt_na, rslt_lo[0][0])

A 2x2 contingency table with counts:
[[2.000000e+00 3.994500e+04]
 [9.800000e+01 1.005098e+06]] df          1
pvalue      0.3418998683267588
statistic   0.9032944985618001 -0.6664834778676738


In [43]:
print(table_n)

A 2x2 contingency table with counts:
[[2.000000e+00 3.994500e+04]
 [9.800000e+01 1.005098e+06]]


In [44]:
#log-likelihood test on the same table, without Yates correction; the result holds
#the statistic, the p-value, the degrees of freedom and the expected frequencies
tresult = sp.stats.chi2_contingency(ttable, correction=False, lambda_='log-likelihood')

In [45]:
print(tresult)

(1.0880349316064617, 0.29690701493265376, 1, array([[3.82215639e+00, 3.99431778e+04],
       [9.61778436e+01, 1.00509982e+06]]))


In [46]:
#find the actual count and the expected count
print(ttable[0][0], tresult[3][0][0])

2 3.822156393909733


In [25]:
#find the words that occur in the reference corpus but never in the politics corpus
#(the words keep the iteration order of rfd)
excl_wrds = [wrd for wrd in rfd if wrd not in pfd]

In [27]:
print(excl_wrds[0:50])

['Oosi', 'Ditshupo', 'batshwantshi', 'Tshipi', 'eno', 'kgabisa', 'Holo', 'Ntlo', 'Kgolo', 'sebakeng', 'mebala', 'Difikantswe', 'pente', 'letseleng', 'matsela', 'kgabisitsweng', 'ikadile', 'boatlhamo', 'mabotana', 'ditshupo', 'mabutswa', 'setlabosheng', 'KETLOGETSWE', 'loeto', 'matshwenyego', 'Philip', 'Segola', 'mogolwane', 'lephateng', 'botshwantshi', 'tshwenyeng', 'motlhakga', 'latlhegile', 'tshekatshekong', 'Mox', 'Gotshwanetse', 'Velias', 'Ndaba', 'bofetshwana', 'tshwantshitse', 'phiri', 'tshetlha', 'tsatsi', 'dikela', 'ditshephe', 'fula', 'tswelelela', 'thiba', 'dikgoma', 'pitse']


In [28]:
pfd['fula']

0

In [47]:
#Choose the vocabulary to test.
#True:  every word that occurs in either corpus (words with a zero count in one corpus are kept)
#False: only the words that occur in both corpora (rules out zero counts)
INCLUDE_ZERO_COUNT_WORDS = True

if INCLUDE_ZERO_COUNT_WORDS:
    cmn_voc = set(rfd.keys()).union(set(pfd.keys()))
else:
    cmn_voc = set(rfd.keys()).intersection(set(pfd.keys()))

#prepare a dict to store the results:
#word -> (p-value, log-odds, expected count of the word in the politics corpus)
tswana_keywords = {}

#Cycle through the vocabulary in sorted order, so that the dict is filled in the same
#order on every run (the iteration order of a set of strings can change between kernels)
for key in sorted(cmn_voc):
    p_count = pfd[key]
    r_count = rfd[key]

    #build a contingency table:
    #rows = politics, reference; columns = count of the word, count of all other tokens
    ttable = [[p_count, psize - p_count], [r_count, rsize - r_count]]

    #Find the log-odds value
    lo_table = sm.stats.Table(ttable)
    lo = lo_table.local_log_oddsratios[0][0]

    #Find the p-value using the log-likelihood test
    ll = sp.stats.chi2_contingency(ttable, correction=False, lambda_='log-likelihood')

    #Store every word with its p-value, its log-odds and its expected count in the
    #politics corpus; the selection by p-value or by log-odds is done in later cells
    tswana_keywords[key] = (ll[1], lo, ll[3][0][0])

#But why select according to the p-value? Why not select according to LO?

In [52]:
#Keep only the words whose p-value is lower than 0.000001 and whose actual frequency in the
#target corpus is higher than the expected frequency
#stored value: (p-value, actual count in target corpus, expected count, count in reference corpus)

tswana_ll_keywords = {}
for k, stats in tswana_keywords.items():
    p, e = stats[0], stats[2]
    is_keyword = (p < 0.000001) and (pfd[k] > e)
    if is_keyword:
        tswana_ll_keywords[k] = (p, pfd[k], e, rfd[k])

In [53]:
#List the selected keywords from the smallest p-value to the largest
sorted(tswana_ll_keywords.items(), key= lambda x : x[1][0])

[('Ie', (1.26460101467272e-295, 898, 208.57507441565411, 4559)),
 ('SE', (5.77900424496046e-100, 77, 3.248832934823273, 8)),
 ('BNF', (1.40436210068122e-97, 86, 4.319036725117998, 27)),
 ('Committee', (3.6438973593835566e-85, 64, 2.6372879117977157, 5)),
 ('Comrade', (9.267006939414969e-78, 56, 2.216850708467645, 2)),
 ('Botswana', (5.006394411223483e-74, 175, 32.48832934823273, 675)),
 ('2016', (1.0526050046580421e-65, 70, 4.701252364508972, 53)),
 ('Congress', (1.8889934098808472e-55, 41, 1.6817488133202825, 3)),
 ('ANC', (1.8925196490978686e-52, 41, 1.7964135051375745, 6)),
 ('bophasalatsi', (2.8351358900828035e-51, 38, 1.5670841215029905, 3)),
 ('Go', (2.214548525987668e-49, 271, 98.07653306772374, 2295)),
 ('Koma', (5.2216707130539377e-48, 40, 1.9110781969548665, 10)),
 ('Central', (1.885337806971239e-47, 36, 1.5288625575638932, 4)),
 ('sechaba', (1.1072927381681717e-46, 43, 2.3697369642240345, 19)),
 ('National', (5.815664058644259e-46, 46, 2.8666172954322997, 29)),
 ('Aretikele'

In [65]:
#Get the keys with a positive LO value that is higher than an arbitrary threshold
LO_THRESHOLD = 4

tswana_lo_keywords = {}
for k, stats in tswana_keywords.items():
    o = stats[1]
    if o > LO_THRESHOLD:
        #stored value: (log-odds, count in politics corpus, count in reference corpus)
        tswana_lo_keywords[k] = (o, pfd[k], rfd[k])

In [66]:
#List the selected keywords from the highest log-odds value to the lowest
sorted(tswana_lo_keywords.items(), key= lambda x : x[1][0], reverse=True)

[('Aretikele', (7.320480090257284, 30, 0)),
 ('SECHABA', (7.054601576018673, 23, 0)),
 ('Constituency', (7.010124766171106, 22, 0)),
 ('Concerned', (6.963579703886827, 21, 0)),
 ('Cancus', (6.914764493695328, 20, 0)),
 ('TIwe', (6.752170429894953, 17, 0)),
 ('Parliamentary', (6.626957200541895, 15, 0)),
 ('Comrade', (6.558989612322749, 56, 2)),
 ('MDDA', (6.557939286796131, 14, 0)),
 ('Tebe', (6.483806273010698, 13, 0)),
 ('Executive', (6.361453400627374, 23, 1)),
 ('Bomorafe', (6.316702106965273, 11, 0)),
 ('sechabeng', (6.221366887410342, 10, 0)),
 ('dikgololosego', (6.115981332628886, 9, 0)),
 ('Parliamentory', (5.998173258475811, 8, 0)),
 ('Coordinator', (5.998173258475811, 8, 0)),
 ('eb', (5.864791111404832, 14, 1)),
 ('Ditšhaba', (5.864616827981509, 7, 0)),
 ('GOLA', (5.864616827981509, 7, 0)),
 ('region', (5.864616827981509, 7, 0)),
 ('Khonferenseng', (5.864616827981509, 7, 0)),
 ('Congress', (5.841367931362748, 41, 3)),
 ('Committee', (5.776427855171465, 64, 5)),
 ('bophasalats

In [22]:
#Split the stored results into two dicts according to the sign of the Log-Odds value
#(index 1 of each stored tuple)

#one dict for the politics (target) keywords, one for the reference corpus keywords
tswana_p_keywords = {}
tswana_r_keywords = {}

for kw, stats in tswana_keywords.items():
    #a positive LO value goes to the target dict; anything else (negative or exactly 0)
    #goes to the reference dict
    bucket = tswana_p_keywords if stats[1] > 0 else tswana_r_keywords
    bucket[kw] = stats

In [23]:
#Sort target keywords by ascending LO value (index 1 of the stored tuple),
#so the smallest positive values are listed first

#from operator import itemgetter
sorted(tswana_p_keywords.items(), key= lambda x : x[1][1])

[('e', (2.678312845594523e-10, 0.18572101451623269)),
 ('mo', (3.6690065785236385e-08, 0.22175118171060326)),
 ('nna', (0.008187821571953826, 0.23008338198197187)),
 ('gore', (2.3518348789689515e-10, 0.318763118941888)),
 ('go', (1.9016599110532382e-38, 0.33514568504168984)),
 ('ya', (3.232289211356065e-26, 0.35502556432879473)),
 ('tiro', (0.039364309820541246, 0.37303345410084976)),
 ('(', (4.050970694717927e-06, 0.3770835280488196)),
 ('yone', (0.003912200468109146, 0.380672004161446)),
 ('dilo', (0.02628331771722996, 0.38224755868311444)),
 ('tsa', (1.4520398501168806e-14, 0.38669146767920104)),
 (').', (0.039790846581792494, 0.41022827410450535)),
 ('pele', (0.0015999112043193534, 0.44032877239116974)),
 ('one', (0.01090218809853443, 0.4422753760613434)),
 ('thata', (0.00017276473804811806, 0.4662718125141305)),
 ('mongwe', (6.157972116400123e-05, 0.4746723239285773)),
 ('di', (1.0255584257479374e-27, 0.48799541972769767)),
 ('jo', (0.0013281903692903179, 0.4984172018618178)),
 ('

In [64]:
#Sort target keywords by ascending p value (index 0 of the stored tuple)

#from operator import itemgetter
sorted(tswana_p_keywords.items(), key= lambda x : x[1][0])

[('Ie', (1.26460101467272e-295, 1.6188862965093325)),
 ('SE', (5.77900424496046e-100, 5.491669586455961)),
 ('BNF', (1.40436210068122e-97, 4.3860229935318795)),
 ('Committee', (3.6438973593835566e-85, 5.776427855171465)),
 ('Comrade', (9.267006939414969e-78, 6.558989612322749)),
 ('Botswana', (5.006394411223483e-74, 1.879176225892964)),
 ('2016', (1.0526050046580421e-65, 3.5052887108385584)),
 ('Congress', (1.8889934098808472e-55, 5.841367931362748)),
 ('ANC', (1.8925196490978686e-52, 5.148217766296867)),
 ('bophasalatsi', (2.8351358900828035e-51, 5.765306850545289)),
 ('Go', (2.214548525987668e-49, 1.0935363067033617)),
 ('Koma', (5.2216707130539377e-48, 4.612670492010978)),
 ('Central', (1.885337806971239e-47, 5.423506449235202)),
 ('sechaba', (1.1072927381681717e-46, 4.043203491418456)),
 ('National', (5.815664058644259e-46, 3.687853156107066)),
 ('tse', (3.900125795777096e-43, 0.7452304837778545)),
 ('/', (2.1269216277906345e-41, 1.1543462189640818)),
 ('Tebelopele', (6.85690758926

In [65]:
#Sort reference keywords by ascending LO value (index 1 of the stored tuple),
#so the most negative values are listed first

#from operator import itemgetter
sorted(tswana_r_keywords.items(), key= lambda x : x[1][1])

[('."', (1.4113895346527211e-25, -4.109791358466299)),
 ('me', (1.4140635604034731e-14, -2.949307279445528)),
 ('Rre', (6.935126716518967e-08, -2.9341567699229767)),
 ('sekao', (9.403620206157624e-08, -2.9170908246564853)),
 ('ile', (2.370492025465585e-06, -2.7151400756093773)),
 ('Ee', (6.3390546980302406e-06, -2.6442398609733315)),
 ('twe', (1.2991235792891966e-05, -2.589054571398078)),
 ('sala', (2.2872258436666318e-05, -2.543230108040472)),
 ('fitlha', (3.5940694286529226e-05, -2.5049969535196555)),
 ('Iwa', (9.915461801237385e-05, -2.413224971574822)),
 ('<', (4.497180734254686e-24, -2.3873868459700027)),
 ('botlhoko', (0.00013896909204999105, -2.380663412595827)),
 ('kile', (0.00016144451107646252, -2.3658443464180996)),
 ('ena', (0.00020983382794504952, -2.3393708075810347)),
 ('koo', (0.00039619904638763865, -2.272001709177638)),
 ('yoo', (0.00039619904638763865, -2.272001709177638)),
 ('kete', (0.0006199224248584333, -2.2215589147532198)),
 ('Morago', (0.0006434610353920629, -