## Keyness Analysis of American vs. Peninsular Spanish

Author: Nicole Dodd

In [106]:
import os, re, statsmodels
import pandas as pd
import statsmodels.api as sm
import scipy as sp
import matplotlib as mpl
from nltk.corpus import SpanishReader
from nltk import FreqDist
from collections import defaultdict
from math import log

### Dialectal keyness

In [76]:
## load corpora files

american_corpus = []
peninsular_corpus = []

punct = ['-', '--', '\'', '\"', ';', '[', '—', '✚', '§', '#', '´', '<', '>', '+',
        '’', '=', ':', '$', ',', '“', '”', '»', '«', '"', ';', '¿', '?', '¡', '!',
        '_', '.', '(', ')', '[', ']', '{', '}', '*', '^', '-', ']', '...']
# hashed copy of `punct` for constant-time membership tests (the list itself is left untouched)
punct_lookup = frozenset(punct)

# the corpus folder can be overridden with the SPANISH_CORPORA_DIR environment variable;
# when it is not set, the original hard-coded location is used
corpora_dir = os.environ.get(
    'SPANISH_CORPORA_DIR',
    'C:/Users/nicol/OneDrive/Documents/Education/Graduate - UCD/2020 Summer/GSR Spanish Corpus/corpora-files/')
corpora_files = os.listdir(corpora_dir)

for file in corpora_files:
    myspreader = SpanishReader.SpanishPlaintextCorpusReader(corpora_dir, file)
    text = myspreader.words()
    # file names containing 'a-xix-' or 'a-xx-' go to the American corpus; every other file is treated as Peninsular
    if 'a-xix-' in file or 'a-xx-' in file:
        target_corpus = american_corpus
    else:
        target_corpus = peninsular_corpus
    # drop punctuation tokens and all-digit tokens; lower-case the rest for the freq dist
    target_corpus.extend(t.lower() for t in text if t not in punct_lookup and not t.isdigit())

In [77]:
## corpus sizes: number of tokens kept in each corpus list, plus the combined total

am_corp_n, pen_corp_n = len(american_corpus), len(peninsular_corpus)
total_corp = am_corp_n + pen_corp_n

# printed one per line: American, Peninsular, combined
print(am_corp_n, pen_corp_n, total_corp, sep = '\n')

1614703
1419469
3034172


In [78]:
## vocabulary (set of distinct tokens) and type count for each corpus
am_vocab, pen_vocab = set(american_corpus), set(peninsular_corpus)
am_corp_type, pen_corp_type = len(am_vocab), len(pen_vocab)

# printed one per line: American types, Peninsular types
print(am_corp_type, pen_corp_type, sep = '\n')

102240
83683


In [79]:
## token frequency distribution for each corpus (American first, then Peninsular)

am_fd, pen_fd = (FreqDist(tokens) for tokens in (american_corpus, peninsular_corpus))

In [80]:
## shared vocabulary: words present in both the American and the Peninsular vocabulary sets

span_vocab = list(am_vocab & pen_vocab)

In [81]:
## build table of freq distributions of common vocabulary

# set copy of the shared vocabulary: membership tests against the `span_vocab` list
# would scan the whole list for every word type in the corpus
common_vocab = set(span_vocab)

corpora_freq = defaultdict(list) # word -> [American count, Peninsular count]

# single pass in American freq-dist order; every shared word also has a Peninsular count
for (key, value) in am_fd.items():
    if key in common_vocab:
        corpora_freq[key].extend((value, pen_fd[key]))

# convert to dataframe (one row per shared word)
corpora_freq_df = pd.DataFrame.from_dict(corpora_freq, orient = 'index', columns = ['American', 'Peninsular'])
print(corpora_freq_df)

           American  Peninsular
don            2344        1902
pedro           465         284
de            91615       81677
lara              5           5
doña            866         795
...             ...         ...
coloco            1           2
apuraré           1           1
intercede         1           1
recetas           1           1
epidemias         1           1

[45816 rows x 2 columns]


### Stats
**chi-squared statistic:** Determines whether there is a statistically significant difference between the *expected* frequencies and *observed* frequencies in one or more categories of a contingency table. A measure of independence.<br><br>
**log likelihood ratio:** Determines whether the distribution of A given B is the same as the distribution of A without B. If the words A and B occur independently, then we would expect *p(AB)* = *p(A)p(B)*. A measure of independence.<br><br>
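**odds ratio:** Determines the strength of association or non-independence between data values. A measure of correlation. The code below reports the natural logarithm of the odds ratio (the log odds ratio), so positive values mark words that are relatively more frequent in the target corpus and negative values mark words that are relatively more frequent in the reference corpus.<br><br>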
**Kullback-Leibler (KL) divergence:** Quantifies the difference between two probability distributions. Less conflated with frequency than LLR (Gries forthcoming).<br><br>

### American dialectal keyness

In [82]:
## run suite of keyness tests

# initialize dicts to gather data (all keyed by word)
am_pearson_chi = {}
am_log_likelihood = {}
am_odds_ratio = {}

for word in span_vocab:
    
    # build 2x2 contingency table: rows = corpus (American first, Peninsular second),
    # columns = [count of this word, count of all other tokens in that corpus]
    am_cont_table = [[am_fd[word], am_corp_n - am_fd[word]], [pen_fd[word], pen_corp_n - pen_fd[word]]]
    am_cont_table_stats = sm.stats.Table(am_cont_table)
    
    # get Pearson chi-squared statistic
    chi = am_cont_table_stats.test_nominal_association()
    am_pearson_chi[word] = [chi.statistic, chi.pvalue]
    
    # compare to same measure with scipy (note: same values as with statsmodels as long as correction = False)
    am_sp_chi = sp.stats.chi2_contingency(am_cont_table, correction = False)
    
    # get log likelihood ratio
    # the result is bound to `llr` rather than `log`: the name `log` is math.log (imported at the top)
    # and must stay callable for the KL-divergence cell when the notebook is run top to bottom
    llr = sp.stats.chi2_contingency(am_cont_table, correction = False, lambda_ = 'log-likelihood')
    am_log_likelihood[word] = [llr[0], llr[1], llr[2]] # statistic, p-value, degrees of freedom
    
    # get odds ratio (stored as the LOG odds ratio; American is the first row of the table)
    odds = am_cont_table_stats.local_log_oddsratios
    am_odds_ratio[word] = odds[0][0]

In [83]:
## turn the per-word result dicts into dataframes indexed by word

am_pearson_chi_df = pd.DataFrame.from_dict(
    am_pearson_chi,
    orient = 'index',
    columns = ['chi-statistic', 'pvalue'],
)

# for the log-likelihood table the 'chi-statistic' column holds the log-likelihood statistic
am_log_likelihood_df = pd.DataFrame.from_dict(
    am_log_likelihood,
    orient = 'index',
    columns = ['chi-statistic', 'pvalue', 'deg-of-freedom'],
)

# 'odds' holds the log odds ratio collected in the keyness loop
am_odds_ratio_df = pd.DataFrame.from_dict(
    am_odds_ratio,
    orient = 'index',
    columns = ['odds'],
)

#### Pearson chi-statistic

In [84]:
## preview the Pearson chi-squared results for every shared word (American as target)
print(am_pearson_chi_df)

              chi-statistic    pvalue
irregulares        0.871869  0.350438
asesina            0.278217  0.597873
estragado          1.279240  0.258040
evite              3.219605  0.072761
precio            10.360744  0.001287
...                     ...       ...
padrino            3.923273  0.047622
convenciones       2.185802  0.139288
mandobles          0.350845  0.553635
dés                0.008315  0.927344
letras             5.264201  0.021768

[45816 rows x 2 columns]


In [85]:
## keep only words whose Pearson p-value is at or below 0.05, then show them from smallest p-value up
# NOTE(review): no multiple-comparison correction is applied, and the p-value alone does not say
# which corpus a word favours -- the sign of the log odds ratio is needed for direction

am_pearson_chi_sig = am_pearson_chi_df.loc[am_pearson_chi_df['pvalue'].le(0.05)]
print(am_pearson_chi_sig.sort_values(by = 'pvalue'))

           chi-statistic    pvalue
alicia        101.360102  0.000000
cap.           89.466362  0.000000
apolonio      197.445153  0.000000
inés          108.541734  0.000000
barcelona      71.062309  0.000000
...                  ...       ...
ofendido        3.857771  0.049516
presto          3.857771  0.049516
estarán         3.857771  0.049516
santuario       3.857771  0.049516
eres            3.855227  0.049591

[6052 rows x 2 columns]


In [86]:
## write the significant Pearson results to an Excel workbook (path is relative to the working directory)

am_pearson_chi_sig.to_excel(excel_writer = 'am-pearson-chi-sig.xlsx')

#### Log likelihood

In [87]:
## preview the log-likelihood results for every shared word (American as target)
print(am_log_likelihood_df)

              chi-statistic    pvalue  deg-of-freedom
irregulares        0.893510  0.344528               1
asesina            0.277550  0.598312               1
estragado          1.320830  0.250443               1
evite              3.451404  0.063198               1
precio            10.356034  0.001291               1
...                     ...       ...             ...
padrino            4.077198  0.043466               1
convenciones       2.420457  0.119760               1
mandobles          0.350968  0.553566               1
dés                0.008298  0.927419               1
letras             5.330527  0.020955               1

[45816 rows x 3 columns]


In [88]:
## keep only words whose log-likelihood p-value is at or below 0.05, then show them from smallest p-value up
# NOTE(review): no multiple-comparison correction is applied, and the p-value alone does not say
# which corpus a word favours -- the sign of the log odds ratio is needed for direction

am_log_likelihood_sig = am_log_likelihood_df.loc[am_log_likelihood_df['pvalue'].le(0.05)]
print(am_log_likelihood_sig.sort_values(by = 'pvalue'))

           chi-statistic         pvalue  deg-of-freedom
á            2431.760738   0.000000e+00               1
i            2389.914930   0.000000e+00               1
do           1199.566146  7.578579e-263               1
y             947.343192  5.014976e-208               1
nieves        699.911134  3.126286e-154               1
...                  ...            ...             ...
furiosos        3.848933   4.977764e-02               1
cantares        3.848933   4.977764e-02               1
insensato       3.848933   4.977764e-02               1
cuadro          3.846686   4.984440e-02               1
eres            3.845578   4.987733e-02               1

[6484 rows x 3 columns]


In [89]:
## write the significant log-likelihood results to an Excel workbook (path is relative to the working directory)

am_log_likelihood_sig.to_excel(excel_writer = 'am-log-likelihood-sig.xlsx')

#### Odds ratio

In [90]:
## preview the log odds ratio of every shared word (American as the first row of the contingency table)
print(am_odds_ratio_df)

                  odds
irregulares   0.564281
asesina      -0.352013
estragado    -1.227482
evite        -1.738309
precio       -0.495521
...                ...
padrino       0.764957
convenciones  1.480572
mandobles    -0.534334
dés          -0.128868
letras        0.352125

[45816 rows x 1 columns]


In [91]:
## keep words with a non-negative log odds ratio (zero included), shown from weakest to strongest

am_odds_ratio_pos = am_odds_ratio_df.loc[am_odds_ratio_df['odds'].ge(0)]
print(am_odds_ratio_pos.sort_values(by = 'odds'))

                odds
teatros     0.000344
volvían     0.000344
tiró        0.000344
importaba   0.000344
cubiertas   0.000344
...              ...
bruno       4.719405
leonardo    4.937081
villaverde  4.952635
baldomero   5.287371
méxico      5.326597

[18589 rows x 1 columns]


In [92]:
## write the non-negative log odds ratios to an Excel workbook (path is relative to the working directory)

am_odds_ratio_pos.to_excel(excel_writer = 'am-odds-ratio-pos.xlsx')

### Peninsular dialectal keyness

In [93]:
## run suite of keyness tests

# initialize dicts to gather data (all keyed by word)
pen_pearson_chi = {}
pen_log_likelihood = {}
pen_odds_ratio = {}

for word in span_vocab:
    
    # build 2x2 contingency table: rows = corpus (Peninsular first, American second),
    # columns = [count of this word, count of all other tokens in that corpus]
    pen_cont_table = [[pen_fd[word], pen_corp_n - pen_fd[word]], [am_fd[word], am_corp_n - am_fd[word]]]
    pen_cont_table_stats = sm.stats.Table(pen_cont_table)
    
    # get Pearson chi-squared statistic
    chi = pen_cont_table_stats.test_nominal_association()
    pen_pearson_chi[word] = [chi.statistic, chi.pvalue]
    
    # compare to same measure with scipy (note: same values as with statsmodels as long as correction = False)
    pen_sp_chi = sp.stats.chi2_contingency(pen_cont_table, correction = False)
    
    # get log likelihood ratio
    # the result is bound to `llr` rather than `log`: the name `log` is math.log (imported at the top)
    # and must stay callable for the KL-divergence cell when the notebook is run top to bottom
    llr = sp.stats.chi2_contingency(pen_cont_table, correction = False, lambda_ = 'log-likelihood')
    pen_log_likelihood[word] = [llr[0], llr[1], llr[2]] # statistic, p-value, degrees of freedom
    
    # get odds ratio (stored as the LOG odds ratio; Peninsular is the first row of the table)
    odds = pen_cont_table_stats.local_log_oddsratios
    pen_odds_ratio[word] = odds[0][0]

In [94]:
## turn the per-word result dicts into dataframes indexed by word

pen_pearson_chi_df = pd.DataFrame.from_dict(
    pen_pearson_chi,
    orient = 'index',
    columns = ['chi-statistic', 'pvalue'],
)

# for the log-likelihood table the 'chi-statistic' column holds the log-likelihood statistic
pen_log_likelihood_df = pd.DataFrame.from_dict(
    pen_log_likelihood,
    orient = 'index',
    columns = ['chi-statistic', 'pvalue', 'deg-of-freedom'],
)

# 'odds' holds the log odds ratio collected in the keyness loop
pen_odds_ratio_df = pd.DataFrame.from_dict(
    pen_odds_ratio,
    orient = 'index',
    columns = ['odds'],
)

#### Pearson chi-statistic

In [95]:
## preview the Pearson chi-squared results for every shared word (Peninsular as target)
print(pen_pearson_chi_df)

              chi-statistic    pvalue
irregulares        0.871869  0.350438
asesina            0.278217  0.597873
estragado          1.279240  0.258040
evite              3.219605  0.072761
precio            10.360744  0.001287
...                     ...       ...
padrino            3.923273  0.047622
convenciones       2.185802  0.139288
mandobles          0.350845  0.553635
dés                0.008315  0.927344
letras             5.264201  0.021768

[45816 rows x 2 columns]


In [96]:
## keep only words whose Pearson p-value is at or below 0.05, then show them from smallest p-value up
# NOTE(review): no multiple-comparison correction is applied, and the p-value alone does not say
# which corpus a word favours -- the sign of the log odds ratio is needed for direction

pen_pearson_chi_sig = pen_pearson_chi_df.loc[pen_pearson_chi_df['pvalue'].le(0.05)]
print(pen_pearson_chi_sig.sort_values(by = 'pvalue'))

           chi-statistic    pvalue
alicia        101.360102  0.000000
cap.           89.466362  0.000000
apolonio      197.445153  0.000000
inés          108.541734  0.000000
barcelona      71.062309  0.000000
...                  ...       ...
ofendido        3.857771  0.049516
presto          3.857771  0.049516
estarán         3.857771  0.049516
santuario       3.857771  0.049516
eres            3.855227  0.049591

[6052 rows x 2 columns]


In [97]:
## write the significant Pearson results to an Excel workbook (path is relative to the working directory)

pen_pearson_chi_sig.to_excel(excel_writer = 'pen-pearson-chi-sig.xlsx')

#### Log likelihood

In [98]:
## preview the log-likelihood results for every shared word (Peninsular as target)
print(pen_log_likelihood_df)

              chi-statistic    pvalue  deg-of-freedom
irregulares        0.893510  0.344528               1
asesina            0.277550  0.598312               1
estragado          1.320830  0.250443               1
evite              3.451404  0.063198               1
precio            10.356034  0.001291               1
...                     ...       ...             ...
padrino            4.077198  0.043466               1
convenciones       2.420457  0.119760               1
mandobles          0.350968  0.553566               1
dés                0.008298  0.927419               1
letras             5.330527  0.020955               1

[45816 rows x 3 columns]


In [99]:
## keep only words whose log-likelihood p-value is at or below 0.05, then show them from smallest p-value up
# NOTE(review): no multiple-comparison correction is applied, and the p-value alone does not say
# which corpus a word favours -- the sign of the log odds ratio is needed for direction

pen_log_likelihood_sig = pen_log_likelihood_df.loc[pen_log_likelihood_df['pvalue'].le(0.05)]
print(pen_log_likelihood_sig.sort_values(by = 'pvalue'))

           chi-statistic         pvalue  deg-of-freedom
á            2431.760738   0.000000e+00               1
i            2389.914930   0.000000e+00               1
do           1199.566146  7.578579e-263               1
y             947.343192  5.014976e-208               1
nieves        699.911134  3.126286e-154               1
...                  ...            ...             ...
furiosos        3.848933   4.977764e-02               1
cantares        3.848933   4.977764e-02               1
insensato       3.848933   4.977764e-02               1
cuadro          3.846686   4.984440e-02               1
eres            3.845578   4.987733e-02               1

[6484 rows x 3 columns]


In [100]:
## write the significant log-likelihood results to an Excel workbook (path is relative to the working directory)

pen_log_likelihood_sig.to_excel(excel_writer = 'pen-log-likelihood-sig.xlsx')

#### Odds ratio

In [101]:
## preview the log odds ratio of every shared word (Peninsular as the first row of the contingency table)
print(pen_odds_ratio_df)

                  odds
irregulares  -0.564281
asesina       0.352013
estragado     1.227482
evite         1.738309
precio        0.495521
...                ...
padrino      -0.764957
convenciones -1.480572
mandobles     0.534334
dés           0.128868
letras       -0.352125

[45816 rows x 1 columns]


In [102]:
## keep words with a non-negative log odds ratio (zero included), shown from weakest to strongest

pen_odds_ratio_pos = pen_odds_ratio_df.loc[pen_odds_ratio_df['odds'].ge(0)]
print(pen_odds_ratio_pos.sort_values(by = 'odds'))

              odds
dignidad  0.000035
ardiente  0.000035
suerte    0.000327
grito     0.000390
dió       0.000487
...            ...
batiste   5.354745
azorín    5.499657
gray      5.763854
leto      6.110561
regina    6.197598

[27227 rows x 1 columns]


In [103]:
## write the non-negative log odds ratios to an Excel workbook (path is relative to the working directory)

pen_odds_ratio_pos.to_excel(excel_writer = 'pen-odds-ratio-pos.xlsx')

### KL divergence

In [124]:
def kl_contingency(span_vocab, target_fd = None, reference_fd = None, target_n = None, reference_n = None):
    """Compute a per-word KL divergence (in bits) between two corpora.

    For each word, the distribution of its occurrences over the two corpora,
    P(corpus | word), is compared with the distribution of all tokens over the
    two corpora, P(corpus):

        KL = sum over corpora of P(corpus | word) * log2(P(corpus | word) / P(corpus))

    Parameters
    ----------
    span_vocab : iterable of str
        Words to score.
    target_fd, reference_fd : mapping of word -> count, optional
        Frequency distributions of the target and reference corpus.
        Default to the notebook globals `am_fd` and `pen_fd`.
    target_n, reference_n : int, optional
        Token counts of the target and reference corpus.
        Default to the notebook globals `am_corp_n` and `pen_corp_n`.

    Returns
    -------
    dict
        word -> KL divergence, in the iteration order of `span_vocab`.

    Raises
    ------
    ZeroDivisionError
        If a word has a count of zero in both frequency distributions.
    """
    # local import: the notebook's keyness cells have rebound the global name `log`,
    # so this function must not rely on it
    import math

    # fall back to the notebook's American (target) / Peninsular (reference) globals
    if target_fd is None:
        target_fd = am_fd
    if reference_fd is None:
        reference_fd = pen_fd
    if target_n is None:
        target_n = am_corp_n
    if reference_n is None:
        reference_n = pen_corp_n

    # corpus-level probabilities are the same for every word, so compute them once
    total_corpora = target_n + reference_n
    # probability of target corpus given corpora
    p_t_corpora = target_n / total_corpora
    # probability of reference corpus given corpora
    p_r_corpora = reference_n / total_corpora

    def _kl_term(p, q):
        # a zero-probability outcome contributes nothing (0 * log 0 is taken as 0)
        return p * math.log((p / q), 2) if p > 0 else 0.0

    output = {}

    for word in span_vocab:

        word_freq = target_fd[word] + reference_fd[word]

        # probability of target corpus given word
        p_am_word = target_fd[word] / word_freq
        # probability of reference corpus given word
        p_pen_word = reference_fd[word] / word_freq

        output[word] = _kl_term(p_am_word, p_t_corpora) + _kl_term(p_pen_word, p_r_corpora)

    return output
        

In [125]:
## score every shared word with the KL-divergence helper defined above

kl_divergence = kl_contingency(span_vocab = span_vocab)

In [126]:
## turn the word -> KL divergence dict into a one-column dataframe indexed by word

kl_divergence_df = pd.DataFrame.from_dict(
    kl_divergence,
    orient = 'index',
    columns = ['kl-divergence'],
)
print(kl_divergence_df)

              kl-divergence
irregulares        0.053711
asesina            0.022245
estragado          0.238194
evite              0.414943
precio             0.043683
...                     ...
padrino            0.094873
convenciones       0.290998
mandobles          0.050634
dés                0.002993
letras             0.021601

[45816 rows x 1 columns]


In [127]:
## show the KL table ordered from largest to smallest divergence

print(kl_divergence_df.sort_values(by = 'kl-divergence', ascending = False))

           kl-divergence
regina      1.071967e+00
leto        1.070108e+00
gray        1.061217e+00
azorín      1.052525e+00
batiste     1.046905e+00
...                  ...
tiró        2.119619e-08
importaba   2.119619e-08
suerte      1.921486e-08
ardiente    2.238996e-10
dignidad    2.238996e-10

[45816 rows x 1 columns]


In [128]:
## write the (unsorted) KL-divergence table to an Excel workbook (path is relative to the working directory)

kl_divergence_df.to_excel(excel_writer = 'kl-divergence.xlsx')

### Next steps:
Incorporate text dispersion as a measure of frequency