## Usage of *tú* vs. *usted* and *vosotros* vs. *ustedes* in American and Peninsular Spanish

Author: Nicole Dodd

In [2]:
import sys, os, re
from nltk.corpus import SpanishReader
import pandas as pd

### Overall analysis of American vs. Peninsular usage

In [3]:
## Load corpora files and split their tokens into American vs. Peninsular

american_corpus = []
peninsular_corpus = []

# Location of the corpus files. The original local path is kept as the default;
# set the SPANISH_CORPORA_DIR environment variable to run the notebook elsewhere
# (keep the trailing slash, as in the default).
corpora_dir = os.environ.get(
    'SPANISH_CORPORA_DIR',
    'C:/Users/nicol/OneDrive/Documents/Education/Graduate - UCD/2020 Summer/GSR Spanish Corpus/corpora-files/')
corpora_files = os.listdir(corpora_dir)

for file in corpora_files:
    myspreader = SpanishReader.SpanishPlaintextCorpusReader(corpora_dir, file)
    text = myspreader.words()
    # File names containing 'a-xix-' or 'a-xx-' are treated as American.
    is_american = 'a-xix-' in file or 'a-xx-' in file
    # NOTE(review): every other directory entry is counted as Peninsular, so any
    # stray non-corpus file in the folder would end up in peninsular_corpus --
    # confirm the directory only holds corpus files.
    target = american_corpus if is_american else peninsular_corpus
    for t in text:
        target.append(t)

In [4]:
## Corpus sizes (number of tokens), American first

a_corp_n = len(american_corpus)
p_corp_n = len(peninsular_corpus)
print(a_corp_n, p_corp_n, sep='\n')

1922409
1686552


In [5]:
## Search for ust/vos in corpora

def count_word(word, corpus):
    """Count the tokens in ``corpus`` whose lower-cased form equals ``word``.

    Only the tokens are lower-cased; ``word`` is compared as given, so it
    should be passed in lower case.

    Args:
        word: the string to look for.
        corpus: an iterable of string tokens.

    Returns:
        The number of matching tokens (0 for an empty corpus).
    """
    return sum(1 for token in corpus if token.lower() == word)

# Surface forms (lower case) that are counted for each pronoun.
# NOTE(review): 'tu' is matched without an accent; if the corpus keeps
# diacritics this would match the possessive 'tu' rather than the pronoun
# 'tú' -- confirm against how the reader normalizes tokens.
tu = ['tu']
usted = ['usted', 'vd.', 'ud.']
vosotros = ['vosotros', 'vosotras']
ustedes = ['ustedes', 'vds.']
yo = ['yo']
nosotros = ['nosotros', 'nosotras', 'nos.']

# Each total adds up the counts of all forms of one pronoun in one corpus
# (a_ = American, p_ = Peninsular).
a_tu = sum(count_word(form, american_corpus) for form in tu)
p_tu = sum(count_word(form, peninsular_corpus) for form in tu)

a_usted = sum(count_word(form, american_corpus) for form in usted)
p_usted = sum(count_word(form, peninsular_corpus) for form in usted)

a_vosotros = sum(count_word(form, american_corpus) for form in vosotros)
p_vosotros = sum(count_word(form, peninsular_corpus) for form in vosotros)

a_ustedes = sum(count_word(form, american_corpus) for form in ustedes)
p_ustedes = sum(count_word(form, peninsular_corpus) for form in ustedes)

a_yo = sum(count_word(form, american_corpus) for form in yo)
p_yo = sum(count_word(form, peninsular_corpus) for form in yo)

a_nosotros = sum(count_word(form, american_corpus) for form in nosotros)
p_nosotros = sum(count_word(form, peninsular_corpus) for form in nosotros)

In [6]:
## Tabulate the raw pronoun counts per variety

counts_dict = {
    'American': [a_tu, a_usted, a_vosotros, a_ustedes, a_yo, a_nosotros],
    'Peninsular': [p_tu, p_usted, p_vosotros, p_ustedes, p_yo, p_nosotros],
}

# One row per pronoun, in the same order as the lists above.
total_counts = pd.DataFrame(
    counts_dict,
    index=['tu', 'usted', 'vosotros', 'ustedes', 'yo', 'nosotros'],
)

print(total_counts)
print('\nSize of American corpus: ', a_corp_n, sep='')
print('\nSize of Peninsular corpus: ', p_corp_n, '\n', sep='')
# Column sums: total pronoun tokens found in each variety.
print(total_counts.sum())

          American  Peninsular
tu            1176         782
usted         2250        3117
vosotros        97          90
ustedes        377         329
yo            4442        4016
nosotros       822         636

Size of American corpus: 1922409

Size of Peninsular corpus: 1686552

American      9164
Peninsular    8970
dtype: int64


In [7]:
## Share of each pronoun within its number class (singular / plural)

# Denominators: all second-person singular / plural tokens per variety.
a_sing = a_tu + a_usted
p_sing = p_tu + p_usted
a_plur = a_vosotros + a_ustedes
p_plur = p_vosotros + p_ustedes

# Each value is count / class total, rounded to two decimals.
prop_dict_ampen = {
    'American': [round(count / total, 2) for count, total in (
        (a_tu, a_sing), (a_usted, a_sing),
        (a_vosotros, a_plur), (a_ustedes, a_plur))],
    'Peninsular': [round(count / total, 2) for count, total in (
        (p_tu, p_sing), (p_usted, p_sing),
        (p_vosotros, p_plur), (p_ustedes, p_plur))],
}

proportions = pd.DataFrame(
    prop_dict_ampen,
    index=['tu:sing', 'usted:sing', 'vosotros:plur', 'ustedes:plur'],
)

print(proportions)

               American  Peninsular
tu:sing            0.34        0.20
usted:sing         0.66        0.80
vosotros:plur      0.20        0.21
ustedes:plur       0.80        0.79


### Analysis of newspaper vs. other domains

In [8]:
## Load corpora files, split by variety and by newspaper (noticias) vs. other

am_noticias_corpus = []
am_other_corpus = []
pen_noticias_corpus = []
pen_other_corpus = []

for file in corpora_files:
    reader = SpanishReader.SpanishPlaintextCorpusReader(corpora_dir, file)
    text = reader.words()
    # Pick the destination list from markers in the file name.
    if 'a-xix-' in file or 'a-xx-' in file:
        # American: only 'a-xix-j-' files count as noticias.
        target = am_noticias_corpus if 'a-xix-j-' in file else am_other_corpus
    else:
        # Everything else is treated as Peninsular; 'p-xix-j-' marks noticias.
        target = pen_noticias_corpus if 'p-xix-j-' in file else pen_other_corpus
    for token in text:
        target.append(token)

In [9]:
## Corpus sizes (number of tokens) for each variety x domain split

a_not_n = len(am_noticias_corpus)
a_oth_n = len(am_other_corpus)
p_not_n = len(pen_noticias_corpus)
p_oth_n = len(pen_other_corpus)
print(a_not_n, a_oth_n, p_not_n, p_oth_n, sep='\n')

537999
1384410
302096
1384456


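NB: the non-newspaper ("other") corpora are much larger than the newspaper (*noticias*) corpora — about 1.38M tokens each, versus about 0.54M (American) and 0.30M (Peninsular) — so raw counts are not directly comparable between the two groups.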

In [10]:
## Count each pronoun in the four variety x domain corpora
## (a_/p_ = American/Peninsular, not/oth = noticias/other)

a_not_tu = sum(count_word(form, am_noticias_corpus) for form in tu)
a_oth_tu = sum(count_word(form, am_other_corpus) for form in tu)
p_not_tu = sum(count_word(form, pen_noticias_corpus) for form in tu)
p_oth_tu = sum(count_word(form, pen_other_corpus) for form in tu)

a_not_usted = sum(count_word(form, am_noticias_corpus) for form in usted)
a_oth_usted = sum(count_word(form, am_other_corpus) for form in usted)
p_not_usted = sum(count_word(form, pen_noticias_corpus) for form in usted)
p_oth_usted = sum(count_word(form, pen_other_corpus) for form in usted)

a_not_vosotros = sum(count_word(form, am_noticias_corpus) for form in vosotros)
a_oth_vosotros = sum(count_word(form, am_other_corpus) for form in vosotros)
p_not_vosotros = sum(count_word(form, pen_noticias_corpus) for form in vosotros)
p_oth_vosotros = sum(count_word(form, pen_other_corpus) for form in vosotros)

a_not_ustedes = sum(count_word(form, am_noticias_corpus) for form in ustedes)
a_oth_ustedes = sum(count_word(form, am_other_corpus) for form in ustedes)
p_not_ustedes = sum(count_word(form, pen_noticias_corpus) for form in ustedes)
p_oth_ustedes = sum(count_word(form, pen_other_corpus) for form in ustedes)

a_not_yo = sum(count_word(form, am_noticias_corpus) for form in yo)
a_oth_yo = sum(count_word(form, am_other_corpus) for form in yo)
p_not_yo = sum(count_word(form, pen_noticias_corpus) for form in yo)
p_oth_yo = sum(count_word(form, pen_other_corpus) for form in yo)

a_not_nosotros = sum(count_word(form, am_noticias_corpus) for form in nosotros)
a_oth_nosotros = sum(count_word(form, am_other_corpus) for form in nosotros)
p_not_nosotros = sum(count_word(form, pen_noticias_corpus) for form in nosotros)
p_oth_nosotros = sum(count_word(form, pen_other_corpus) for form in nosotros)

In [11]:
## Tabulate the raw pronoun counts per variety x domain (noticias vs. other)

counts_dict_noticias = {
    'Am-Noticias': [a_not_tu, a_not_usted, a_not_vosotros,
                    a_not_ustedes, a_not_yo, a_not_nosotros],
    'Am-Other': [a_oth_tu, a_oth_usted, a_oth_vosotros,
                 a_oth_ustedes, a_oth_yo, a_oth_nosotros],
    'Pen-Noticias': [p_not_tu, p_not_usted, p_not_vosotros,
                     p_not_ustedes, p_not_yo, p_not_nosotros],
    'Pen-Other': [p_oth_tu, p_oth_usted, p_oth_vosotros,
                  p_oth_ustedes, p_oth_yo, p_oth_nosotros],
}

# One row per pronoun, in the same order as the lists above.
total_counts_noticias = pd.DataFrame(
    counts_dict_noticias,
    index=['tu', 'usted', 'vosotros', 'ustedes', 'yo', 'nosotros'],
)

print(total_counts_noticias)
print('\nSize of American-Noticias corpus: ', a_not_n, sep='')
print('\nSize of American-Other corpus: ', a_oth_n, sep='')
print('\nSize of Peninsular-Noticias corpus: ', p_not_n, sep='')
print('\nSize of Peninsular-Other corpus: ', p_oth_n, '\n', sep='')
# Column sums: total pronoun tokens found in each sub-corpus.
print(total_counts_noticias.sum())

          Am-Noticias  Am-Other  Pen-Noticias  Pen-Other
tu                 72      1104            25        757
usted              54      2196             7       3110
vosotros           51        46             6         84
ustedes            94       283            12        317
yo                762      3680            76       3940
nosotros          248       574            83        553

Size of American-Noticias corpus: 537999

Size of American-Other corpus: 1384410

Size of Peninsular-Noticias corpus: 302096

Size of Peninsular-Other corpus: 1384456

Am-Noticias     1281
Am-Other        7883
Pen-Noticias     209
Pen-Other       8761
dtype: int64


In [12]:
## Share of each pronoun within its number class, per variety x domain

# Denominators: all second-person singular / plural tokens per sub-corpus.
a_not_sing = a_not_tu + a_not_usted
a_not_plur = a_not_vosotros + a_not_ustedes
a_oth_sing = a_oth_tu + a_oth_usted
a_oth_plur = a_oth_vosotros + a_oth_ustedes
p_not_sing = p_not_tu + p_not_usted
p_not_plur = p_not_vosotros + p_not_ustedes
p_oth_sing = p_oth_tu + p_oth_usted
p_oth_plur = p_oth_vosotros + p_oth_ustedes

# Each value is count / class total, rounded to two decimals.
prop_dict_noticias = {
    'Am-Noticias': [round(count / total, 2) for count, total in (
        (a_not_tu, a_not_sing), (a_not_usted, a_not_sing),
        (a_not_vosotros, a_not_plur), (a_not_ustedes, a_not_plur))],
    'Am-Other': [round(count / total, 2) for count, total in (
        (a_oth_tu, a_oth_sing), (a_oth_usted, a_oth_sing),
        (a_oth_vosotros, a_oth_plur), (a_oth_ustedes, a_oth_plur))],
    'Pen-Noticias': [round(count / total, 2) for count, total in (
        (p_not_tu, p_not_sing), (p_not_usted, p_not_sing),
        (p_not_vosotros, p_not_plur), (p_not_ustedes, p_not_plur))],
    'Pen-Other': [round(count / total, 2) for count, total in (
        (p_oth_tu, p_oth_sing), (p_oth_usted, p_oth_sing),
        (p_oth_vosotros, p_oth_plur), (p_oth_ustedes, p_oth_plur))],
}

proportions_noticias = pd.DataFrame(
    prop_dict_noticias,
    index=['tu:sing', 'usted:sing', 'vosotros:plur', 'ustedes:plur'],
)

print(proportions_noticias)

               Am-Noticias  Am-Other  Pen-Noticias  Pen-Other
tu:sing               0.57      0.33          0.78       0.20
usted:sing            0.43      0.67          0.22       0.80
vosotros:plur         0.35      0.14          0.33       0.21
ustedes:plur          0.65      0.86          0.67       0.79


### Analysis of non-fiction vs. fiction

In [13]:
## Load corpora files, split by variety and by non-fiction vs. fiction

am_nonfiction_corpus = []
am_fiction_corpus = []
pen_nonfiction_corpus = []
pen_fiction_corpus = []

for file in corpora_files:
    reader = SpanishReader.SpanishPlaintextCorpusReader(corpora_dir, file)
    text = reader.words()
    # Pick the destination list from markers in the file name.
    if 'a-xix-' in file or 'a-xx-' in file:
        # American non-fiction: 'a-xix-j-', 'a-xix-n-' or 'a-xx-n-' files.
        is_nonfiction = any(marker in file
                            for marker in ('a-xix-j-', 'a-xix-n-', 'a-xx-n-'))
        target = am_nonfiction_corpus if is_nonfiction else am_fiction_corpus
    else:
        # Everything else is treated as Peninsular; non-fiction is
        # 'p-xix-j-' or 'p-xix-n-'.
        is_nonfiction = any(marker in file
                            for marker in ('p-xix-j-', 'p-xix-n-'))
        target = pen_nonfiction_corpus if is_nonfiction else pen_fiction_corpus
    for token in text:
        target.append(token)

In [14]:
## Corpus sizes (number of tokens) for each variety x fiction/non-fiction split

a_non_n = len(am_nonfiction_corpus)
a_fic_n = len(am_fiction_corpus)
p_non_n = len(pen_nonfiction_corpus)
p_fic_n = len(pen_fiction_corpus)
print(a_non_n, a_fic_n, p_non_n, p_fic_n, sep='\n')

882545
1039864
411780
1274772


In [15]:
## Count each pronoun in the four variety x domain corpora
## (a_/p_ = American/Peninsular, non/fic = non-fiction/fiction)

a_non_tu = sum(count_word(form, am_nonfiction_corpus) for form in tu)
a_fic_tu = sum(count_word(form, am_fiction_corpus) for form in tu)
p_non_tu = sum(count_word(form, pen_nonfiction_corpus) for form in tu)
p_fic_tu = sum(count_word(form, pen_fiction_corpus) for form in tu)

a_non_usted = sum(count_word(form, am_nonfiction_corpus) for form in usted)
a_fic_usted = sum(count_word(form, am_fiction_corpus) for form in usted)
p_non_usted = sum(count_word(form, pen_nonfiction_corpus) for form in usted)
p_fic_usted = sum(count_word(form, pen_fiction_corpus) for form in usted)

a_non_vosotros = sum(count_word(form, am_nonfiction_corpus) for form in vosotros)
a_fic_vosotros = sum(count_word(form, am_fiction_corpus) for form in vosotros)
p_non_vosotros = sum(count_word(form, pen_nonfiction_corpus) for form in vosotros)
p_fic_vosotros = sum(count_word(form, pen_fiction_corpus) for form in vosotros)

a_non_ustedes = sum(count_word(form, am_nonfiction_corpus) for form in ustedes)
a_fic_ustedes = sum(count_word(form, am_fiction_corpus) for form in ustedes)
p_non_ustedes = sum(count_word(form, pen_nonfiction_corpus) for form in ustedes)
p_fic_ustedes = sum(count_word(form, pen_fiction_corpus) for form in ustedes)

a_non_yo = sum(count_word(form, am_nonfiction_corpus) for form in yo)
a_fic_yo = sum(count_word(form, am_fiction_corpus) for form in yo)
p_non_yo = sum(count_word(form, pen_nonfiction_corpus) for form in yo)
p_fic_yo = sum(count_word(form, pen_fiction_corpus) for form in yo)

a_non_nosotros = sum(count_word(form, am_nonfiction_corpus) for form in nosotros)
a_fic_nosotros = sum(count_word(form, am_fiction_corpus) for form in nosotros)
p_non_nosotros = sum(count_word(form, pen_nonfiction_corpus) for form in nosotros)
p_fic_nosotros = sum(count_word(form, pen_fiction_corpus) for form in nosotros)

In [16]:
## Tabulate the raw pronoun counts per variety x domain (non-fiction vs. fiction)

counts_dict_domain = {
    'Am-Nonfiction': [a_non_tu, a_non_usted, a_non_vosotros,
                      a_non_ustedes, a_non_yo, a_non_nosotros],
    'Am-Fiction': [a_fic_tu, a_fic_usted, a_fic_vosotros,
                   a_fic_ustedes, a_fic_yo, a_fic_nosotros],
    'Pen-Nonfiction': [p_non_tu, p_non_usted, p_non_vosotros,
                       p_non_ustedes, p_non_yo, p_non_nosotros],
    'Pen-Fiction': [p_fic_tu, p_fic_usted, p_fic_vosotros,
                    p_fic_ustedes, p_fic_yo, p_fic_nosotros],
}

# One row per pronoun, in the same order as the lists above.
total_counts_domain = pd.DataFrame(
    counts_dict_domain,
    index=['tu', 'usted', 'vosotros', 'ustedes', 'yo', 'nosotros'],
)

print(total_counts_domain)
print('\nSize of American-Nonfiction corpus: ', a_non_n, sep='')
print('\nSize of American-Fiction corpus: ', a_fic_n, sep='')
print('\nSize of Peninsular-Nonfiction corpus: ', p_non_n, sep='')
print('\nSize of Peninsular-Fiction corpus: ', p_fic_n, '\n', sep='')
# Column sums: total pronoun tokens found in each sub-corpus.
print(total_counts_domain.sum())

          Am-Nonfiction  Am-Fiction  Pen-Nonfiction  Pen-Fiction
tu                  145        1031              28          754
usted               368        1882              89         3028
vosotros             69          28               6           84
ustedes             128         249              16          313
yo                 1424        3018             355         3661
nosotros            403         419             188          448

Size of American-Nonfiction corpus: 882545

Size of American-Fiction corpus: 1039864

Size of Peninsular-Nonfiction corpus: 411780

Size of Peninsular-Fiction corpus: 1274772

Am-Nonfiction     2537
Am-Fiction        6627
Pen-Nonfiction     682
Pen-Fiction       8288
dtype: int64


In [17]:
## Share of each pronoun within its number class, per variety x domain

# Denominators: all second-person singular / plural tokens per sub-corpus.
a_non_sing = a_non_tu + a_non_usted
a_non_plur = a_non_vosotros + a_non_ustedes
a_fic_sing = a_fic_tu + a_fic_usted
a_fic_plur = a_fic_vosotros + a_fic_ustedes
p_non_sing = p_non_tu + p_non_usted
p_non_plur = p_non_vosotros + p_non_ustedes
p_fic_sing = p_fic_tu + p_fic_usted
p_fic_plur = p_fic_vosotros + p_fic_ustedes

# Each value is count / class total, rounded to two decimals.
prop_dict_domain = {
    'Am-Nonfiction': [round(count / total, 2) for count, total in (
        (a_non_tu, a_non_sing), (a_non_usted, a_non_sing),
        (a_non_vosotros, a_non_plur), (a_non_ustedes, a_non_plur))],
    'Am-Fiction': [round(count / total, 2) for count, total in (
        (a_fic_tu, a_fic_sing), (a_fic_usted, a_fic_sing),
        (a_fic_vosotros, a_fic_plur), (a_fic_ustedes, a_fic_plur))],
    'Pen-Nonfiction': [round(count / total, 2) for count, total in (
        (p_non_tu, p_non_sing), (p_non_usted, p_non_sing),
        (p_non_vosotros, p_non_plur), (p_non_ustedes, p_non_plur))],
    'Pen-Fiction': [round(count / total, 2) for count, total in (
        (p_fic_tu, p_fic_sing), (p_fic_usted, p_fic_sing),
        (p_fic_vosotros, p_fic_plur), (p_fic_ustedes, p_fic_plur))],
}

proportions_domain = pd.DataFrame(
    prop_dict_domain,
    index=['tu:sing', 'usted:sing', 'vosotros:plur', 'ustedes:plur'],
)

print(proportions_domain)

               Am-Nonfiction  Am-Fiction  Pen-Nonfiction  Pen-Fiction
tu:sing                 0.28        0.35            0.24         0.20
usted:sing              0.72        0.65            0.76         0.80
vosotros:plur           0.35        0.10            0.27         0.21
ustedes:plur            0.65        0.90            0.73         0.79


### Appendix

#### Sanity check against Raul's original calculations

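Raul's `nos_vos_uds` result (American Spanish first in each pair; judging by the counts reproduced below, the pairs appear to be the *nosotros* counts, then the *vosotros* and *ustedes* counts, then the corpus sizes):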

[[569, 553], [[46, 84], [283, 317]], [1383512, 1383502]]

In [18]:
## Token counts of the corpora without newspapers (the "other" corpora)

print('American Spanish: ', a_oth_n, sep='')
print('\nPeninsular Spanish: ', p_oth_n, sep='')

American Spanish: 1384410

Peninsular Spanish: 1384456


In [23]:
## nosotros / vosotros / ustedes counts in the non-newspaper corpora

print('American Spanish:',
      '\nnos: ', a_oth_nosotros,
      '\nvos: ', a_oth_vosotros,
      '\nuds: ', a_oth_ustedes, '\n',
      sep='')

print('Peninsular Spanish:',
      '\nnos: ', p_oth_nosotros,
      '\nvos: ', p_oth_vosotros,
      '\nuds: ', p_oth_ustedes,
      sep='')

American Spanish:
nos: 574
vos: 46
uds: 283

Peninsular Spanish:
nos: 553
vos: 84
uds: 317


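The counts are almost identical to Raul's. The *vos* and *uds* counts match exactly; the American *nos* count is slightly higher here (574 vs. 569) because this analysis also counts the abbreviation 'nos.'. The corpus sizes differ slightly as well (1,384,410 vs. 1,383,512 for American; 1,384,456 vs. 1,383,502 for Peninsular).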