In [1]:
from pathlib import Path
import requests
from lxml import etree
from tqdm import tqdm
from collections import Counter
import pandas as pd
from typing import List
from dataclasses import dataclass, field

example journal-article-10.2307_4214358.xml

```xml
<?xml version="1.0" encoding="UTF-8"?>
<article xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xmlns:xlink="http://www.w3.org/1999/xlink"
         xmlns:mml="http://www.w3.org/1998/Math/MathML"
         dtd-version="1.0"
         article-type="research-article">
   <front>
      <journal-meta>
         <journal-id journal-id-type="jstor">slaveasteurorev2</journal-id>
         <journal-id journal-id-type="jstor">j101339</journal-id>
         <journal-title-group>
            <journal-title>The Slavonic and East European Review</journal-title>
         </journal-title-group>
         <publisher>
            <publisher-name>Maney Publishing</publisher-name>
         </publisher>
         <issn pub-type="ppub">00376795</issn>
         <issn pub-type="epub">22224327</issn>
         <custom-meta-group/>
      </journal-meta>
      <article-meta>
         <article-id pub-id-type="jstor">4214358</article-id>
         <article-categories>
            <subj-group>
               <subject>Language</subject>
            </subj-group>
         </article-categories>
         <title-group>
            <article-title>Russian and Soviet Loanwords and Calques in the Czech Lexicon since the Beginning of the Twentieth Century</article-title>
         </title-group>
         <contrib-group>
```

example journal-article-10.2307_4214358-ngram1.txt
```text
Russian	207
czech	163
soviet	111
from	108
were	98
which	97
s	86
have	79
lexical	72
terms	67
more	65
other	60
pp	59
```

In [2]:
hatebase = pd.read_csv('slavic_review_data/hatebase.csv')

hatebase

Unnamed: 0,vocabulary_id,term,hateful_meaning,nonhateful_meaning,is_unambiguous,is_unambiguous_in,average_offensiveness,language,plural_of,variant_of,...,is_about_religion,is_about_gender,is_about_sexual_orientation,is_about_disability,is_about_class,number_of_sightings,number_of_sightings_this_year,number_of_sightings_this_month,created_on,updated_on
0,CUNwTAUXn,मुल्ला,Calling all Muslims Mulla and giving bad words...,Muslim Cleric,False,,,hin,,,...,True,False,False,False,False,0,0,0,2020-07-30 14:56:35,2020-07-30 14:56:35
1,rcgtvVbun,Mulla,Calling all Muslims Mulla and giving bad words...,Muslim Cleric,False,,,hin,,,...,True,False,False,False,False,1,1,0,2020-07-28 21:26:39,2020-07-30 14:52:38
2,VyYrjVnfq,खतना,"The word ""katwa"" refers to the fact that the f...",,False,,,hin,,,...,True,True,False,False,False,0,0,0,2020-07-30 14:51:25,2020-07-30 14:51:25
3,HKhqikufd,कटवा,"The word ""katwa"" refers to the fact that the f...",,False,,,hin,,,...,True,True,False,False,False,0,0,0,2020-07-30 14:50:23,2020-07-30 14:50:23
4,p5BxVyLvx,katwa,"The word ""katwa"" refers to the fact that the f...",,False,,100.0,hin,,,...,True,True,False,False,False,2,1,0,2020-07-28 22:04:15,2020-07-30 14:48:14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3695,R6Mha4Epg,paki,"A person from South Asia or the Middle East, p...",,False,,90.0,eng,,,...,False,False,False,False,False,245,30,2,2013-04-02 04:00:00,2013-04-02 04:00:00
3696,4DP7xEU94,judas,Jew; derogatory by association with Judas Isca...,,False,,,spa,,,...,True,False,False,False,True,17,4,0,2013-04-01 05:55:56,2013-04-01 05:55:56
3697,hyxj2JhNJ,aracuano,"Arab, person of Lebanese, Syrian, Iraqi or oth...",,False,,,spa,,,...,False,False,False,False,False,0,0,0,2013-04-01 05:53:43,2013-04-01 05:53:43
3698,fVjd2aZKa,naco,A pejorative word often used in Mexican Spanis...,,False,MX,50.0,spa,,,...,False,False,False,False,True,2237,123,3,2013-04-01 05:32:13,2013-04-01 05:32:13


In [3]:
# Full list (but not very helpful)
hatebase = hatebase.term.str.lower().tolist() #all JSTOR terms are lowercase, so converting hatebase to lower

In [4]:
# A shorted list of term that are present in the corpus and suggest need for investigation
short_hatebase = [
"negro",
"nigger",
"retarded",
"yid",
"whore",
"retard",
"whores",
"kike",
"bitches",
"khokhol",
"redskin",
"faggot",
"khokhols",
"dykes",
"faggots",
"half breeds",
"darkie",
"sluts",
"tar baby",
"palefaces",
"yellows",
"zhidovka",
"homos",
"coons",
"fags",
"gey",
"suka",
"chukchi",
]

In [5]:
@dataclass
class Item:
    id: int = None
    author: str = None
    journal_title: str = None
    book_title: str = None
    book_chapter: str = None
    article_title: str = None
    year: str =None
    file: str = None
    collection: str = None
    jstor_url: str = None
    ngrams: List[tuple] = field(default_factory=list)
    
    #list of all tokens 
    def tokens(self):
        tokens = [a[0] for a in self.ngrams]
        return tokens
        
    #frequency for a specific token
    def token_freq(self, token:str):
        a = [a for a in self.ngrams if a[0] == token]
        return int(a[0][1])

    #frequencies for a list of token
    def token_list(self, tokens:list):
        return [a for a in self.ngrams if a[0] in tokens]
           
    def hatebase_terms(self):
        intersection = set([a[0].lower() for a in self.ngrams]).intersection(set(hatebase))
        return [a for a in self.ngrams if a[0].lower() in intersection] 
        
     

In [6]:
def make_items(path):
    current_dir = Path(path)
    return_items = []
    if (current_dir / 'metadata').exists():
        count = 0 
        for file in tqdm((current_dir / 'metadata').iterdir(), 
                         total=len(list((current_dir / 'metadata').iterdir()))):
            item = Item()
            item.id = count
            count += 1
            item.file = str(file.name)
            item.collection = str(current_dir).split('/')[-1]
            
            
            root = etree.fromstring(file.read_bytes())
            try:
                item.jstor_url = root.xpath('//self-uri')[0].attrib['{http://www.w3.org/1999/xlink}href']
            except IndexError:
                continue
            if 'article' in file.name:
                try:
                    item.journal_title = root.xpath('//journal-title')[0].text
                except IndexError:
                    continue
                if root.xpath('//copyright-year'):
                    item.year = root.xpath('//copyright-year')[0].text
                if root.xpath('//string-name'):
                    item.author = root.xpath('//string-name')[0].text #only first author
                if root.xpath('//article-title'): #TODO not working, no results
                    article_title = root.xpath('//article-title')[0].text 
            if 'book' in file.name:
                if root.xpath('//book-title'):
                    item.book_title = root.xpath('//book-title')[0].text
                book_part_id = file.stem.split('_')[-1]
                i = int(book_part_id.split('.')[-1])
                try:
                    item.book_chapter = root.xpath('//title')[i].text   
                except IndexError:
                    pass
            
            sub_dirs = ['ngram1','ngram2','ngram3']
            for sub_dir in sub_dirs:
                #book-chapter-10.2307_j.ctt1pc5dgp.4.xml maps to 
                #book-chapter-10.2307_j.ctt1pc5dgp.4-ngram1.txt
                txt_file = str(file.stem)+'-'+sub_dir+'.txt'
                ngram_file = (current_dir / sub_dir / txt_file )
                if ngram_file.exists():
                    text = ngram_file.read_text()
                    for line in text.split('\n'):
                        word=line.split('\t')[0]
                        freq=line.split('\t')[-1]
                        item.ngrams.append((word,freq))
            return_items.append(item)
            
        return return_items
                
                
    
    else:
        print('Missing metadata directory')
        raise SystemExit(0)
        
    
directories = ['/home/ajanco/projects/slavic_review/slavic_review_data/SlavicStudiesCluster1991to2020'] #,'/home/ajanco/projects/slavic_review/slavic_review_data/AfricanAmericanStudiesCluster1985to2020']
main_items = []
for directory in directories:
    main_items.extend(make_items(directory))

    

100%|██████████| 41250/41250 [03:21<00:00, 204.48it/s]


In [None]:
import pickle 
pickle.dump( main_items, open( "slav_stud_items.p", "wb" ) )



In [5]:
import pickle
main_items = pickle.load( open( "slav_stud_items.p", "rb" ) )

AttributeError: Can't get attribute 'Item' on <module '__main__'>

In [12]:
import gender_guesser.detector as gender
d = gender.Detector()
d.get_gender("Igal")

gender = [d.get_gender(a.author.split(' ')[0]) for a in main_items if a.author]

Counter(gender)

Counter({'unknown': 32106,
         'female': 776,
         'male': 1549,
         'andy': 3,
         'mostly_male': 26,
         'mostly_female': 28})

In [13]:
years = [a.year for a in main_items if a.year]
Counter(years)

Counter({'2014': 458,
         '2013': 459,
         '2016': 436,
         '2015': 374,
         '2011': 502,
         '2012': 460})

In [7]:
authors = [a.author for a in main_items if a.author]
Counter(authors).most_common(50)

[('\n                  ', 30992),
 ('\n                  \n                     ', 146),
 ('Peter J. S. Duncan', 25),
 ('Philip Ross Bullock', 24),
 ('J. Ian Press', 24),
 ('A', 21),
 ('J', 19),
 ('S', 17),
 ('M', 17),
 ('\n            \n                  ', 17),
 ('D', 15),
 ('Dennis Deletant', 15),
 ('P', 15),
 ('David A. J. Macey', 14),
 ('Andrii Danylenko', 14),
 ('Arnold McMillin', 14),
 ('R', 14),
 ('Nick Ukiah', 13),
 ('Denis J. B. Shaw', 12),
 ('E. L.', 12),
 ('G', 11),
 ('D. P. K.', 10),
 ('Elizabeth Skomp', 10),
 ('T. Haughton', 10),
 ('\n                   Stanislava Janáčková \n              ', 10),
 ('Kevin M. F. Platt', 10),
 ('C', 10),
 ('Sarah J. Young', 10),
 ('Tom Dickins', 9),
 ('\n               ', 9),
 ('A. W.', 9),
 ('Alexander Morrison', 9),
 ('E.D.M.', 8),
 ('Avril Pyman', 8),
 ('Denise J. Youngblood', 8),
 ('Muireann Maguire', 8),
 ('Peter Westin', 7),
 ('M. D. Pittaway', 7),
 ('K', 7),
 ('J. D. Klier', 7),
 ('У И-и', 7),
 ('Martyn Rady', 7),
 ('Cesar Ballester

In [8]:
article_title = [a.article_title for a in main_items if a.article_title]
Counter(article_title).most_common()

[]

In [20]:
data = []
for text in tqdm(main_items): 
    row = {}
    row['url'] = text.jstor_url
    row['file'] = text.file
    row['hate_terms'] = text.token_list(short_hatebase) #text.hatebase_terms()
    row['hate_tokens'] = [a[0] for a in row['hate_terms']]
    row['number_of_terms'] = len(row['hate_terms'])

    data.append(row)
df = pd.DataFrame(data)
pd.set_option('display.max_colwidth', -1)

def make_clickable(val):
    return '<a target="_blank" href="{}">{}</a>'.format(val,val)

df2= df[['url','file','hate_tokens','number_of_terms']].sort_values('number_of_terms', ascending=False).head(50)
df2.style.format({'url': make_clickable})

100%|██████████| 41250/41250 [01:23<00:00, 495.16it/s]
  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,url,file,hate_tokens,number_of_terms
17696,https://www.jstor.org/stable/27896141,journal-article-10.2307_27896141.xml,"['nigger', 'negro', 'yid']",3
1316,https://www.jstor.org/stable/j.ctvv4180c,book-chapter-10.2307_j.ctvv4180c.12.xml,"['yid', 'negro', 'whore']",3
32345,https://www.jstor.org/stable/10.5325/j.ctv14gp02w,book-chapter-10.5325_j.ctv14gp02w.7.xml,"['negro', 'whore', 'whores']",3
2189,https://www.jstor.org/stable/j.ctvqsdscs,book-chapter-10.2307_j.ctvqsdscs.14.xml,"['nigger', 'whore', 'whores']",3
3019,https://www.jstor.org/stable/20620927,journal-article-10.2307_20620927.xml,"['whore', 'yid', 'zhidovka']",3
2011,https://www.jstor.org/stable/j.ctvqsdt8s,book-chapter-10.2307_j.ctvqsdt8s.18.xml,"['yid', 'whore', 'zhidovka']",3
23328,https://www.jstor.org/stable/40870783,journal-article-10.2307_40870783.xml,"['kike', 'sluts', 'yid']",3
39790,https://www.jstor.org/stable/j.ctv1220mxt,book-chapter-10.2307_j.ctv1220mxt.10.xml,"['faggot', 'kike', 'zhidovka']",3
32645,https://www.jstor.org/stable/43676591,journal-article-10.2307_43676591.xml,"['chukchi', 'faggot', 'whores']",3
31322,https://www.jstor.org/stable/24600167,journal-article-10.2307_24600167.xml,"['whore', 'faggot', 'kike']",3


In [2]:
directories = ['/home/ajanco/projects/slavic_review/slavic_review_data/SlavicStudiesCluster1991to2020','/home/ajanco/projects/slavic_review/slavic_review_data/AfricanAmericanStudiesCluster1985to2020']
    
data = []
for directory in directories:
    current_dir = Path(directory)
    for file in tqdm((current_dir / 'metadata').iterdir(), total=len(list((current_dir / 'metadata').iterdir()))):
        row= {}
        row['category'] = str(current_dir).split('/')[-1]
        row['text'] = ''
        sub_dirs = ['ngram1','ngram2','ngram3']
        for sub_dir in sub_dirs:
            txt_file = str(file.stem)+'-'+sub_dir+'.txt'
            ngram_file = (current_dir / sub_dir / txt_file )
            if ngram_file.exists():
                text = ngram_file.read_text()
                for line in text.split('\n'):
                    word=line.split('\t')[0]
                    row['text'] += word + ' '

        data.append(row)
df = pd.DataFrame(data)
df.head()

100%|██████████| 41250/41250 [24:22<00:00, 28.21it/s]   
100%|██████████| 57958/57958 [29:48<00:00, 32.40it/s]    


Unnamed: 0,category,text
0,SlavicStudiesCluster1991to2020,b h he i 3 ukrainian ha c 1 m r t 2 were 3a ob...
1,SlavicStudiesCluster1991to2020,he had i his chernobyl from were arnolds been ...
2,SlavicStudiesCluster1991to2020,в с со во к о before ко fleeting forms russian...
3,SlavicStudiesCluster1991to2020,s russian ginzburg literary fi her chapter slo...
4,SlavicStudiesCluster1991to2020,beissinger events he most nationalist one s mo...


In [4]:
import scattertext as st
import spacy 

# Turn it into a Scattertext Corpus 
nlp = spacy.load('en_core_web_md')
nlp.max_length = 2325753
corpus = st.CorpusFromPandas(df,
                             category_col='category', 
                             text_col='text',
                             nlp=nlp).build()

ValueError: [E088] Text of length 3209174 exceeds maximum of 2325753. The v2.x parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.