In [1]:
import pandas as pd
from tqdm.autonotebook import tqdm
from pathlib import Path
from dataclasses import dataclass, field
from typing import List
from lxml import etree


@dataclass
class Item:
    id: int = None
    author: str = None
    journal_title: str = None
    book_title: str = None
    book_chapter: str = None
    article_title: str = None
    year: str =None
    file: str = None
    collection: str = None
    jstor_url: str = None
    ngrams: List[tuple] = field(default_factory=list)
    text: str = None
    
    #list of all tokens 
    def tokens(self):
        tokens = [a[0] for a in self.ngrams]
        return tokens
        
    #frequency for a specific token
    def token_freq(self, token:str):
        a = [a for a in self.ngrams if a[0] == token]
        return int(a[0][1])

    #frequencies for a list of token
    def token_list(self, tokens:list):
        return [a for a in self.ngrams if a[0] in tokens]
           
    def hatebase_terms(self):
        intersection = set([a[0].lower() for a in self.ngrams]).intersection(set(hatebase))
        return [a for a in self.ngrams if a[0].lower() in intersection] 
        
     

  


In [2]:
def make_items(path):
    current_dir = Path(path)
    return_items = []
    if (current_dir / 'metadata').exists():
        count = 0 
        for file in tqdm((current_dir / 'metadata').iterdir(), 
                         total=len(list((current_dir / 'metadata').iterdir()))):
            item = Item()
            item.id = count
            count += 1
            item.file = str(file.name)
            item.collection = str(current_dir).split('/')[-1]
            
            
            root = etree.fromstring(file.read_bytes())
            try:
                item.jstor_url = root.xpath('//self-uri')[0].attrib['{http://www.w3.org/1999/xlink}href']
            except IndexError:
                continue
            if 'article' in file.name:
                try:
                    item.journal_title = root.xpath('//journal-title')[0].text
                except IndexError:
                    continue
                if root.xpath('//pub-date/year'):
                    item.year = root.xpath('//pub-date/year')[0].text
                #if root.xpath('//copyright-year'):
                #    item.year = root.xpath('//copyright-year')[0].text
                if root.xpath('//string-name'):
                    item.author = root.xpath('//string-name')[0].text #only first author
                if root.xpath('//article-title'): #TODO not working, no results
                    article_title = root.xpath('//article-title')[0].text 
            if 'book' in file.name:
                if root.xpath('//book-title'):
                    item.book_title = root.xpath('//book-title')[0].text
                book_part_id = file.stem.split('_')[-1]
                i = int(book_part_id.split('.')[-1])
                try:
                    item.book_chapter = root.xpath('//title')[i].text   
                except IndexError:
                    pass
            
            sub_dirs = ['ngram1','ngram2','ngram3']
            for sub_dir in sub_dirs:
                #book-chapter-10.2307_j.ctt1pc5dgp.4.xml maps to 
                #book-chapter-10.2307_j.ctt1pc5dgp.4-ngram1.txt
                txt_file = str(file.stem)+'-'+sub_dir+'.txt'
                ngram_file = (current_dir / sub_dir / txt_file )
                if ngram_file.exists():
                    text = ngram_file.read_text()
                    item.text = text
                    
            return_items.append(item)
            
        return return_items
                
                
    
    else:
        print('Missing metadata directory')
        raise SystemExit(0)
        
    
directories = ['/home/ajanco/projects/slavic_review/slavic_review_data/SlavicStudiesCluster1991to2020'] #,'/home/ajanco/projects/slavic_review/slavic_review_data/AfricanAmericanStudiesCluster1985to2020']
main_items = []
for directory in directories:
    main_items.extend(make_items(directory))

    

HBox(children=(FloatProgress(value=0.0, max=41250.0), HTML(value='')))




In [3]:
gypsy = [item for item in main_items if 'gypsy' or 'roma' in item.text]

In [7]:
all = [a for a in gypsy if a.year]
all[0].text.find('gypsy')

-1

In [16]:
doc = nlp(all[140].text)
matches = matcher(doc)
try:
    label = matcher.vocab.strings[matches[0][0]]
    print(label, item.year)
except: #no matches
    print('no')


roma 2005


In [19]:
import spacy 
from spacy.matcher import Matcher
nlp = spacy.load('xx_ent_wiki_sm')
nlp.max_length = 4523559
matcher = Matcher(nlp.vocab)
gypsy = [{"LOWER": "gypsy"}]
matcher.add("gypsy", None, gypsy)
gypsies = [{"LOWER": "gypsies"}]
matcher.add("gypsies", None, gypsies)
roma = [{"LOWER": "roma"}]
matcher.add("roma", None, roma)
romani = [{"LOWER": "romani"}]
matcher.add("romani", None, romani)

results = []
for item in tqdm(main_items): 
    if item.text:
        doc = nlp(item.text)
        matches = matcher(doc)
        try:
            label = matcher.vocab.strings[matches[0][0]]
            results.append((label, item.year))
        except: #no matches
            pass

HBox(children=(FloatProgress(value=0.0, max=41250.0), HTML(value='')))




In [65]:
results.value_counts()

AttributeError: 'list' object has no attribute 'value_counts'

In [21]:
import pickle 
pickle.dump( results, open( "roma_results.p", "wb" ) )

In [86]:
import plotly.express as px
df = pd.DataFrame(results, columns =['term', 'year']) 
df.apply(pd.Series.value_counts)


Unnamed: 0,term,year
1991,,32.0
1992,,35.0
1993,,36.0
1994,,43.0
1995,,51.0
1996,,54.0
1997,,44.0
1998,,57.0
1999,,61.0
2000,,57.0


In [91]:
df = pd.DataFrame(results, columns =['term', 'year']) 
df.groupby(["term", "year"]).size().reset_index(name="count")
#df.to_csv('roma_counts.csv')
df

Unnamed: 0,term,year
0,roma,2013
1,roma,2015
2,roma,2004
3,roma,1993
4,gypsy,2002
...,...,...
1929,roma,2013
1930,roma,
1931,gypsy,2008
1932,gypsy,2009


In [4]:
import pandas as pd
df = pd.read_csv('roma_counts.csv')

In [5]:
df.rename_axis("count")

Unnamed: 0_level_0,Unnamed: 0,term,year
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,roma,2013.0
1,1,roma,2015.0
2,2,roma,2004.0
3,3,roma,1993.0
4,4,gypsy,2002.0
...,...,...,...
1929,1929,roma,2013.0
1930,1930,roma,
1931,1931,gypsy,2008.0
1932,1932,gypsy,2009.0


In [10]:
import plotly
roma = df[df['term'] == 'roma']
roma.head()

Unnamed: 0.1,Unnamed: 0,term,year
0,0,roma,2013.0
1,1,roma,2015.0
2,2,roma,2004.0
3,3,roma,1993.0
5,5,roma,2010.0


In [16]:
import plotly.express as px
import pandas as pd
df = pd.read_csv('roma_counts.csv')

fig = px.scatter(df, x="year", y="term",
	         size="Unnamed: 0", color="year",
                 hover_name="year", log_x=True, size_max=60)
fig.show()