In [15]:
from datasets import load_dataset, Dataset
import csv
import pandas as pd
import nltk
import re
import spacy
import stanfordnlp
import stanza

# Fetch the default Stanza model package for Arabic ('ar') so the pipeline can be built later
stanza.download(lang='ar')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 53.5MB/s]                    
2024-04-21 21:53:57 INFO: Downloaded file to /Users/perrine/stanza_resources/resources.json
2024-04-21 21:53:57 INFO: Downloading default packages for language: ar (Arabic) ...
Downloading https://huggingface.co/stanfordnlp/stanza-ar/resolve/v1.8.0/models/default.zip: 100%|██████████| 460M/460M [00:17<00:00, 25.9MB/s] 
2024-04-21 21:54:16 INFO: Downloaded file to /Users/perrine/stanza_resources/ar/default.zip
2024-04-21 21:54:17 INFO: Finished downloading models and saved to /Users/perrine/stanza_resources


In [16]:
# Read the balanced corpus from CSV into a pandas DataFrame
data = pd.read_csv(filepath_or_buffer="../../data/csv/data_balanced.csv")

# Wrap the DataFrame in a Hugging Face Dataset and display its summary
dataset = Dataset.from_pandas(df=data)
dataset

Dataset({
    features: ['Label', 'Text'],
    num_rows: 99
})

In [18]:
# Morphosyntactic annotation

# Tokenizers
# tokenizer_bert = BertTokenizer.from_pretrained('bert-base-multilingual-cased') # Load a multilingual BERT tokenizer
# tokenizer_kor = Okt() # Korean tokenizer
# tokenizer_ja = Tagger('-Owakati') # Japanese tokenizer

# spaCy pipelines, one per language handled through spaCy.
# Each one is bound to its own module-level name because the annotation code
# refers to these names directly.
nlp_en = spacy.load(name="en_core_web_sm")   # English
nlp_es = spacy.load(name="es_core_news_sm")  # Spanish
nlp_de = spacy.load(name="de_core_news_sm")  # German
nlp_fr = spacy.load(name="fr_core_news_sm")  # French
nlp_ru = spacy.load(name="ru_core_news_sm")  # Russian
nlp_zh = spacy.load(name="zh_core_web_sm")   # Chinese
nlp_ja = spacy.load(name="ja_core_news_sm")  # Japanese
nlp_ko = spacy.load(name="ko_core_news_sm")  # Korean

# Arabic is handled by a Stanza pipeline instead of spaCy
nlp_ar = stanza.Pipeline(lang='ar')

def tokenize_and_annotate(text, lang):
    """Tokenize `text` and tag every token with its part of speech.

    Arabic ('ar') is processed with the Stanza pipeline `nlp_ar`; the other
    supported languages ('zh', 'ko', 'ja', 'ru', 'en', 'es', 'de', 'fr') are
    processed with the matching spaCy pipeline `nlp_<lang>`.

    Parameters
    ----------
    text : str
        The raw text to annotate.
    lang : str
        Language code selecting the pipeline.

    Returns
    -------
    list[tuple[str, str]]
        One `(token_text, pos_tag)` pair per token, in document order.

    Raises
    ------
    ValueError
        If `lang` is not one of the supported codes. The earlier version
        fell through every branch and silently returned `None` here.
    """
    if lang == 'ar':
        # Stanza groups words by sentence and stores the tag in `upos`.
        doc = nlp_ar(text)
        return [(word.text, word.upos) for sent in doc.sentences for word in sent.words]

    # Built at call time so the function always uses the pipelines currently
    # bound to the module-level names.
    spacy_pipelines = {
        'zh': nlp_zh,
        'ko': nlp_ko,
        'ja': nlp_ja,
        'ru': nlp_ru,
        'en': nlp_en,
        'es': nlp_es,
        'de': nlp_de,
        'fr': nlp_fr,
    }
    if lang not in spacy_pipelines:
        supported = ", ".join(sorted(['ar', *spacy_pipelines]))
        raise ValueError(
            f"Unsupported language code {lang!r}; expected one of: {supported}"
        )

    # All spaCy pipelines share the same token interface (`text`, `pos_`).
    doc = spacy_pipelines[lang](text)
    return [(token.text, token.pos_) for token in doc]
    
# Annotate every row of the dataset and keep the result.
# Iterating over a `datasets.Dataset` yields a separate dict for each row, so
# writing to `sentence['tokens_pos']` alone does not change `dataset`: the
# annotations would be lost once printed. They are therefore also collected
# in a list and attached as a new column afterwards.
tokens_pos_column = []
for sentence in dataset:
    text = sentence['Text']
    lang = sentence['Label']
    sentence['tokens_pos'] = tokenize_and_annotate(text, lang)
    tokens_pos_column.append(sentence['tokens_pos'])
    print(lang, sentence['tokens_pos'])

# `add_column` returns a new Dataset; `dataset` itself is left untouched so
# this cell can be re-run safely.
dataset_annotated = dataset.add_column('tokens_pos', tokens_pos_column)


2024-04-21 21:55:38 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 38.5MB/s]                    
2024-04-21 21:55:38 INFO: Downloaded file to /Users/perrine/stanza_resources/resources.json
2024-04-21 21:55:38 INFO: Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

2024-04-21 21:55:38 INFO: Using device: cpu
2024-04-21 21:55:38 INFO: Loading: tokenize
2024-04-21 21:55:38 INFO: Loading: mwt
2024-04-21 21:55:38 INFO: Loading: pos
2024-04-21 21:55:39 INFO: Loading: lemma
2024-04-21 21:55:39 INFO: 

ja [('ブルームフィールド', 'NOUN'), ('郡区', 'NOUN'), ('(', 'PUNCT'), ('Brookfield', 'PROPN'), ('Township', 'PROPN'), (')', 'PUNCT'), ('は', 'ADP'), ('、', 'PUNCT'), ('アメリカ', 'PROPN'), ('合衆', 'NOUN'), ('国', 'NOUN'), ('アイオワ', 'PROPN'), ('州', 'NOUN'), ('クリントン', 'PROPN'), ('郡', 'NOUN'), ('の', 'ADP'), ('郡区', 'NOUN'), ('。', 'PUNCT'), ('年', 'NOUN'), ('国勢', 'NOUN'), ('調査', 'NOUN'), ('で', 'ADP'), ('は', 'ADP'), ('、', 'PUNCT'), ('人口', 'NOUN'), ('は', 'ADP'), ('人', 'NOUN'), ('。', 'PUNCT'), ('ブルームフィールド', 'NOUN'), ('郡区', 'NOUN'), ('は', 'ADP'), ('年', 'NOUN'), ('に', 'ADP'), ('設立', 'VERB'), ('さ', 'AUX'), ('れ', 'AUX'), ('た', 'AUX'), ('。', 'PUNCT')]
ja [('ロンドン', 'PROPN'), ('・', 'SYM'), ('ガトウィック', 'PROPN'), ('空港', 'NOUN'), ('（', 'NOUN'), ('ロンドン', 'PROPN'), ('・', 'SYM'), ('ガトウィック', 'PROPN'), ('くう', 'NOUN'), ('こう', 'ADV'), ('）', 'NOUN'), ('は', 'ADP'), ('、', 'PUNCT'), ('ロンドン', 'PROPN'), ('中心', 'NOUN'), ('部', 'NOUN'), ('から', 'ADP'), ('南', 'NOUN'), ('.', 'PUNCT'), ('km', 'NOUN'), ('、', 'PUNCT'), ('ウェスト', 'PROPN'), ('・', 'S