In [None]:
!pip install camel-tools

In [None]:
# Run the camel_data command-line tool with `-i all`.
# NOTE(review): presumably this downloads the pretrained resources loaded by later
# cells (e.g. 'calima-msa-r13', 'calima-egy-r13') — confirm against the camel-tools docs.
!camel_data -i all

In [None]:
## Example of MorphologicalTokenizer usage

from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer

# Load one pretrained MLE disambiguator per database (MSA first, then Egyptian).
mle_msa, mle_egy = (
    MLEDisambiguator.pretrained(db_name)
    for db_name in ('calima-msa-r13', 'calima-egy-r13')
)

# Input must already be split on whitespace/punctuation before it reaches the
# morphological tokenizer. camel_tools ships a simple tokenizer for that step:
# camel_tools.tokenizers.word.simple_word_tokenize.
sentence_msa = ['فتنفست', 'الصعداء']
sentence_egy = ['وكاتباله', 'مكتوبين']

# One MSA tokenizer per scheme; all of them share the MSA disambiguator.
msa_d3_tokenizer, msa_atb_tokenizer, msa_bw_tokenizer = (
    MorphologicalTokenizer(disambiguator=mle_msa, split=True, scheme=scheme_name)
    for scheme_name in ('d3tok', 'atbtok', 'bwtok')
)
# The Egyptian resources currently provide bwtok tokenization only.
egy_bw_tokenizer = MorphologicalTokenizer(disambiguator=mle_egy, split=True, scheme='bwtok')

# Tokenize each example sentence with the matching tokenizer(s).
msa_d3_tok, msa_atb_tok, msa_bw_tok = (
    msa_tokenizer.tokenize(sentence_msa)
    for msa_tokenizer in (msa_d3_tokenizer, msa_atb_tokenizer, msa_bw_tokenizer)
)
egy_bw_tok = egy_bw_tokenizer.tokenize(sentence_egy)

# Show every tokenization next to its caption.
for tok_caption, tok_result in (
    ('D3 tokenization (MSA):', msa_d3_tok),
    ('ATB tokenization (MSA):', msa_atb_tok),
    ('BW tokenization (MSA):', msa_bw_tok),
    ('BW tokenization (EGY):', egy_bw_tok),
):
    print(tok_caption, tok_result)


In [None]:
# imports
import pandas as pd
import re
import emoji
from nltk.corpus import stopwords
import utils

from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.dialectid import DialectIdentifier
from camel_tools.utils.dediac import dediac_ar
import nltk
# Fetch the NLTK 'stopwords' corpus up front: a later cell calls
# stopwords.words('arabic'), which needs the corpus to be available locally.
nltk.download('stopwords')
from nltk.corpus import stopwords


In [None]:
# Read the training and development splits (in that order) into DataFrames.
train_set, dev_set = map(pd.read_csv, ("Dataset/train.csv", "Dataset/dev.csv"))

In [None]:
## load pre-trained models

# Dialect identifier, loaded with its default pretrained arguments.
did = DialectIdentifier.pretrained()

# MLE disambiguators for the two databases used in this notebook.
msa_mle = MLEDisambiguator.pretrained('calima-msa-r13')
egy_mle = MLEDisambiguator.pretrained('calima-egy-r13')

# NLTK Arabic stopwords with diacritics stripped. Stripping can make distinct
# entries collapse into the same string, so duplicates are dropped while the
# first-seen order is kept (dict keys preserve insertion order).
# TODO: this stopword list contains additional words — should some of them be removed from tweets?
stopwords_list_cleaned = list(dict.fromkeys(
    dediac_ar(stopword) for stopword in stopwords.words('arabic')
))

# Which disambiguator to use for each dialect label.
dialect_map = {
    'Gulf': msa_mle,
    'Levant': egy_mle,
    'Modern Standard Arabic': msa_mle,
    'Maghreb': egy_mle,
    'Nile Basin': egy_mle,
    'Gulf of Aden': msa_mle,
}

In [None]:
def _most_frequent(values):
  """Return a single most frequent value of a Series (NaN if it has none).

  `Series.mode()` returns *every* tied value; handing that straight to
  `groupby(...).agg` yields array-valued cells (or an aggregation error) whenever
  a duplicated tweet has a tie between classes. Taking the first entry of the
  sorted mode keeps the result scalar and deterministic.
  """
  modes = values.mode()
  return modes.iloc[0] if len(modes) else float('nan')


def preprocess(dataset):
  """Clean the tweets of `dataset` and merge duplicated tweets.

  The 'text' column is passed, in order, through `utils.clean_tweet`,
  `utils.tokenize_with_dialect` (given the dialect identifier `did` and the
  `msa_mle` / `egy_mle` disambiguators) and `utils.normalize_chars`. Rows whose
  processed text is identical are then merged into one row.

  Args:
    dataset: DataFrame with a 'text' column; it is copied, not modified.

  Returns:
    DataFrame indexed by the processed text (index name 'text'), rows in order
    of first occurrence, where every remaining column holds the most frequent
    value among the merged rows (smallest value on ties).
  """
  cleaned_dataset = dataset.copy()

  cleaned_dataset['text'] = cleaned_dataset['text'].apply(utils.clean_tweet)
  cleaned_dataset['text'] = cleaned_dataset['text'].apply(
      lambda tweet: utils.tokenize_with_dialect(tweet, did, msa_mle=msa_mle, egy_mle=egy_mle))
  cleaned_dataset['text'] = cleaned_dataset['text'].apply(utils.normalize_chars)

  # Merge duplicated tweets and keep the most frequent class for each of them.
  # sort=False keeps the groups in order of first appearance; groupby sorts the
  # keys by default, which is why the row order used to change.
  # IMPORTANT: you can comment the following line to be able to compare the results with the original dataset (same order)
  cleaned_dataset = cleaned_dataset.groupby('text', sort=False).agg(_most_frequent)

  return cleaned_dataset

In [None]:
cleaned_train_set = preprocess(train_set)

# preprocess() merges duplicates with a groupby on the tweet text, which leaves
# 'text' in the index rather than in the columns. Writing with index=False would
# then drop the tweets from the CSV, so the index is written whenever it holds
# the text (it is emitted under its name, i.e. as a 'text' column).
text_is_index = (
    cleaned_train_set.index.name == 'text'
    and 'text' not in cleaned_train_set.columns
)

# save the cleaned data
cleaned_train_set.to_csv("./Dataset/cleaned_train.csv", index=text_is_index)