Reference: https://colab.research.google.com/drive/1Y3qCbD6Gw1KEw-lixQx1rI6WlyWnrnDS?usp=sharing#scrollTo=U7JXVSeA8-rr


## **Installation and Setup**

In [1]:
%pip install camel-tools

Collecting camel-tools
[?25l  Downloading https://files.pythonhosted.org/packages/06/23/331ce904926a8d53a527aac34bfe03fffb9fd1d4597924cbcd65432a53ef/camel_tools-1.1.0.tar.gz (56kB)
[K     |█████▉                          | 10kB 9.9MB/s eta 0:00:01[K     |███████████▋                    | 20kB 15.4MB/s eta 0:00:01[K     |█████████████████▌              | 30kB 20.1MB/s eta 0:00:01[K     |███████████████████████▎        | 40kB 17.1MB/s eta 0:00:01[K     |█████████████████████████████▏  | 51kB 16.8MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 4.3MB/s 
Collecting transformers==3.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 12.7MB/s 
Collecting camel-kenlm
[?25l  Downloading https://files.pythonhosted.org/packages/e0/4e/147d258c7168b8f538e6aa7c4dc602b2bb696452502af8af35876a28de78/camel

In [2]:
# Mount Google Drive and make sure the folder for the CAMeL Tools data exists.
import os

from google.colab import drive

drive.mount('/gdrive')

# exist_ok=True keeps this cell re-runnable: unlike `%mkdir`, it does not
# complain when the folder is already there from an earlier session.
os.makedirs('/gdrive/MyDrive/camel_tools', exist_ok=True)

Mounted at /gdrive


In [17]:
#Install the data
os.environ['CAMELTOOLS_DATA'] = '/gdrive/MyDrive/camel_tools'

!export | camel_data full #light or or full

In [None]:
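#If this is not the first run (the data is already stored on Drive), uncomment and run the lines below instead of the setup cells above: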
#%pip install camel-tools

#from google.colab import drive
#import os

#drive.mount('/gdrive')
#os.environ['CAMELTOOLS_DATA'] = '/gdrive/MyDrive/camel_tools'

In [5]:
# Tokenization on whitespace and punctuation.
# simple_word_tokenize turns a text into a list of tokens and emits each
# punctuation mark as a token of its own. Plain str.split() only breaks on
# whitespace, so it would leave punctuation attached to the neighbouring word.
from camel_tools.tokenizers.word import simple_word_tokenize

sentence = "هل ذهبت إلى المكتبة؟"
print(sentence)

sent_split = simple_word_tokenize(sentence)
print(sent_split)

هل ذهبت إلى المكتبة؟
['هل', 'ذهبت', 'إلى', 'المكتبة', '؟']


In [6]:
# Unicode normalization.
# normalize_unicode rewrites the text in a normalized form; here a single
# ligature code point is expanded into the ordinary letters it stands for.
from camel_tools.utils.normalize import normalize_unicode

sentence = "ﷺ"
print(sentence)

sent_norm = normalize_unicode(sentence)
print(sent_norm)

ﷺ
صلى الله عليه وسلم


In [7]:
# Orthographic normalization.
# These helpers replace the usual hand-written regexes (no need to spell out
# which letter maps to which), which keeps normalization consistent across
# datasets.
from camel_tools.utils.normalize import (
    normalize_alef_ar,
    normalize_alef_maksura_ar,
    normalize_teh_marbuta_ar,
)

sentence = "هل ذهبت إلى المكتبة؟"
print(sentence)

# Applied innermost-first: alef variants, then alef maksura, then teh marbuta.
sent_norm2 = normalize_teh_marbuta_ar(
    normalize_alef_maksura_ar(
        normalize_alef_ar(sentence)
    )
)
print(sent_norm2)

هل ذهبت إلى المكتبة؟
هل ذهبت الي المكتبه؟


In [8]:
# Dediacritization (removing diacritics).
# dediac_ar strips the Arabic diacritic marks from a text and leaves the
# letters themselves untouched.
from camel_tools.utils.dediac import dediac_ar

sentence = "أَنَا اسمِي نورَة"
print(sentence)

sent_dediac = dediac_ar(sentence)
print(sent_dediac)

أَنَا اسمِي نورَة
أنا اسمي نورة


In [10]:
# Morphological analysis (this is where stems and lemmas can be extracted).
#
# Morphological analysis generates every possible reading (analysis) of a
# given word out of context. All analyses are produced from the undiacritized
# form of the input word, and each one is described by a set of lexical and
# morphological features:
# https://camel-tools.readthedocs.io/en/latest/reference/camel_morphology_features.html
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.morphology.database import MorphologyDB

# A morphological database has to be loaded first. The default built-in one is
# used here; it is the database for analyzing Modern Standard Arabic.
db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)

analyses = analyzer.analyze('موظف')

# Print one analysis per block, with a blank line between blocks.
for analysis in analyses:
    print(analysis, '\n')

{'diac': 'مُوَظَّف', 'lex': 'مُوَظَّف_2', 'bw': 'مُوَظَّف/ADJ', 'gloss': 'employed;hired', 'pos': 'adj', 'prc3': '0', 'prc2': '0', 'prc1': '0', 'prc0': '0', 'per': 'na', 'asp': 'na', 'vox': 'na', 'mod': 'na', 'stt': 'i', 'cas': 'u', 'enc0': '0', 'rat': 'n', 'source': 'lex', 'form_gen': 'm', 'form_num': 's', 'pattern': 'مُوَ2َّ3', 'root': '#.ظ.ف', 'catib6': 'NOM', 'ud': 'ADJ', 'd1seg': 'مُوَظَّف', 'd1tok': 'مُوَظَّف', 'atbseg': 'مُوَظَّف', 'd3seg': 'مُوَظَّف', 'd2seg': 'مُوَظَّف', 'd2tok': 'مُوَظَّف', 'atbtok': 'مُوَظَّف', 'd3tok': 'مُوَظَّف', 'bwtok': 'مُوَظَّف', 'pos_lex_logprob': -5.400551, 'caphi': 'm_u_w_a_dh._dh._a_f', 'pos_logprob': -0.9868824, 'gen': 'm', 'lex_logprob': -5.400551, 'num': 's', 'stem': 'مُوَظَّف', 'stemgloss': 'employed;hired', 'stemcat': 'N-ap'} 

{'diac': 'مُوَظَّفِ', 'lex': 'مُوَظَّف_2', 'bw': 'مُوَظَّف/ADJ+ِ/CASE_DEF_GEN', 'gloss': 'employed;hired+[def.gen.]', 'pos': 'adj', 'prc3': '0', 'prc2': '0', 'prc1': '0', 'prc0': '0', 'per': 'na', 'asp': 'na', 'vox': 'n

In [12]:
# Part-of-speech (POS) tagging.
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tagger.default import DefaultTagger
from camel_tools.tokenizers.word import simple_word_tokenize

# The tagger wraps a pretrained MLE disambiguator and is asked for the 'pos'
# feature.
mle = MLEDisambiguator.pretrained()
tagger = DefaultTagger(mle, 'pos')

# tag() expects a list of tokens rather than a raw string, so tokenize first.
sentence = simple_word_tokenize('نجح بايدن في الانتخابات')
pos_tags = tagger.tag(sentence)
print(pos_tags)

['verb', 'noun_prop', 'prep', 'noun']


In [14]:
# Morphological tokenization.
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer
from camel_tools.tokenizers.word import simple_word_tokenize

# The morphological tokenizer works on pre-tokenized text.
sentence = simple_word_tokenize('فتنفست الصعداء')
print(sentence)

# Pretrained disambiguator that the tokenizer relies on.
mle = MLEDisambiguator.pretrained('calima-msa-r13')

# Run the 'd3tok' scheme three times, adding one option at a time:
#   1. no extra options      -> undiacritized morphological tokens, the pieces
#                               of each word joined by an underscore;
#   2. split=True            -> every morphological token as a separate string;
#   3. split=True, diac=True -> separate strings, diacritized.
for extra_options in (
    {},
    {'split': True},
    {'split': True, 'diac': True},
):
    tokenizer = MorphologicalTokenizer(mle, scheme='d3tok', **extra_options)
    tokens = tokenizer.tokenize(sentence)
    print(tokens)

['فتنفست', 'الصعداء']
['ف+_تنفست', 'ال+_صعداء']
['ف+', 'تنفست', 'ال+', 'صعداء']
['فَ+', 'تَنَفَّسْتُ', 'ال+', 'صُعَداءَ']


In [19]:
# Dialect identification.
from camel_tools.dialectid import DialectIdentifier

did = DialectIdentifier.pretrained()

sentences = [
    'مال الهوى و مالي شكون اللي جابني ليك  ما كنت انايا ف حالي بلاو قلبي يانا بيك',
    'بدي دوب قلي قلي بجنون بحبك انا مجنون ما بنسى حبك يوم',
]

# Predict the same sentences at three label sets, from finest to coarsest,
# and print the top-ranked label for each sentence.
for level in ('city', 'country', 'region'):
    predictions = did.predict(sentences, level)
    print([p.top for p in predictions])



['Rabat', 'Beirut']
['Morocco', 'Lebanon']
['Maghreb', 'Levant']


In [18]:
# Sentiment analysis.
from camel_tools.sentiment import SentimentAnalyzer

# Load the pretrained sentiment model.
sa = SentimentAnalyzer.pretrained()

sentences = ['أنا بخير', 'أنا لست بخير']

# predict() returns one sentiment label per input sentence.
sentiments = sa.predict(sentences)
print(sentiments)

['positive', 'negative']
