In [1]:
import re
import string
from camel_tools.tokenizers.word import simple_word_tokenize
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter


In [2]:
import camel_tools

In [6]:
# Print the package-level help for camel_tools (description, sub-packages, version, install path).
# NOTE(review): exploratory introspection only; nothing below depends on this cell.
help(camel_tools)

Help on package camel_tools:

NAME
    camel_tools

DESCRIPTION
    A suite of Arabic natural language processing tools developed by the CAMeL Lab
    at New York University Abu Dhabi.

PACKAGE CONTENTS
    cli (package)
    data (package)
    dialectid (package)
    disambig (package)
    morphology (package)
    ner (package)
    sentiment (package)
    tagger (package)
    tokenizers (package)
    utils (package)

DATA
    absolute_import = _Feature((2, 5, 0, 'alpha', 1), (3, 0, 0, 'alpha', 0...
    print_function = _Feature((2, 6, 0, 'alpha', 2), (3, 0, 0, 'alpha', 0)...
    version_file = r'C:\Users\User\anaconda3\lib\site-packages\camel_tools...
    version_fp = <_io.TextIOWrapper name='C:\\Users\\User\\anacon...\camel...

VERSION
    1.2.0

FILE
    c:\users\user\anaconda3\lib\site-packages\camel_tools\__init__.py




## Cleaning

In [14]:
def remove_numbers_punctuations(text):
    """Blank out digit-bearing tokens and ASCII punctuation.

    Parameters
    ----------
    text : object
        Value to clean; it is coerced with ``str()`` before processing.

    Returns
    -------
    str
        The text in which every run of word characters containing at least
        one digit, and every character of ``string.punctuation``, has been
        replaced by a single space. Whitespace is not collapsed.
    """
    # Raw string: in a plain literal the backslash-w / backslash-d sequences are
    # invalid escapes (DeprecationWarning, SyntaxWarning from Python 3.12).
    without_numbers = re.sub(r'\w*\d\w*', ' ', str(text))
    # string.punctuation is ASCII-only; re.escape keeps characters such as
    # ']' and '^' literal inside the character class.
    return re.sub('[%s]' % re.escape(string.punctuation), ' ', without_numbers)

In [15]:
def ar_text_correction(text):
    """Apply a fixed sequence of textual corrections to (mostly Arabic) text.

    The steps run in this order:

    1. The Arabic comma and Arabic semicolon are replaced by a space.
    2. Stand-alone single Latin letters are replaced by a space.
    3. Stand-alone single Arabic letters (alef through yeh) are replaced by
       a space.
    4. The stand-alone tokens "et" and "al" are replaced by a space.
    5. A fixed table of letter sequences is rewritten, in table order.

    Parameters
    ----------
    text : object
        Value to correct; it is coerced with ``str()`` first.

    Returns
    -------
    str
        The corrected text. Whitespace is not collapsed.
    """
    corrected = re.sub(r'،|؛', ' ', str(text))

    # Lookarounds instead of consuming the neighbouring whitespace: otherwise,
    # in a run such as " a b c ", the space eaten by one match is missing for
    # the next letter and every second letter survives. "Not preceded/followed
    # by a non-space" also covers the start and end of the string.
    corrected = re.sub(r'(?<!\S)[A-Za-z](?!\S)', ' ', corrected)
    corrected = re.sub(r'(?<!\S)[ا-ي](?!\S)', ' ', corrected)

    # Word boundaries keep "al"/"et" inside longer words intact
    # (e.g. "algorithm" is no longer mangled).
    corrected = re.sub(r'\b(?:et|al)\b', ' ', corrected)

    # Order matters: later entries can match text produced by earlier ones.
    # NOTE(review): presumably these undo mis-ordered lam/alef sequences left
    # by text extraction -- confirm against the source documents.
    sequence_fixes = (
        ('لإل', 'الإ'),
        ('لأل', 'الأ'),
        ('لال', 'الا'),
        ('األ', 'الأ'),
        ('اإل', 'الإ'),
        ('اال', 'الا'),
        ('الل', 'لال'),
        ('الاا', 'لالا'),
    )
    for wrong, right in sequence_fixes:
        corrected = corrected.replace(wrong, right)
    return corrected

In [3]:
def only_ar_text(text):
    """Replace every token that starts with an ASCII letter by a single space.

    A token here is an ASCII letter followed by any run of word characters
    and apostrophes. The input is coerced with ``str()``; everything that is
    not part of such a token is returned unchanged.
    """
    latin_token = re.compile(r"[A-Z]['\w]*|[a-z]['\w]*")
    return latin_token.sub(' ', str(text))

## Tokenization

In [4]:
def tokenize(text):
    """Return the result of camel_tools' ``simple_word_tokenize`` for ``text``."""
    return simple_word_tokenize(text)

## Removing Stop Words

In [5]:
def remove_stop_words(tokens, stop_words=None):
    """Drop stop words from a sequence of tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter; their order is preserved.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the NLTK Arabic and English stop-word
        lists are used (the original behaviour).

    Returns
    -------
    list of str
        The tokens that are not in the stop-word collection.
    """
    if stop_words is None:
        # Load both lists once per call rather than once per token.
        stop_words = stopwords.words("arabic") + stopwords.words("english")
    # Set membership is O(1) per token instead of a linear scan of the list.
    stop_set = frozenset(stop_words)
    return [token for token in tokens if token not in stop_set]

## Vectorization

In [6]:
def vectorizer(filtered_text):
    """Build a document-term count table with ``CountVectorizer``.

    Parameters
    ----------
    filtered_text : iterable of str
        The documents to vectorise; each item is treated as one document.

    Returns
    -------
    pandas.DataFrame
        Dense count matrix with one row per document and one column per
        vocabulary term learned from ``filtered_text``.
    """
    cv = CountVectorizer()
    transformed_text = cv.fit_transform(filtered_text)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; fall back to it only on releases that predate the replacement.
    if hasattr(cv, "get_feature_names_out"):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()
    return pd.DataFrame(transformed_text.toarray(), columns=feature_names)

## Word Counts

In [7]:
def word_counter(word_list):
    """Count occurrences of each item and rank them from most to least frequent.

    Returns a list of ``(count, word)`` tuples sorted in descending order;
    items with equal counts are ordered by descending word.
    """
    tally = Counter(word_list)
    # Put the count first so that tuple ordering ranks by frequency.
    count_word_pairs = [(count, word) for word, count in tally.items()]
    count_word_pairs.sort(reverse=True)
    return count_word_pairs

In [8]:
def visualize_counts(counts):
    """Plot a histogram of word lengths.

    Parameters
    ----------
    counts : iterable
        Either plain words, or ``(count, word)`` pairs as produced by
        ``word_counter``. For a pair, the length of the word is used; taking
        ``len()`` of the pair itself would always give 2.
    """
    lengths = []
    for item in counts:
        if isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], str):
            # (count, word) pair: measure the word, not the tuple.
            item = item[1]
        lengths.append(len(item))
    word_lengths = pd.Series(lengths)

    ax = word_lengths.hist(bins=15)
    ax.set(xlabel='Word Lengths', ylabel='Frequency', title='Distribution of Word Length')

## Document Similarity:

In [9]:
# Read the first article as UTF-8 text. The with-block closes the file on
# exit, so no explicit close() call is needed afterwards.
# NOTE(review): machine-specific absolute path.
with open(r'C:\Users\User\Dropbox\PC\Documents\python\articles\artic14.txt', 'r', encoding='utf-8') as file:
    content = file.read()

In [12]:
# Read the second article as UTF-8 text. The with-block closes the file on
# exit, so no explicit close() call is needed afterwards.
# NOTE(review): machine-specific absolute path.
with open(r'C:\Users\User\Dropbox\PC\Documents\python\articles\artic15.txt', 'r', encoding='utf-8') as file:
    content2 = file.read()

In [13]:
# Read the third article as UTF-8 text. The with-block closes the file on
# exit, so no explicit close() call is needed afterwards.
# NOTE(review): machine-specific absolute path.
with open(r'C:\Users\User\Dropbox\PC\Documents\python\articles\artic16.txt', 'r', encoding='utf-8') as file:
    content3 = file.read()

In [62]:
# Cleaning pipeline for the first article, innermost call first:
# strip numbers/punctuation -> text corrections -> drop Latin tokens
# -> tokenize -> drop stop words.
# NOTE: `content` is overwritten (file text in, token list out), so this cell
# is not idempotent; re-run the cell that reads the file before re-running it.
content = remove_stop_words(
    tokenize(
        only_ar_text(
            ar_text_correction(
                remove_numbers_punctuations(content)
            )
        )
    )
)
# (count, word) pairs, highest count first.
content_counter = word_counter(content)

In [66]:
# Re-join the filtered tokens of the first article into one space-separated string.
clean_content = " ".join(content)

In [None]:
# Display the cleaned text of the first article (the whole string is shown).
clean_content

In [68]:
# Cleaning pipeline for the second article, innermost call first:
# strip numbers/punctuation -> text corrections -> drop Latin tokens
# -> tokenize -> drop stop words.
# NOTE: `content2` is overwritten (file text in, token list out), so this cell
# is not idempotent; re-run the cell that reads the file before re-running it.
content2 = remove_stop_words(
    tokenize(
        only_ar_text(
            ar_text_correction(
                remove_numbers_punctuations(content2)
            )
        )
    )
)
# (count, word) pairs, highest count first.
content2_counter = word_counter(content2)

In [71]:
# Re-join the filtered tokens of the second article into one space-separated string.
clean_content2 = " ".join(content2)

In [69]:
# Cleaning pipeline for the third article, innermost call first:
# strip numbers/punctuation -> text corrections -> drop Latin tokens
# -> tokenize -> drop stop words.
# NOTE: `content3` is overwritten (file text in, token list out), so this cell
# is not idempotent; re-run the cell that reads the file before re-running it.
content3 = remove_stop_words(
    tokenize(
        only_ar_text(
            ar_text_correction(
                remove_numbers_punctuations(content3)
            )
        )
    )
)
# (count, word) pairs, highest count first.
content3_counter = word_counter(content3)

In [72]:
# Re-join the filtered tokens of the third article into one space-separated string.
clean_content3 = " ".join(content3)

In [None]:
# Gather the three cleaned articles into one list, one string per document.
corpus = [clean_content, clean_content2, clean_content3]
# Display the corpus (the full text of every document is shown).
corpus

## Bag of Words Model

In [None]:
# Show the count table for the corpus: a DataFrame built from the CountVectorizer output,
# with the learned vocabulary terms as columns.
vectorizer(corpus)

In [75]:
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity

In [98]:
# Every unordered pair of documents in the corpus: first as index pairs,
# then as the matching pairs of document texts.
pairs = list(combinations(range(len(corpus)), 2))
combos = [(corpus[first], corpus[second]) for first, second in pairs]

# Bag-of-words counts as a dense array, one row per document.
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

# Cosine similarity for each pair (each result is a 1x1 array),
# displayed together with the texts, most similar pair first.
results = [
    cosine_similarity([X[first]], [X[second]])
    for first, second in pairs
]
sorted(zip(results, combos), reverse=True)

[array([[0.21648862]]), array([[0.13334312]]), array([[0.11125456]])]

## TF-IDF

In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF weights for the corpus as a dense array, one row per document.
cv_tfidf = TfidfVectorizer()
X_tfidf = cv_tfidf.fit_transform(corpus).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on releases that predate get_feature_names_out().
if hasattr(cv_tfidf, "get_feature_names_out"):
    tfidf_terms = cv_tfidf.get_feature_names_out()
else:
    tfidf_terms = cv_tfidf.get_feature_names()
pd.DataFrame(X_tfidf, columns=tfidf_terms)

In [None]:
# Cosine similarity of the TF-IDF rows for each document pair (a 1x1 array
# per pair), displayed with the pair's texts, most similar first.
results_tfidf = [
    cosine_similarity([X_tfidf[first]], [X_tfidf[second]])
    for first, second in pairs
]
sorted(zip(results_tfidf, combos), reverse=True)

In [None]:
# NOTE(review): `vector` is not assigned in any cell visible in this notebook, so this
# line raises NameError on a fresh kernel unless it is defined elsewhere -- TODO confirm
# which token list was intended.
# This assignment also replaces the `content_counter` computed for the first article.
content_counter = word_counter(vector)
#clean_content = " ".join(content6)
# Display the complete list of (count, word) pairs.
content_counter[:]