In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

# Basic NLP

In [34]:
# Data cleaning, first pass: strip markup, then keep only a literal Arabic letter range

import re

# Sample Arabic restaurant review; the name at the start is wrapped in <b> tags.
ar_review = "<b>ماما نورة</b> مكان جميل للأكل المنزلي اللذيذ! الأطباق متنوعة والعيش طازج. الخدمة ممتازة والجلسات مريحة. أنصح بتجربته."

# Step 1: remove anything that looks like an HTML tag (non-greedy, so every tag is matched on its own).
tag_pattern = re.compile('<.*?>')
review_without_tags = tag_pattern.sub('', ar_review)

# Step 2: collapse every run of characters that is not in the ا-ي range, an ASCII digit,
# or whitespace into a single space.
# NOTE: the ا-ي range (U+0627-U+064A) starts at plain alef, so hamza-carrying letters such as أ
# fall outside it and are replaced as well; the displayed result shows words split where they occur.
cleaned_review_ar = re.sub(r'[^ا-ي0-9\s]+', ' ', review_without_tags)

cleaned_review_ar

'ماما نورة مكان جميل لل كل المنزلي اللذيذ  ال طباق متنوعة والعيش طازج  الخدمة ممتازة والجلسات مريحة   نصح بتجربته '

In [35]:
# Data cleaning, second pass: strip markup, then keep the whole Arabic Unicode block

import re

# Sample Arabic restaurant review; the name at the start is wrapped in <b> tags.
ar_review = "<b>ماما نورة</b> مكان جميل للأكل المنزلي اللذيذ! الأطباق متنوعة والعيش طازج. الخدمة ممتازة والجلسات مريحة. أنصح بتجربته."

# Inner call: remove the HTML tags.
# Outer call: turn every single character outside U+0600-U+06FF into a space. Each replaced
# character yields its own space, so punctuation followed by a space leaves two spaces behind.
cleaned_review_ar = re.sub(
    r'[^\u0600-\u06FF]',
    ' ',
    re.sub('<.*?>', '', ar_review),
)

cleaned_review_ar

'ماما نورة مكان جميل للأكل المنزلي اللذيذ  الأطباق متنوعة والعيش طازج  الخدمة ممتازة والجلسات مريحة  أنصح بتجربته '

In [36]:
# Clean and lowercase the English sample review

import re

# `cleaned_review` was not defined by any earlier cell, so this cell raised NameError on a
# fresh kernel (Restart & Run All). The English sample below is reconstructed so that the
# cell reproduces its recorded output; replace it with the original review text if that differs.
en_review = "<b>A touching movie!!</b> It is full of emotions and wonderful acting.<br/> I could have sat through it a second time."

# Strip HTML tags, then collapse every run of non-alphanumeric characters into one space.
cleaned_review = re.sub(r'<.*?>', '', en_review)
cleaned_review = re.sub(r'[^A-Za-z0-9]+', ' ', cleaned_review)

# Lowercase so that later steps (tokenization, stop-word lookup) see a single casing.
cleaned_review = cleaned_review.lower()

print(cleaned_review)

a touching movie it is full of emotions and wonderful acting i could have sat through it a second time 


In [6]:
# Tokenization setup

import nltk
# Fetch the 'punkt' package into the local nltk_data directory; when it is already
# present NLTK only reports that it is up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abdulwahabmac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:


from nltk.tokenize import word_tokenize

# Split the cleaned, lower-cased English review into word tokens.
tokens = word_tokenize(cleaned_review)

# Show the input text and the resulting token list, one per line.
print(cleaned_review, tokens, sep='\n')

a touching movie it is full of emotions and wonderful acting i could have sat through it a second time 
['a', 'touching', 'movie', 'it', 'is', 'full', 'of', 'emotions', 'and', 'wonderful', 'acting', 'i', 'could', 'have', 'sat', 'through', 'it', 'a', 'second', 'time']


In [8]:
# Tokenize the cleaned Arabic review with the same NLTK word tokenizer
ar_tokens = nltk.word_tokenize(cleaned_review_ar)

# Show the cleaned text first, then the tokens produced from it.
print(cleaned_review_ar, ar_tokens, sep='\n')

ماما نورة مكان جميل للأكل المنزلي اللذيذ  الأطباق متنوعة والعيش طازج  الخدمة ممتازة والجلسات مريحة  أنصح بتجربته 
['ماما', 'نورة', 'مكان', 'جميل', 'للأكل', 'المنزلي', 'اللذيذ', 'الأطباق', 'متنوعة', 'والعيش', 'طازج', 'الخدمة', 'ممتازة', 'والجلسات', 'مريحة', 'أنصح', 'بتجربته']


In [9]:
# Display the Arabic token list as the cell's output (rendered one token per line).
ar_tokens

['ماما',
 'نورة',
 'مكان',
 'جميل',
 'للأكل',
 'المنزلي',
 'اللذيذ',
 'الأطباق',
 'متنوعة',
 'والعيش',
 'طازج',
 'الخدمة',
 'ممتازة',
 'والجلسات',
 'مريحة',
 'أنصح',
 'بتجربته']

In [10]:
# Stop-word removal for the English review

nltk.download('stopwords')

from nltk.corpus import stopwords

# NLTK's English stop-word list (a plain Python list).
stop_words = stopwords.words('english')

# Keep only the tokens that are absent from the stop-word list.
filtered_review = list(filter(lambda token: token not in stop_words, tokens))

print(filtered_review)

['touching', 'movie', 'full', 'emotions', 'wonderful', 'acting', 'could', 'sat', 'second', 'time']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abdulwahabmac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Add a custom entry to the current stop-word list.
# NOTE(review): this runs after the English review has already been filtered, and the next
# cell rebinds `stop_words` to the Arabic list, so the addition does not influence any result
# shown in this notebook. Re-running the cell appends a duplicate entry each time.
# Confirm whether this cell is still needed.
stop_words.append('Hamza')

In [12]:
# Stop-word removal for the Arabic review

# Rebinds `stop_words` to NLTK's Arabic list, replacing the list previously held under this name.
stop_words = stopwords.words('arabic')

# Drop every Arabic token that appears in the stop-word list.
filtered_ar_review = list(ar_token for ar_token in ar_tokens if ar_token not in stop_words)

print(filtered_ar_review)

['ماما', 'نورة', 'مكان', 'جميل', 'للأكل', 'المنزلي', 'اللذيذ', 'الأطباق', 'متنوعة', 'والعيش', 'طازج', 'الخدمة', 'ممتازة', 'والجلسات', 'مريحة', 'أنصح', 'بتجربته']


In [13]:
# Lemmatization setup: fetch the 'wordnet' and 'omw-1.4' packages, then import the lemmatizer class.
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abdulwahabmac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/abdulwahabmac/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [14]:


# Lemmatize the stop-word-filtered English tokens with WordNet.
lemmatizer = WordNetLemmatizer()

# lemmatize() receives the token only; no part-of-speech hint is passed.
lemm_review = list(map(lemmatizer.lemmatize, filtered_review))

print(lemm_review)

['touching', 'movie', 'full', 'emotion', 'wonderful', 'acting', 'could', 'sat', 'second', 'time']


In [15]:
from nltk.stem.isri import ISRIStemmer

# Despite the "lemmatizer" / "lemm_" names, this step applies the ISRI *stemmer*
# to every remaining Arabic token.
ar_lemmatizer = ISRIStemmer()

lemm_ar_review = list(map(ar_lemmatizer.stem, filtered_ar_review))

print(lemm_ar_review)

['اما', 'نور', 'كان', 'جمل', 'اكل', 'زلي', 'لذذ', 'طبق', 'تنع', 'عيش', 'طزج', 'خدم', 'متز', 'جلس', 'ريح', 'نصح', 'جرب']


In [17]:
# POS-tagging setup: import the tokenizer and tagger helpers, then fetch the tagger model package.
from nltk import word_tokenize, pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/abdulwahabmac/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [18]:

# POS-tag an English example sentence.
# Note: this rebinds `tokens`, which earlier held the tokens of the English review.
text = "I am learning Natural Language Processing on Analytics Vidhya"
tokens = word_tokenize(text)

# Each entry of the result is a (token, tag) pair.
en_tagged = pos_tag(tokens)
print(en_tagged)

[('I', 'PRP'), ('am', 'VBP'), ('learning', 'VBG'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('on', 'IN'), ('Analytics', 'NNP'), ('Vidhya', 'NNP')]


In [19]:


# POS-tag an Arabic example sentence.
# NOTE(review): pos_tag is called without a language argument, and the only tagger resource
# downloaded in this notebook is 'averaged_perceptron_tagger'; the tags printed for this Arabic
# text are presumably not linguistically meaningful. Confirm whether an Arabic-capable tagger
# was intended.
text = "إغلاق المتصفحات الغير ضرورية"

tokens = word_tokenize(text)
ar_tagged = pos_tag(tokens)
print(ar_tagged)

[('إغلاق', 'JJ'), ('المتصفحات', 'NNP'), ('الغير', 'NNP'), ('ضرورية', 'NN')]


# spaCy

In [24]:
# NOTE(review): `os` is not referenced by any cell visible in this notebook.
import os
import spacy
# Load the "en_core_web_sm" pipeline by name and keep it as the shared `nlp` object for the cells below.
nlp = spacy.load("en_core_web_sm")
# displacy is used below to render recognised entities inside the notebook.
from spacy import displacy

In [25]:
# Print the names of the components in the loaded pipeline.
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [26]:
# Print the pipeline as (name, component object) pairs.
print(nlp.pipeline)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x29ee02f80>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x29ee02e60>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x29875d540>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x29f1e8b80>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x29f1c6a40>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x29875d380>)]


In [27]:
# Entity demo: a hotel-search request written as one long sentence.
text = """Looking for a hotel in New York near Times Square with free breakfast and cheaper
than $100 for 2nd June which is really kids friendly and has a swimming pool and I want to stay there for 8 days"""

# Run the full pipeline over the text.
doc = nlp(text)

# Materialise the sentence spans (not needed by the entity rendering below).
sentence_spans = [sentence for sentence in doc.sents]

# Highlight the recognised entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [28]:
# Entity demo: a short location/shopping description.
text = """Close to the Effiel Tower and is very high end with great shopping nearby"""

# Run the full pipeline over the text.
doc = nlp(text)

# Materialise the sentence spans (not needed by the entity rendering below).
sentence_spans = [*doc.sents]

# Highlight the recognised entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [29]:
# Sample travel query; the next cell passes `text` to the spaCy pipeline and renders its entities.
text = "I want to stay in a European city that filmed Game of Thrones and has very cheap booze and art galleries for 4 days"

In [30]:
# Disabled alternative: a much longer sample. Uncomment it to override the `text` assigned in the previous cell.
#text = """My very photogenic mother died in a freak accident (picnic, lightning) when I was three, and, save for a pocket of warmth in the darkest past, nothing of her subsists within the hollows and dells of memory, over which, if you can still stand my style (I am writing under observation), the sun of my infancy had set: surely, you all know those redolent remnants of day suspended, with the midges, about some hedge in bloom or suddenly entered and traversed by the rambler, at the bottom of a hill, in the summer dusk; a furry warmth, golden midges"""

# Process whichever `text` is currently bound.
doc = nlp(text)

# Materialise the sentence spans (not needed by the entity rendering below).
sentence_spans = [span for span in doc.sents]

# Highlight the recognised entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [32]:
# Look up the human-readable description of the "NORP" entity label.
spacy.explain("NORP")

'Nationalities or religious or political groups'

In [38]:
# Look up the human-readable description of the "ORG" entity label.
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'