#1) Text Preprocessing with NLTK and spaCy

#NLTK solution

In [45]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')


sample_text = """Keith recently came back from a trip to Chicago, Illinois. This midwestern metropolis is found along the shore of Lake Michigan.
During his visit, Keith spent a lot of time exploring the city to visit important landmarks and monuments."""

tokenized_text_nltk = word_tokenize(sample_text)
lemmatized_text_nltk = [nltk.stem.WordNetLemmatizer().lemmatize(word) for word in tokenized_text_nltk]
stop_words_removed_nltk = [word for word in lemmatized_text_nltk if word not in nltk.corpus.stopwords.words('english')]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [46]:
print(f"tokenized by NLTK text is: \n {tokenized_text_nltk}\n")
print(f"lemmatized text is: \n {lemmatized_text_nltk}\n")
print(f"without stop words text is: \n {stop_words_removed_nltk}\n")

tokenized by NLTK text is: 
 ['Keith', 'recently', 'came', 'back', 'from', 'a', 'trip', 'to', 'Chicago', ',', 'Illinois', '.', 'This', 'midwestern', 'metropolis', 'is', 'found', 'along', 'the', 'shore', 'of', 'Lake', 'Michigan', '.', 'During', 'his', 'visit', ',', 'Keith', 'spent', 'a', 'lot', 'of', 'time', 'exploring', 'the', 'city', 'to', 'visit', 'important', 'landmarks', 'and', 'monuments', '.']

lemmatized text is: 
 ['Keith', 'recently', 'came', 'back', 'from', 'a', 'trip', 'to', 'Chicago', ',', 'Illinois', '.', 'This', 'midwestern', 'metropolis', 'is', 'found', 'along', 'the', 'shore', 'of', 'Lake', 'Michigan', '.', 'During', 'his', 'visit', ',', 'Keith', 'spent', 'a', 'lot', 'of', 'time', 'exploring', 'the', 'city', 'to', 'visit', 'important', 'landmark', 'and', 'monument', '.']

without stop words text is: 
 ['Keith', 'recently', 'came', 'back', 'trip', 'Chicago', ',', 'Illinois', '.', 'This', 'midwestern', 'metropolis', 'found', 'along', 'shore', 'Lake', 'Michigan', '.', 'Dur

#Spacy solution

In [3]:
import spacy

nlp = spacy.load("en_core_web_sm")

def tokenize_text(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    return tokens

def lemmatize_text(text):
  doc = nlp(text)
  lemmatized_tokens = [token.lemma_ for token in doc]
  return lemmatized_tokens

def stop_words_removing(text):
  doc = nlp(text)
  filtered_tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
  return filtered_tokens

tokenized_text_spacy = tokenize_text(sample_text)
lemmatized_text_spacy = lemmatize_text(sample_text)
stop_words_removed_spacy = stop_words_removing(sample_text)

In [4]:
print(f"tokenized by SPACY text is: \n {tokenized_text_spacy}\n")
print(f"lemmatized text is: \n {lemmatized_text_spacy}\n")
print(f"without stop words text is: \n {stop_words_removed_spacy}\n")

tokenized by SPACY text is: 
 ['Keith', 'recently', 'came', 'back', 'from', 'a', 'trip', 'to', 'Chicago', ',', 'Illinois', '.', 'This', 'midwestern', 'metropolis', 'is', 'found', 'along', 'the', 'shore', 'of', 'Lake', 'Michigan', '.', '\n', 'During', 'his', 'visit', ',', 'Keith', 'spent', 'a', 'lot', 'of', 'time', 'exploring', 'the', 'city', 'to', 'visit', 'important', 'landmarks', 'and', 'monuments', '.']

lemmatized text is: 
 ['Keith', 'recently', 'come', 'back', 'from', 'a', 'trip', 'to', 'Chicago', ',', 'Illinois', '.', 'this', 'midwestern', 'metropolis', 'be', 'find', 'along', 'the', 'shore', 'of', 'Lake', 'Michigan', '.', '\n', 'during', 'his', 'visit', ',', 'Keith', 'spend', 'a', 'lot', 'of', 'time', 'explore', 'the', 'city', 'to', 'visit', 'important', 'landmark', 'and', 'monument', '.']

without stop words text is: 
 ['Keith', 'recently', 'came', 'trip', 'Chicago', 'Illinois', 'midwestern', 'metropolis', 'found', 'shore', 'Lake', 'Michigan', '\n', 'visit', 'Keith', 'spent', '

#2) Named Entity Recognition (NER) with spaCy

In [5]:
import spacy
from spacy import displacy

text = "Far far away, behind the word mountains, far from the countries Vokalia and Consonantia, there live the blind texts."

nlp = spacy.load("en_core_web_sm")
doc = nlp(sample_text)

print("Named Entities, Phrases, and Concepts:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")

displacy.serve(doc, style="ent")



Named Entities, Phrases, and Concepts:
Chicago (GPE)
Illinois (GPE)
Lake Michigan (LOC)
Keith (PERSON)





Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


#3) Text Vectorization using Transformers

In [22]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased", output_hidden_states = True)

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing Be

In [47]:
tokens = tokenizer.tokenize(sample_text)
encodings = tokenizer.encode(sample_text)

In [24]:
tokens, encodings

(['keith',
  'recently',
  'came',
  'back',
  'from',
  'a',
  'trip',
  'to',
  'chicago',
  ',',
  'illinois',
  '.',
  'this',
  'midwest',
  '##ern',
  'metropolis',
  'is',
  'found',
  'along',
  'the',
  'shore',
  'of',
  'lake',
  'michigan',
  '.',
  'during',
  'his',
  'visit',
  ',',
  'keith',
  'spent',
  'a',
  'lot',
  'of',
  'time',
  'exploring',
  'the',
  'city',
  'to',
  'visit',
  'important',
  'landmarks',
  'and',
  'monuments',
  '.'],
 [101,
  6766,
  3728,
  2234,
  2067,
  2013,
  1037,
  4440,
  2000,
  3190,
  1010,
  4307,
  1012,
  2023,
  13608,
  11795,
  18236,
  2003,
  2179,
  2247,
  1996,
  5370,
  1997,
  2697,
  4174,
  1012,
  2076,
  2010,
  3942,
  1010,
  6766,
  2985,
  1037,
  2843,
  1997,
  2051,
  11131,
  1996,
  2103,
  2000,
  3942,
  2590,
  16209,
  1998,
  10490,
  1012,
  102])

In [51]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
encoded_input = tokenizer(sample_text, return_tensors='pt')
output = model(**encoded_input)

output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.3987,  0.0766, -0.7748,  ..., -0.0594,  0.6937, -0.4125],
         [ 0.0349, -0.5916,  0.0343,  ...,  0.0782,  0.2416, -0.4458],
         [ 0.2022, -0.4534,  0.0832,  ...,  0.0120,  0.0234, -0.2858],
         ...,
         [ 1.0440,  0.9648, -0.1495,  ..., -0.0479, -0.4928, -0.1610],
         [-0.2571, -0.3461, -0.1741,  ...,  0.3672,  0.1976, -0.4908],
         [-0.4563, -0.1745, -0.4159,  ...,  0.4302, -0.1528, -1.3966]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-7.6195e-01, -5.6361e-01, -9.9172e-01,  6.4513e-01,  8.3386e-01,
         -1.5436e-01,  5.3941e-01,  3.7694e-01, -9.5500e-01, -9.9994e-01,
         -7.7293e-01,  9.7995e-01,  9.7832e-01,  8.1757e-01,  8.0023e-01,
         -5.5091e-01, -2.7588e-01, -5.3327e-01,  2.8927e-01,  6.7363e-01,
          7.7889e-01,  1.0000e+00, -4.6736e-01,  3.6031e-01,  5.6643e-01,
          9.9718e-01, -8.6944e-01,  8.7331e-01,  9.1776e-01,  5.840

#4) Sentiment Analysis with Transformers

In [44]:
from transformers import pipeline

pipe = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

text = "I love this movie!"
result = pipe(text)
print(f'{text} {result}')

text2 = "I hate it"
result = pipe(text2)
print(f'{text2} {result}')

Device set to use cpu


I love this movie! [{'label': 'POSITIVE', 'score': 0.9998775720596313}]
I hate it [{'label': 'NEGATIVE', 'score': 0.9996398687362671}]
