In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Practice Exercises

## Text Preprocessing with NLTK and spaCy

In [2]:
from string import punctuation

sample_paragraph = '''
Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora. Challenges in natural language processing frequently involve natural language understanding, natural language generation (frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof.
'''

In [3]:
# NLTK
import nltk
import subprocess
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download("wordnet", download_dir='/kaggle/working/')
command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
subprocess.run(command.split())
nltk.data.path.append('/kaggle/working/')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

nltk_tokenized = word_tokenize(sample_paragraph)
nltk_output = [lemmatizer.lemmatize(token.lower().strip()) for token in nltk_tokenized] 
nltk_output = [word for word in nltk_output if word not in stop_words and word not in punctuation] 
print(nltk_output)
print(len(nltk_output))

[nltk_data] Downloading package wordnet to /kaggle/working/...
['natural', 'language', 'processing', 'nlp', 'field', 'computer', 'science', 'artificial', 'intelligence', 'computational', 'linguistics', 'concerned', 'interaction', 'computer', 'human', 'natural', 'language', 'particular', 'concerned', 'programming', 'computer', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpus', 'challenge', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding', 'natural', 'language', 'generation', 'frequently', 'formal', 'machine-readable', 'logical', 'form', 'connecting', 'language', 'machine', 'perception', 'managing', 'human-computer', 'dialog', 'system', 'combination', 'thereof']
54


In [4]:
# spacy

import spacy
from spacy.lang.en import stop_words
  
nlp = spacy.load("en_core_web_sm")
stop_words = stop_words.STOP_WORDS
  
spacy_res = nlp(sample_paragraph)
spacy_output = [token.lemma_.lower().strip() for token in spacy_res]
spacy_output = [word for word in spacy_output if word not in stop_words and word not in punctuation]
print(spacy_output)
print(len(spacy_output))

['natural', 'language', 'processing', 'nlp', 'field', 'computer', 'science', 'artificial', 'intelligence', 'computational', 'linguistic', 'concern', 'interaction', 'computer', 'human', 'natural', 'language', 'particular', 'concern', 'programming', 'computer', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpora', 'challenge', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding', 'natural', 'language', 'generation', 'frequently', 'formal', 'machine', 'readable', 'logical', 'form', 'connect', 'language', 'machine', 'perception', 'manage', 'human', 'computer', 'dialog', 'system', 'combination', 'thereof']
56


In [7]:
print(len(set(spacy_output)))
print(len((set(nltk_output))))

37
38


# NER

In [23]:
import spacy
from spacy import displacy

sample_paragraph = '''
A call for American independence from Britain,
the Virginia Declaration of Rights was drafted
by George Mason in May 1776
'''

nlp = spacy.load('en_core_web_sm')

text = nlp(sample_paragraph)

for word in text.ents:
    print(f"{word.text} --> {word.label_}")

American --> NORP
Britain --> GPE
the Virginia Declaration of Rights --> LAW
George Mason --> PERSON
May 1776 --> DATE


In [27]:
text.user_data["title"] = "Visualizing the named entities using displacy"
displacy.render(text, style="ent")

# Text Vectorization using Transformers

In [42]:
from transformers import AutoTokenizer, AutoModel
import torch

model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [44]:
inputs = tokenizer(sample_paragraph, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
hidden_states = outputs.last_hidden_state
word_embeddings = hidden_states.squeeze(0).numpy()
print("Shape:", word_embeddings.shape)
print(word_embeddings)

Shape: (23, 768)
[[-0.47192803 -0.08309454 -0.23628114 ... -0.5588365  -0.23242825
   0.45167467]
 [-0.29248115 -0.21373485 -0.8013402  ... -0.36817613 -0.02976469
   0.16252461]
 [-0.17263797 -0.48235747  0.04862721 ... -0.7239995  -0.12700519
  -0.7586211 ]
 ...
 [-0.8432871  -0.6246966  -0.60949767 ... -0.951172   -0.59483546
  -0.08086661]
 [ 0.3762518  -0.54389703 -0.28336224 ... -0.47434962 -0.44405395
   0.87907517]
 [ 0.47661605  0.06666544 -0.38012972 ... -0.031454   -0.6429229
   0.10792677]]


# Sentiment Analysis with Transformers

In [1]:
from transformers import pipeline

sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


In [24]:
sentences = [
    "I love the new phone, it's absolutely amazing!",
    "The weather today is okay, nothing special.",
    "I hate waiting in long lines, it's so frustrating.",
    "nothing bad"
]

results = sentiment_pipeline(sentences)

for sent, res in zip(sentences, results):
    print(f"Sentence: {sent}\nPrediction: {res['label']}, Confidence: {res['score']:.4f}\n")

Sentence: I love the new phone, it's absolutely amazing!
Prediction: POSITIVE, Confidence: 0.9999

Sentence: The weather today is okay, nothing special.
Prediction: NEGATIVE, Confidence: 0.9952

Sentence: I hate waiting in long lines, it's so frustrating.
Prediction: NEGATIVE, Confidence: 0.9984

Sentence: nothing bad
Prediction: POSITIVE, Confidence: 0.9987



### Traditional text-processing approaches

In [25]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download("vader_lexicon")

sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [26]:
# Perform sentiment analysis using Vader
for sentence in sentences:
    score = sia.polarity_scores(sentence)
    sentiment = "POSITIVE" if score['compound'] > 0 else "NEGATIVE" if score['compound'] < 0 else "NEUTRAL"
    print(f"Sentence: {sentence}\nPrediction: {sentiment}, Confidence: {score['compound']}\n")

Sentence: I love the new phone, it's absolutely amazing!
Prediction: POSITIVE, Confidence: 0.862

Sentence: The weather today is okay, nothing special.
Prediction: NEGATIVE, Confidence: -0.092

Sentence: I hate waiting in long lines, it's so frustrating.
Prediction: NEGATIVE, Confidence: -0.8147

Sentence: nothing bad
Prediction: POSITIVE, Confidence: 0.431

