In [1]:
# Comparing NLTK, spaCy and Hugging Face on one shared piece of input.
# Corpus: the body of text the system learns from / is run over.
# The string intentionally begins and ends with a newline; the leading one is
# what appears as a whitespace token in the token listings further down.
text = (
    "\n"
    "Google announced a major partnership with NASA in October 2023 to develop advanced AI systems for space research.\n"
    "The collaboration aims to improve satellite image analysis and automate spacecraft navigation.\n"
    "Sundar Pichai said the project will help expand the boundaries of AI and space exploration.\n"
)


In [2]:
import spacy

# Identifier of the pre-trained English language model to load.
SPACY_MODEL_NAME = "en_core_web_sm"

# Load the pipeline, then run it over the corpus to obtain the processed Doc.
nlp = spacy.load(SPACY_MODEL_NAME)
doc = nlp(text)


In [3]:
# Bare expression: the notebook echoes the processed Doc, which renders as its text.
doc


Google announced a major partnership with NASA in October 2023 to develop advanced AI systems for space research.
The collaboration aims to improve satellite image analysis and automate spacecraft navigation.
Sundar Pichai said the project will help expand the boundaries of AI and space exploration.

In [4]:
# List every token of the processed Doc, one per line, under a short heading.
# Whitespace tokens (e.g. the newlines in the corpus) are printed as well.
print("Tokens:")
token_texts = [tok.text for tok in doc]
for surface_form in token_texts:
    print(surface_form)


🔹 Tokens:


Google
announced
a
major
partnership
with
NASA
in
October
2023
to
develop
advanced
AI
systems
for
space
research
.


The
collaboration
aims
to
improve
satellite
image
analysis
and
automate
spacecraft
navigation
.


Sundar
Pichai
said
the
project
will
help
expand
the
boundaries
of
AI
and
space
exploration
.




In [5]:
# One row per token: text padded to 15 columns, coarse-grained tag (pos_)
# padded to 10 columns, then the fine-grained tag (tag_) in parentheses.
POS_ROW = "{:15} → {:10} ({})"

print("\n🔹 Parts of Speech:")
for tok in doc:
    print(POS_ROW.format(tok.text, tok.pos_, tok.tag_))



🔹 Parts of Speech:

               → SPACE      (_SP)
Google          → PROPN      (NNP)
announced       → VERB       (VBD)
a               → DET        (DT)
major           → ADJ        (JJ)
partnership     → NOUN       (NN)
with            → ADP        (IN)
NASA            → PROPN      (NNP)
in              → ADP        (IN)
October         → PROPN      (NNP)
2023            → NUM        (CD)
to              → PART       (TO)
develop         → VERB       (VB)
advanced        → ADJ        (JJ)
AI              → PROPN      (NNP)
systems         → NOUN       (NNS)
for             → ADP        (IN)
space           → NOUN       (NN)
research        → NOUN       (NN)
.               → PUNCT      (.)

               → SPACE      (_SP)
The             → DET        (DT)
collaboration   → NOUN       (NN)
aims            → VERB       (VBZ)
to              → PART       (TO)
improve         → VERB       (VB)
satellite       → NOUN       (NN)
image           → NOUN       (NN)
analysis        → NO

In [7]:
# One row per token: its text, its dependency label (dep_) and the text of the
# token it attaches to (head). The root of a sentence is its own head.
# The heading uses the same "🔹" marker as the Parts of Speech heading; it
# previously started with a stray space where the marker had been removed.
print("\n🔹 Dependency Relations:")
for token in doc:
    print(f"{token.text:15} → {token.dep_:12} (Head: {token.head.text})")



🔹 Dependency Relations:

               → dep          (Head: Google)
Google          → nsubj        (Head: announced)
announced       → ROOT         (Head: announced)
a               → det          (Head: partnership)
major           → amod         (Head: partnership)
partnership     → dobj         (Head: announced)
with            → prep         (Head: partnership)
NASA            → pobj         (Head: with)
in              → prep         (Head: announced)
October         → pobj         (Head: in)
2023            → nummod       (Head: October)
to              → aux          (Head: develop)
develop         → advcl        (Head: announced)
advanced        → amod         (Head: systems)
AI              → compound     (Head: systems)
systems         → dobj         (Head: develop)
for             → prep         (Head: systems)
space           → compound     (Head: research)
research        → pobj         (Head: for)
.               → punct        (Head: announced)

               → dep  

In [1]:
import re

sample_text = "I love Natural Language Processing! It's a foundational step. Let's explore word and subword tokens."

# A sentence boundary is any run of whitespace that directly follows ".", "!" or "?".
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')
# A token is either a run of word characters, or one character that is neither
# a word character nor whitespace (so tabs/newlines never become tokens).
_WORD_OR_SYMBOL = re.compile(r"\b\w+\b|[^\w\s]")


def regex_sentence_tokenize(text: str) -> list[str]:
    """Split ``text`` into sentences with a punctuation-based regex.

    Surrounding whitespace is ignored and empty pieces are dropped, so an
    empty or whitespace-only input yields ``[]`` rather than ``['']``.
    Abbreviations such as "Dr." are NOT handled; this is a teaching aid.
    """
    return [piece for piece in _SENTENCE_BOUNDARY.split(text.strip()) if piece]


def regex_word_tokenize(text: str) -> list[str]:
    """Split ``text`` into word tokens and single punctuation-mark tokens.

    Contractions are split at the apostrophe ("It's" -> "It", "'", "s").
    All whitespace (spaces, tabs, newlines) only separates tokens.
    """
    return _WORD_OR_SYMBOL.findall(text)


print("--- Sample Text ---")
print(sample_text)
print("-" * 50)

# 1. Sentence Tokenization (divides text into sentences)
sentence_tokens = regex_sentence_tokenize(sample_text)
print("1. Sentence Tokenization (Output):")
print(sentence_tokens)
print("-" * 50)

# 2. Word Tokenization (splits text into individual words and punctuation)
word_tokens = regex_word_tokenize(sample_text)
print("2. Word Tokenization (Output):")
print(word_tokens)
print("-" * 50)

# 3. Subword Tokenization Concept
# Real subword tokenizers (BPE, WordPiece) learn their splits from data. Here a
# fixed slice after the prefix "un" merely illustrates breaking one word into
# smaller meaningful units: ["un", "happiness"].
word_for_subword = "unhappiness"
prefix_length = len("un")
subword_tokens_concept = [word_for_subword[:prefix_length], word_for_subword[prefix_length:]]
print(f"3. Subword Tokenization (Concept for '{word_for_subword}'):")
print(subword_tokens_concept)

--- Sample Text ---
I love Natural Language Processing! It's a foundational step. Let's explore word and subword tokens.
--------------------------------------------------
1. Sentence Tokenization (Output):
['I love Natural Language Processing!', "It's a foundational step.", "Let's explore word and subword tokens."]
--------------------------------------------------
2. Word Tokenization (Output):
['I', 'love', 'Natural', 'Language', 'Processing', '!', 'It', "'", 's', 'a', 'foundational', 'step', '.', 'Let', "'", 's', 'explore', 'word', 'and', 'subword', 'tokens', '.']
--------------------------------------------------
3. Subword Tokenization (Concept for 'unhappiness'):
['un', 'happiness']


In [4]:
import spacy
from transformers import AutoTokenizer
import string
import re

# --- Setup: Load Models ---
# Prerequisites (run once in the environment that backs this kernel):
# 1. pip install spacy transformers sentencepiece
# 2. python -m spacy download en_core_web_sm
try:
    # 1. Load spaCy model for Word Tokenization, Lemmatization, and Filtering
    nlp = spacy.load("en_core_web_sm")

    # 2. Load WordPiece (BERT) and BPE (GPT-2) tokenizers for subword examples
    wordpiece_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    bpe_tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # 3. Load SentencePiece (T5) tokenizer for demonstration
    # T5 models use SentencePiece, which marks a preceding space with '▁' (U+2581)
    sentencepiece_tokenizer = AutoTokenizer.from_pretrained("t5-base")

except Exception as e:
    # Everything below needs all four objects. Carrying on would either fail
    # with a NameError or silently reuse a stale `nlp` from an earlier cell,
    # so report the real cause and stop here.
    print("ERROR: Could not load required NLP models. Ensure 'spacy', 'transformers', and 'sentencepiece' are installed.")
    print(f"Underlying error: {e!r}")
    raise

# Horizontal rule used above and below every section title.
SECTION_RULE = "=" * 60


def print_section_banner(title, blank_line_before=False):
    """Print ``title`` framed by two rules, optionally preceded by an empty line."""
    print(("\n" if blank_line_before else "") + SECTION_RULE)
    print(title)
    print(SECTION_RULE)


def print_token_list(heading, tokens):
    """Print ``heading`` followed by the token list, indented by two spaces."""
    print(heading)
    print(f"  {tokens}")


# Input shared by every tokenizer below.
sample_text = "The cars are running fast, and they were better than the old models. We need NLP now!"

# Run the spaCy pipeline once; sentences, tokens and lemmas are all read from it.
doc = nlp(sample_text)

print(f"Original Text:\n{sample_text}\n")
print_section_banner("A. CLASSIC TOKENIZATION RESULTS")

# 1. Sentence tokenization: text of every sentence span spaCy detected.
sentence_tokens = [span.text for span in doc.sents]
print_token_list("1. Sentence Tokenization:", sentence_tokens)

# 2. Word tokenization: text of every token in the Doc.
word_tokens = [tok.text for tok in doc]
print_token_list("\n2. Word Tokenization (spaCy's internal tokens):", word_tokens)

print_section_banner("B. ADVANCED SUBWORD TOKENIZATION RESULTS", blank_line_before=True)

# 3a. WordPiece (BERT): continuation pieces carry a '##' prefix; the input is
# lower-cased here before being handed to the uncased tokenizer.
wordpiece_tokens = wordpiece_tokenizer.tokenize(sample_text.lower())
print_token_list("3a. WordPiece Tokenization (BERT Concept):", wordpiece_tokens)

# 3b. BPE (GPT-2): a 'Ġ' prefix marks a token that was preceded by a space.
bpe_tokens = bpe_tokenizer.tokenize(sample_text)
print_token_list("\n3b. BPE Tokenization (GPT-2 Concept):", bpe_tokens)

# 3c. SentencePiece (T5): a '▁' (U+2581) prefix marks a token that was preceded by a space.
sentencepiece_tokens = sentencepiece_tokenizer.tokenize(sample_text)
print_token_list("\n3c. SentencePiece Tokenization (T5 Concept):", sentencepiece_tokens)

print_section_banner("C. TEXT CLEANING PIPELINE", blank_line_before=True)

# 4. Keep the lemma of every token that is not a stop word, punctuation or whitespace.
lemmatized_tokens = [
    tok.lemma_
    for tok in doc
    if not (tok.is_stop or tok.is_punct or tok.is_space)
]

print("Tokens (Lemmatized & Cleaned):")
print(f"  {lemmatized_tokens}\n")

# 5. Re-join the surviving lemmas into one space-separated string.
final_cleaned_text = " ".join(lemmatized_tokens)
print("Final Cleaned String:")
print(f"  {final_cleaned_text}")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Original Text:
The cars are running fast, and they were better than the old models. We need NLP now!

A. CLASSIC TOKENIZATION RESULTS
1. Sentence Tokenization:
  ['The cars are running fast, and they were better than the old models.', 'We need NLP now!']

2. Word Tokenization (spaCy's internal tokens):
  ['The', 'cars', 'are', 'running', 'fast', ',', 'and', 'they', 'were', 'better', 'than', 'the', 'old', 'models', '.', 'We', 'need', 'NLP', 'now', '!']

B. ADVANCED SUBWORD TOKENIZATION RESULTS
3a. WordPiece Tokenization (BERT Concept):
  ['the', 'cars', 'are', 'running', 'fast', ',', 'and', 'they', 'were', 'better', 'than', 'the', 'old', 'models', '.', 'we', 'need', 'nl', '##p', 'now', '!']

3b. BPE Tokenization (GPT-2 Concept):
  ['The', 'Ġcars', 'Ġare', 'Ġrunning', 'Ġfast', ',', 'Ġand', 'Ġthey', 'Ġwere', 'Ġbetter', 'Ġthan', 'Ġthe', 'Ġold', 'Ġmodels', '.', 'ĠWe', 'Ġneed', 'ĠN', 'LP', 'Ġnow', '!']

3c. SentencePiece Tokenization (T5 Concept):
  ['▁The', '▁cars', '▁are', '▁running', '▁fa

In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical # Used for the actual encoding

# Toy corpus; the vocabulary is the sorted set of its whitespace-separated words.
data = ['I love NLP', 'NLP is fun', 'I hate spam']
vocabulary = sorted(set(' '.join(data).split()))
# vocabulary -> ['I', 'NLP', 'fun', 'hate', 'is', 'love', 'spam']

# Step 1: map each word to its position in the vocabulary.
word_to_int = dict(zip(vocabulary, range(len(vocabulary))))
# word_to_int -> {'I': 0, 'NLP': 1, 'fun': 2, 'hate': 3, 'is': 4, 'love': 5, 'spam': 6}

# Step 2: look up the integer index of the sample word ("NLP").
sample_word = "NLP"
int_encoded = word_to_int[sample_word]

# Step 3: expand that index into a one-hot vector as long as the vocabulary.
vocab_size = len(vocabulary)
one_hot_vector = to_categorical([int_encoded], num_classes=vocab_size)[0]

print(f"Vocabulary Size: {vocab_size}")
print(f"Sample Word: '{sample_word}' (Index: {int_encoded})")
print(f"One-Hot Vector:\n{one_hot_vector}")

# Expected vector for 'NLP' (index 1): [0. 1. 0. 0. 0. 0. 0.]

Vocabulary Size: 7
Sample Word: 'NLP' (Index: 1)
One-Hot Vector:
[0. 1. 0. 0. 0. 0. 0.]


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Two short documents that share most of their content words.
documents = [
    'The quick brown fox jumps over the lazy dog.',
    'A quick fox is not a lazy dog.',
]

# Count vectorizer using the built-in English stop-word list, so common words
# such as 'the' and 'is' never enter the vocabulary.
vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary and count occurrences in one pass (sparse result).
bow_matrix = vectorizer.fit_transform(documents)

# Column labels plus a dense copy of the counts, for printing only.
vocabulary = vectorizer.get_feature_names_out()
bow_array = bow_matrix.toarray()

print(f"Vocabulary: {vocabulary}")
print("BoW Matrix (Counts):")
print(bow_array)

# Reading the matrix (one row per document, one column per vocabulary word):
# Doc 1 contains: brown, dog, fox, jumps, lazy, quick
# Doc 2 contains: dog, fox, lazy, quick

Vocabulary: ['brown' 'dog' 'fox' 'jumps' 'lazy' 'quick']
BoW Matrix (Counts):
[[1 1 1 1 1 1]
 [0 1 1 0 1 1]]


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same two documents as in the bag-of-words example.
documents = [
    'The quick brown fox jumps over the lazy dog.',
    'A quick fox is not a lazy dog.',
]

# TF-IDF vectorizer using the built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and compute the weights in one pass (sparse result).
tfidf_matrix = vectorizer.fit_transform(documents)

# Column labels plus a dense copy of the weights, for printing only.
vocabulary = vectorizer.get_feature_names_out()
tfidf_array = tfidf_matrix.toarray()

print(f"Vocabulary: {vocabulary}")
print("TF-IDF Matrix (Weights):")
print(tfidf_array.round(2))

# Reading the matrix (one row per document, one column per vocabulary word):
# words that occur in only one document ('brown', 'jumps') get a higher weight
# in that document than the words both documents share.

Vocabulary: ['brown' 'dog' 'fox' 'jumps' 'lazy' 'quick']
TF-IDF Matrix (Weights):
[[0.5  0.35 0.35 0.5  0.35 0.35]
 [0.   0.5  0.5  0.   0.5  0.5 ]]


In [10]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [16]:
from gensim.models import Word2Vec

# Tiny pre-tokenised corpus: one list of lowercase tokens per sentence.
sentences = [["i","love","natural","language","processing"],
             ["word2vec","creates","word","embeddings"]]

# Word2Vec initialisation and sampling are random. Fix the seed and train on a
# single worker thread so that re-running the cell prints the same similarities.
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for reproducibility
# across interpreter launches - confirm if that matters here.
W2V_SEED = 42
model = Word2Vec(sentences, vector_size=50, window=5, min_count=1,
                 seed=W2V_SEED, workers=1)

# With only two sentences the scores are close to noise; the call is shown for
# its shape (a list of (word, cosine similarity) pairs), not for its meaning.
print(model.wv.most_similar('love'))


[('i', 0.18458431959152222), ('processing', 0.13661059737205505), ('embeddings', 0.13204392790794373), ('natural', 0.11253627389669418), ('word2vec', 0.04491730034351349), ('language', 0.029594358056783676), ('word', -0.1754782348871231), ('creates', -0.21872937679290771)]
