In [1]:
import nltk
import spacy

# Load the "en_core_web_sm" spaCy pipeline once; every spaCy cell below reuses `nlp`.
nlp = spacy.load("en_core_web_sm")

### Handling Missing Values

In [2]:
import pandas as pd
import numpy as np

# Sample DataFrame with one missing value in each column
data = {
    'Age': [25, 30, np.nan, 35],
    'Salary': [50000, np.nan, 60000, 70000]
}

df = pd.DataFrame(data)

# Display the original DataFrame
print("Original DataFrame:")
print(df)

# Impute missing values with the column mean.
# The filled column is assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that form is a chained in-place update
# on a column selection, which pandas 2.x warns about and which leaves `df`
# unchanged once Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())

# Display the DataFrame after imputation
print("\nDataFrame after Mean Imputation:")
print(df)


Original DataFrame:
    Age   Salary
0  25.0  50000.0
1  30.0      NaN
2   NaN  60000.0
3  35.0  70000.0

DataFrame after Mean Imputation:
    Age   Salary
0  25.0  50000.0
1  30.0  60000.0
2  30.0  60000.0
3  35.0  70000.0


### Part of Speech Tagging
Part-of-speech (POS) tagging is the process of assigning a part of speech to each word in a sentence.
Common POS tags include nouns, verbs, adjectives, and adverbs.


##### NLTK

In [3]:
# Sentence used as input for the NLTK tagger.
text = "NLTK is a leading platform for building Python programs to work with human language data."

# Split the sentence into tokens.
words = nltk.word_tokenize(text)

# Tag every token; the printed result is a list of (token, tag) pairs.
# `pos_tags_nltk` is read again by the NLTK chunking cell further down.
pos_tags_nltk = nltk.pos_tag(words)

print(pos_tags_nltk)

[('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('leading', 'VBG'), ('platform', 'NN'), ('for', 'IN'), ('building', 'VBG'), ('Python', 'NNP'), ('programs', 'NNS'), ('to', 'TO'), ('work', 'VB'), ('with', 'IN'), ('human', 'JJ'), ('language', 'NN'), ('data', 'NNS'), ('.', '.')]


##### spaCy

In [4]:
# Sentence to run through the spaCy pipeline.
text = "spaCy is an open-source library for advanced Natural Language Processing in Python."

doc = nlp(text)

# Collect one (text, coarse POS, fine-grained tag) triple per token.
pos_tags_spacy = []
for token in doc:
    pos_tags_spacy.append((token.text, token.pos_, token.tag_))

print(pos_tags_spacy)

[('spaCy', 'INTJ', 'UH'), ('is', 'AUX', 'VBZ'), ('an', 'DET', 'DT'), ('open', 'ADJ', 'JJ'), ('-', 'PUNCT', 'HYPH'), ('source', 'NOUN', 'NN'), ('library', 'NOUN', 'NN'), ('for', 'ADP', 'IN'), ('advanced', 'ADJ', 'JJ'), ('Natural', 'PROPN', 'NNP'), ('Language', 'PROPN', 'NNP'), ('Processing', 'PROPN', 'NNP'), ('in', 'ADP', 'IN'), ('Python', 'PROPN', 'NNP'), ('.', 'PUNCT', '.')]


### Chunking
Chunking (or shallow parsing) takes the POS-tagged words and groups them into larger units such as noun phrases (NP), verb phrases (VP), etc.

##### NLTK

In [5]:
# Chunk grammar: an NP is an optional determiner (DT), any number of
# adjectives (JJ), then one NN-tagged token. Tokens tagged NNS or NNP are not
# matched by this pattern and stay outside the chunks.
grammar = "NP: {<DT>?<JJ>*<NN>}"

# Build a regular-expression chunk parser from the grammar
cp = nltk.RegexpParser(grammar)

# Chunk the (token, tag) pairs produced by the NLTK tagging cell above
tree = cp.parse(pos_tags_nltk)

# Print the chunk tree in bracketed form
print(tree)

# Render the tree inline as text. `tree.draw()` was used here before; it opens
# a separate Tkinter window, which stalls "Restart & Run All" until the window
# is closed and fails on kernels that have no display.
tree.pretty_print()

(S
  NLTK/NNP
  is/VBZ
  a/DT
  leading/VBG
  (NP platform/NN)
  for/IN
  building/VBG
  Python/NNP
  programs/NNS
  to/TO
  work/VB
  with/IN
  (NP human/JJ language/NN)
  data/NNS
  ./.)


##### spaCy

In [7]:
text = "The quick brown fox jumps over the lazy dog."

# Run the sentence through the spaCy pipeline
doc = nlp(text)

# Report each noun chunk together with its root token, that token's
# dependency label, and the text of the root's syntactic head.
for chunk in doc.noun_chunks:
    chunk_root = chunk.root
    print(f"Chunk: {chunk.text}, Root Text: {chunk_root.text}, Root Dep: {chunk_root.dep_}, Root Head Text: {chunk_root.head.text}")

Chunk: The quick brown fox, Root Text: fox, Root Dep: nsubj, Root Head Text: jumps
Chunk: the lazy dog, Root Text: dog, Root Dep: pobj, Root Head Text: over


### Text Embedding
Text embedding is the process of converting text into numerical vectors that can be used by machine learning models.


##### NLTK corpus + gensim Word2Vec

In [8]:
from gensim.models import Word2Vec
from nltk.corpus import brown

# Sentences of the Brown corpus are the training data
sentences = brown.sents()

# Fit a Word2Vec model on those sentences
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)

# Keyed vectors of the trained model, used for both lookups below
word_vectors = model.wv

# Embedding learned for the word 'king'
vector = word_vectors['king']
print(vector)

# Five entries most similar to 'king'
similar_words = word_vectors.most_similar('king', topn=5)
print(similar_words)

[ 0.12400028  0.13036624  0.10375945  0.06526087 -0.01802809 -0.10003968
  0.11321629  0.31538635 -0.13024324 -0.17027508  0.0204285  -0.18054175
 -0.0557485   0.09536618  0.19326502 -0.06983682 -0.00352094 -0.19092159
 -0.14799626 -0.2168642   0.14356326  0.11956564  0.297104    0.1323912
 -0.08210941 -0.04149396 -0.24066937 -0.09875017  0.01496314  0.13318369
  0.19893812 -0.1008561   0.18726346 -0.22913675 -0.01513384  0.18489641
 -0.04390021  0.06190151 -0.1545404  -0.04611038  0.05350776 -0.19813696
 -0.0088488   0.04358754  0.10623358  0.01677817 -0.06863146  0.01468717
  0.04589223  0.10192411  0.08500671 -0.1329406  -0.1591965  -0.04625204
 -0.00442203 -0.07751082  0.11744513 -0.00382323 -0.03493624  0.06209708
 -0.03585383  0.15543246  0.00891487 -0.03393736 -0.1531956   0.15026395
  0.17832513  0.18629818 -0.29575554  0.27191487  0.10920096  0.06182568
  0.20571443 -0.04702065  0.11151341  0.03914363  0.00972655  0.10746009
  0.03979657 -0.04641774 -0.1034132   0.10573719 -0.

##### spaCy

In [9]:
# Word whose vector is inspected
text = "king"

# Run it through the pipeline and print the document-level vector
doc = nlp(text)
vector = doc.vector
print(vector)

# Compare two single-word documents.
# NOTE(review): `nlp` is the small "en_core_web_sm" pipeline loaded at the top
# of the notebook. Small spaCy pipelines reportedly ship without static word
# vectors, in which case this score is not a word-vector similarity — confirm,
# and consider a pipeline that includes vectors.
text1, text2 = nlp("king"), nlp("queen")
similarity = text1.similarity(text2)
print(f"Similarity between 'king' and 'queen': {similarity}")

[-1.8038062  -0.57184035 -1.0615969   0.1685325  -0.73480105 -0.07565041
  2.47716     0.34800076 -0.14537281  0.05508129  0.9790832  -1.0621793
 -0.90454197  0.63134384 -0.16790569  0.12966211 -0.7590473  -1.0194731
  1.8497288   0.68561244  0.42840576  1.9528611   0.71009314 -0.49444747
  0.15484339  1.1637604   0.32334328  1.9428613  -0.4038114  -0.40275377
  0.03499427 -0.2542291   0.72335136 -0.19704051 -0.38484254 -2.0028517
 -0.04508871  0.602212    0.4824512   0.08982205 -0.30942625  0.5462789
 -0.3974073   0.15101105 -0.75147295 -1.0158409  -1.3755283   1.5071571
  0.07189959  0.38855618  0.22438756  0.430369    0.81013995 -1.1247113
 -0.4688068  -1.1177742   1.0850023   0.25971973  0.28847817 -0.87373275
 -0.9647601  -0.4774765   0.7437953  -0.5322264   0.4202554   0.10787368
 -0.26062754  0.47333872  0.49753362 -2.0549963   0.1926519   0.16442215
  1.0940242  -0.5006186  -0.28406495 -0.84961617  0.00315695  0.35117477
  0.46972567 -1.1229045  -0.2193472  -0.6019075  -1.93698

  similarity = text1.similarity(text2)
