<a href="https://colab.research.google.com/github/RomanKunal/Deep-Learning/blob/main/NLP_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TEXT PREPROCESSING
1. LowerCasing
2. Remove HTML Tags
3. Remove URLs
4. Remove Punctuation
5. Chat Word Treatment (expanding abbreviations such as ASAP)
6. Spelling Correction
7. Removing stop words
8. Handling Emojis
9. Tokenization

In [1]:
#LowerCasing
# str.lower() returns a new lowercased string; it is only displayed as the cell
# output here, so `text` itself keeps its original casing.
text='The gentle breeze swirled through the trees, rustling the leaves. Birds chirped in harmony as the sun set, casting soft shadows over the quiet, peaceful landscape. Everything felt serene and timeless.'
text.lower()

'the gentle breeze swirled through the trees, rustling the leaves. birds chirped in harmony as the sun set, casting soft shadows over the quiet, peaceful landscape. everything felt serene and timeless.'

In [2]:
#Removing Html Tags
import re
text='<p>The <strong>sun</strong> rises over the horizon, casting a <em>golden</em> glow. <a href="#">Nature</a> awakens, and birds begin to <u>sing</u> in the early morning light.</p>'

# Compiled once instead of on every call. `[^>]*` (rather than `.*?`) also
# matches newlines, so a tag whose attributes wrap onto the next line is removed.
_HTML_TAG_RE=re.compile(r'<[^>]*>')

def removal_html_tags(text):
  """Return `text` with every `<...>` span removed.

  Args:
    text: string that may contain HTML/XML-style tags.

  Returns:
    The string with each span from `<` up to the next `>` deleted; text
    between tags is kept as-is.
  """
  return _HTML_TAG_RE.sub('',text)
removal_html_tags(text)

'The sun rises over the horizon, casting a golden glow. Nature awakens, and birds begin to sing in the early morning light.'

In [4]:
#Removing urls
text='For learning programming, visit https://www.w3schools.com for tutorials on HTML, CSS, and JavaScript'

# Compiled once. IGNORECASE because the scheme and the "www." prefix may be
# written in any case (e.g. "HTTPS://..." or "WWW.example.org").
_URL_RE=re.compile(r'https?://\S+|www\.\S+',re.IGNORECASE)

def remove_url(text):
  """Return `text` with URLs deleted.

  A URL is any run of non-whitespace characters that starts with `http://`,
  `https://` or `www.` (case-insensitive). Only the URL itself is removed, so
  the whitespace on either side of it stays in the result.

  Args:
    text: input string.

  Returns:
    The string with every matched URL replaced by the empty string.
  """
  return _URL_RE.sub('',text)
remove_url(text)


'For learning programming, visit  for tutorials on HTML, CSS, and JavaScript'

In [6]:
#Removing Punctuations
import string

# Sample defined in this cell so the demo does not depend on whichever `text`
# an earlier cell happened to leave in the kernel.
text='For learning programming, visit https://www.w3schools.com for tutorials on HTML, CSS, and JavaScript'

# Deletion table built once: every character of string.punctuation maps to None.
_PUNCT_TABLE=str.maketrans('','',string.punctuation)

def remove_pun(text):
  """Return `text` with every character of `string.punctuation` removed.

  Uses a single `str.translate` pass instead of one `str.replace` pass per
  punctuation character.

  Args:
    text: input string.

  Returns:
    The string without ASCII punctuation; all other characters are unchanged.
  """
  return text.translate(_PUNCT_TABLE)
remove_pun(text)

'For learning programming visit httpswwww3schoolscom for tutorials on HTML CSS and JavaScript'

In [8]:
#ChatWord Removal
import string

# Upper-case chat abbreviation -> its long form.
chat_words={
    'AFAIK':'As Far As I Know',
    'AFK':'Away From Keyboard',
    'ASAP':'As Soon As Possible',
    'ATK':'At The Keyboard',
    'ATM':'At The Moment'}

def remove_chat_words(text):
  """Expand chat abbreviations found in `text` using `chat_words`.

  The text is split on whitespace and each word is looked up
  case-insensitively. A word that only fails to match because punctuation is
  attached to it ("asap!", "(afk)") is looked up again without the
  surrounding punctuation, and the punctuation is kept around the expansion.

  Args:
    text: input string.

  Returns:
    The words joined back with single spaces, abbreviations replaced by
    their long form.
  """
  new_text=[]
  for word in text.split():
    key=word.upper()
    if key in chat_words:
      new_text.append(chat_words[key])
      continue
    # Second chance: ignore leading/trailing punctuation for the lookup only.
    core=word.strip(string.punctuation)
    expansion=chat_words.get(core.upper()) if core else None
    if expansion is None:
      new_text.append(word)
    else:
      # `core` starts with a non-punctuation character and everything before
      # it is punctuation, so the first occurrence is the stripped span itself.
      new_text.append(word.replace(core,expansion,1))
  return " ".join(new_text)
text='Asap i need my shoes'
remove_chat_words(text)

'As Soon As Possible i need my shoes'

In [9]:
#Spelling correction
incorrext_text='ceertain new thingggs are theere'
from textblob import TextBlob
def correct_spelling(text):
  """Run TextBlob's spelling correction over `text` and return a plain `str`."""
  blob=TextBlob(text)
  corrected=blob.correct()
  return str(corrected)
correct_spelling(incorrext_text)

'certain new things are there'

In [17]:
#Handling Emojis
!pip install emoji
import emoji
print(emoji.demojize('Python is 👍'))
#

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0
Python is :thumbs_up:


In [23]:
#Stop Words Removal
import spacy

# Small English pipeline; its tokens carry the is_stop / is_punct flags used below
nlp = spacy.load("en_core_web_sm")

# Sentence to filter
text = "For learning programming, visit https://www.w3schools.com for tutorials on HTML, CSS, and JavaScript."

# Tokenize and annotate the sentence
doc = nlp(text)

# Keep the surface text of every token that is neither a stop word nor punctuation
filtered_tokens = [tok.text for tok in doc if not (tok.is_stop or tok.is_punct)]

# Re-assemble the surviving tokens into one space-separated string
filtered_text = " ".join(filtered_tokens)

print("Original Text:", text)
print("Filtered Text:", filtered_text)


Original Text: For learning programming, visit https://www.w3schools.com for tutorials on HTML, CSS, and JavaScript.
Filtered Text: learning programming visit https://www.w3schools.com tutorials HTML CSS JavaScript


In [21]:
#Tokenization
import spacy
nlp=spacy.load('en_core_web_sm')

# Sample sentence defined in this cell. The cell used to tokenize whatever
# `text` an earlier cell had left in the kernel, so its output changed with
# execution order; this sentence is the one shown in the recorded output.
text='My name is blah blah and for sure i want idk'

# Calling the pipeline returns a Doc; iterating it yields one token at a time.
doc=nlp(text)
for token in doc:
  print(token)

My
name
is
blah
blah
and
for
sure
i
want
idk


In [24]:
#Stemming
import spacy
from nltk.stem import PorterStemmer

# spaCy is used here only to split the text into tokens
nlp = spacy.load("en_core_web_sm")

# NLTK's Porter stemmer does the actual stemming
stemmer = PorterStemmer()

# Inflected forms of the same few words
text = "running runners flies flying studied studying"

# Tokenize
doc = nlp(text)

# Stem the surface form of each token, in order
stems = []
for token in doc:
    stems.append(stemmer.stem(token.text))

# Show the tokens next to their stems
print("Original Words:", [token.text for token in doc])
print("Stemmed Words:", stems)


Original Words: ['running', 'runners', 'flies', 'flying', 'studied', 'studying']
Stemmed Words: ['run', 'runner', 'fli', 'fli', 'studi', 'studi']


In [25]:
#Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
text='My name is Kunal and i do gym hehehe'
cv=CountVectorizer()
# fit_transform expects an iterable of documents, hence the one-element list.
# It learns the vocabulary and returns the document-term count matrix.
bag_of_words=cv.fit_transform([text])
# vocabulary_ maps each term to its column index. Terms are lowercased, and the
# one-letter word "i" is absent from the printed vocabulary -- presumably
# dropped by CountVectorizer's default token pattern (tokens of 2+ characters);
# see the scikit-learn docs.
print(cv.vocabulary_)
# Sparse matrix repr: "(row, column)  count" for every non-zero entry.
print(bag_of_words)

{'my': 6, 'name': 7, 'is': 4, 'kunal': 5, 'and': 0, 'do': 1, 'gym': 2, 'hehehe': 3}
  (0, 6)	1
  (0, 7)	1
  (0, 4)	1
  (0, 5)	1
  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1


In [26]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Example documents
documents = [
    "Natural Language Processing is fun and exciting.",
    "I love learning about Natural Language Processing.",
    "Text mining and NLP techniques are powerful tools."
]

# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the documents to compute the (sparse) TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(documents)

# Convert to a regular 2-D ndarray. toarray() is used instead of todense()
# because todense() returns a numpy.matrix, whose rows stay 2-D and therefore
# need [0, j] indexing.
dense_matrix = tfidf_matrix.toarray()

# Get the feature names (words); position k labels column k of the matrix
words = vectorizer.get_feature_names_out()

# Print TF-IDF values for each document
print("TF-IDF Matrix (Dense Representation):")
print(dense_matrix)

# Print words corresponding to the columns of the TF-IDF matrix
print("\nWords in the TF-IDF Matrix:", words)

# Display TF-IDF for each word in each document. The row variable is not
# called `doc`, so it does not overwrite the spaCy `doc` used by other cells.
for i, row in enumerate(dense_matrix):
    print(f"\nDocument {i+1} TF-IDF Scores:")
    for word, score in zip(words, row):
        print(f"{word}: {score:.4f}")


TF-IDF Matrix (Dense Representation):
[[0.         0.32992832 0.         0.43381609 0.43381609 0.43381609
  0.32992832 0.         0.         0.         0.32992832 0.
  0.         0.32992832 0.         0.         0.        ]
 [0.45954803 0.         0.         0.         0.         0.
  0.34949812 0.45954803 0.45954803 0.         0.34949812 0.
  0.         0.34949812 0.         0.         0.        ]
 [0.         0.27626457 0.36325471 0.         0.         0.
  0.         0.         0.         0.36325471 0.         0.36325471
  0.36325471 0.         0.36325471 0.36325471 0.36325471]]

Words in the TF-IDF Matrix: ['about' 'and' 'are' 'exciting' 'fun' 'is' 'language' 'learning' 'love'
 'mining' 'natural' 'nlp' 'powerful' 'processing' 'techniques' 'text'
 'tools']

Document 1 TF-IDF Scores:
about: 0.0000
and: 0.3299
are: 0.0000
exciting: 0.4338
fun: 0.4338
is: 0.4338
language: 0.3299
learning: 0.0000
love: 0.0000
mining: 0.0000
natural: 0.3299
nlp: 0.0000
powerful: 0.0000
processing: 0.3299

In [27]:
#Word2Vec
from gensim.models import Word2Vec

# Sample corpus (list of tokenized sentences)
sentences = [
    ['dog', 'barks'],
    ['cat', 'meows'],
    ['dog', 'chases', 'cat'],
    ['cat', 'climbs', 'tree'],
]

# Train a Word2Vec model: 100-dimensional vectors, context window of 5, and
# min_count=1 so that words occurring only once are still kept.
# NOTE(review): training is randomly initialised and workers=4 trains on
# several threads; the gensim docs describe a fixed seed plus workers=1 as the
# setup for reproducible vectors -- confirm whether the printed numbers below
# are expected to be reproducible.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Get the vector representation for the word 'dog'
dog_vector = model.wv['dog']
print("Vector for 'dog':", dog_vector)

# Find similar words to 'dog' (the 3 nearest words with their similarity scores)
# NOTE(review): with a four-sentence corpus these neighbours are presumably
# not meaningful; the cell only demonstrates the API.
similar_words = model.wv.most_similar('dog', topn=3)
print("Words similar to 'dog':", similar_words)


Vector for 'dog': [-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7104042e-03
 -8.5343346e-03  3.2071066e-03 -4.6379971e-03 -5.0889552e-03
  3.58

In [28]:
#Part of speech tagging
import spacy

# Load the SpaCy model for English (loaded here as well, so this cell does not
# rely on `nlp` from an earlier cell)
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "The quick brown fox jumps over the lazy dog."

# Process the text using SpaCy; the returned Doc carries per-token annotations
doc = nlp(text)

# Iterate over the tokens and print the word along with its POS tag.
# token.pos_ is the tag as a string (DET, ADJ, NOUN, ... in the output below);
# punctuation is a token too and is tagged PUNCT.
for token in doc:
    print(f"Word: {token.text}, POS: {token.pos_}")


Word: The, POS: DET
Word: quick, POS: ADJ
Word: brown, POS: ADJ
Word: fox, POS: NOUN
Word: jumps, POS: VERB
Word: over, POS: ADP
Word: the, POS: DET
Word: lazy, POS: ADJ
Word: dog, POS: NOUN
Word: ., POS: PUNCT


In [29]:
#NER (named entity recognition)
import spacy

# Load the pre-trained English model (loaded here as well, so this cell does
# not rely on `nlp` from an earlier cell)
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "Apple Inc. is looking to acquire a startup in Berlin for $2 billion. Tim Cook announced the plans last week."

# Process the text
doc = nlp(text)

# Print the entities found in the text. doc.ents yields the detected entity
# spans, which may cover several tokens ("Apple Inc.", "$2 billion");
# ent.label_ is the entity type as a string (ORG, GPE, MONEY, PERSON, DATE in
# the output below).
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")


Entity: Apple Inc., Label: ORG
Entity: Berlin, Label: GPE
Entity: $2 billion, Label: MONEY
Entity: Tim Cook, Label: PERSON
Entity: last week, Label: DATE
