# text segmentation

In [5]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# test input text
text = "Hello world! This is a test. Let's see how it works."
sen_token = sent_tokenize(text)
word_token = word_tokenize(text)

# print the results for sentence and word tokenization
print("Sentence Tokenization:")
for i, sentences in enumerate(sen_token):
    print(f'sentence {i} : {sentences}')

print("\nWord Tokenization:")
print("word tokens are like", word_token)
for i, words in enumerate(word_token):
    print(f'word {i} : {words}')

Sentence Tokenization:
sentence 0 : Hello world!
sentence 1 : This is a test.
sentence 2 : Let's see how it works.

Word Tokenization:
word tokens are like ['Hello', 'world', '!', 'This', 'is', 'a', 'test', '.', 'Let', "'s", 'see', 'how', 'it', 'works', '.']
word 0 : Hello
word 1 : world
word 2 : !
word 3 : This
word 4 : is
word 5 : a
word 6 : test
word 7 : .
word 8 : Let
word 9 : 's
word 10 : see
word 11 : how
word 12 : it
word 13 : works
word 14 : .


# Lower case conversion and punctuation, url, number and emoji remove

In [14]:
text2 = "Hey!! Visit https://example.com right now 😄🔥 — I 017 got 2 amazing deals at 50% @off!! 🎉💥"

import re 
url_pattern = r'https?://[^\s]+|www\.[^\s]+'
punctuation_pattern = r'[^\w\s]'    
digits_pattern = r'\d+' 
email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
phone_pattern = r'\b\d{10}\b|\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b'
emoji_pattern = r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U00002700-\U000027BF]'
hashtag_pattern = r'#\w+'
mention_pattern = r'@\w+'


text2 = re.sub(url_pattern, '', text2)
text2 = re.sub(punctuation_pattern, '', text2)
text2 = re.sub(r'\s+', ' ', text2).strip()
text2 = re.sub(digits_pattern, '', text2)
text2 = re.sub(email_pattern, '', text2)
text2 = re.sub(phone_pattern, '', text2)
text2 = re.sub(emoji_pattern, '', text2)
text2 = re.sub(hashtag_pattern, '', text2)
text2 = re.sub(mention_pattern, '', text2)

lower_text = text2.lower().strip()
print("\nLowercase Text:")
print(lower_text)

# Tokenization
tokens = word_tokenize(lower_text)
print("\nTokenized Text:")
print(tokens)



Lowercase Text:
hey visit right now i  got  amazing deals at  off

Tokenized Text:
['hey', 'visit', 'right', 'now', 'i', 'got', 'amazing', 'deals', 'at', 'off']


# POS TAG

In [19]:
import warnings
warnings.filterwarnings("ignore")
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger')
#POS TAG SENTENCE
sentence = "The quick brown fox jumps over the lazy dog."
tokens = word_tokenize(sentence)
pos_tags = pos_tag(tokens)
print("\nPOS Tags:")
for word, tag in pos_tags:
    print(f'{word}: {tag}')



POS Tags:
The: DT
quick: JJ
brown: NN
fox: NN
jumps: VBZ
over: IN
the: DT
lazy: JJ
dog: NN
.: .


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sajib\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# stop word remove 

In [None]:
from nltk.corpus import stopwords  
nltk.download('stopwords')

print("applying text", lower_text)
# Remove stop words
stop_words = set(stopwords.words('english'))
words_token = word_tokenize(lower_text)
filtred_word = [words for words in words_token if words not in stop_words]
joint_sentence = " ".join(filtred_word)
print("\nFiltered Words:")
print(filtred_word)
print("\nFiltered Sentence:")
print(joint_sentence)   

applying text hey visit right now i  got  amazing deals at  off

Filtered Words:
['hey', 'visit', 'right', 'got', 'amazing', 'deals']

Filtered Sentence:
hey visit right got amazing deals


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sajib\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# text normalization

In [24]:
text = "Hellooo worlddd! This is a test. Let's seeeee how it works. luv ya!"

def normalize_text(text):
    lower_text = text.lower()
    # Remove multiple letters
    lower_text = re.sub(r'(.)\1+', r'\1', lower_text)  # Replace repeated characters with two occurrences
    # Normalize repeated characters
    lower_text = re.sub(r'([a-zA-Z])\1{2,}', r'\1\1', lower_text)  # Replace three or more repeated characters with two occurrences
    # abbreviations
    abbreviations = {
        "u": "you",
        "r": "are",
        "l8r": "later",
        "b4": "before",
        "gr8": "great",
        "omg": "oh my god",
        "btw": "by the way",
        "idk": "I don't know",
        "imo": "in my opinion",
        "tbh": "to be honest",
        "thx": "thanks",
        "pls": "please",
        "luv": "love"
    }
    for abbr, full in abbreviations.items():
        lower_text = re.sub(r'\b' + re.escape(abbr) + r'\b', full, lower_text)
    return lower_text
normalize_text(text)
    

"helo world! this is a test. let's se how it works. love ya!"

# lemmatization

In [29]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')

text = "The cats are playing with the ball. The dogs are barking at the moon. and filled troubled"
tokens = word_tokenize(text)
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stemmed_words = [stemmer.stem(word) for word in tokens]
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]

# join the stemmed and lemmatized words into a single string
stemmed_sentence = " ".join(stemmed_words)
lemmatized_sentence = " ".join(lemmatized_words)
print("\nStemmed Sentence:")
print(stemmed_sentence)
print("\nLemmatized Sentence:")
print(lemmatized_sentence)

print("\nStemmed Words:")
print(stemmed_words)
for word, stemmed in zip(tokens, stemmed_words):
    print(f'{word} -> {stemmed}')

print("\nLemmatized Words:")
for word, lemmatized in zip(tokens, lemmatized_words):
    print(f'{word} -> {lemmatized}')



Stemmed Sentence:
the cat are play with the ball . the dog are bark at the moon . and fill troubl

Lemmatized Sentence:
The cat are playing with the ball . The dog are barking at the moon . and filled troubled

Stemmed Words:
['the', 'cat', 'are', 'play', 'with', 'the', 'ball', '.', 'the', 'dog', 'are', 'bark', 'at', 'the', 'moon', '.', 'and', 'fill', 'troubl']
The -> the
cats -> cat
are -> are
playing -> play
with -> with
the -> the
ball -> ball
. -> .
The -> the
dogs -> dog
are -> are
barking -> bark
at -> at
the -> the
moon -> moon
. -> .
and -> and
filled -> fill
troubled -> troubl

Lemmatized Words:
The -> The
cats -> cat
are -> are
playing -> playing
with -> with
the -> the
ball -> ball
. -> .
The -> The
dogs -> dog
are -> are
barking -> barking
at -> at
the -> the
moon -> moon
. -> .
and -> and
filled -> filled
troubled -> troubled


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sajib\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sajib\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# other lemmatization

In [42]:
# import spacy
# nlp = spacy.load("en_core_web_sm")


In [41]:
# import spacy
# nlp = spacy.load("en_core_web_sm")
# text = "The cats are playing with the ball. The dogs are barking at the moon. and filled troubled"
# doc = nlp(text)
# for token in doc:
#     print(f'{token.text} -> {token.lemma_}')
# lemmatized_sentence = " ".join([token.lemma_ for token in doc])
# print("\nLemmatized Sentence using Spacy:")
# print(lemmatized_sentence)



In [43]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

lemmatizer = WordNetLemmatizer()
text = "The cats are playing with the ball. The dogs are barking at the moon. and filled troubled"

tokens = word_tokenize(text)

lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
print(" ".join(lemmatized))


The cat are playing with the ball . The dog are barking at the moon . and filled troubled
