<a href="https://colab.research.google.com/github/Syed-Mansoor/nlp/blob/main/Text_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'imdb-dataset-of-50k-movie-reviews:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F134715%2F320111%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240806%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240806T095320Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1aa126fab63b0a4f097a53f57f5bc832c49d46f45d188fd951e9e9409184baec85423c1973ea7556bc2115b5bd7b25d696976cae9d1af94b112b3120a79fc7a4030b8ab8609c8e93ae390ceb0b443bd8061048efb3a1e872363bcf19ead673297f7e7f4dced207017a22238bbc4c8a4ba9f24db47c1e1e2d46e2bcfe55843019e153d91cd95f5596485039aae9db375df27b82575187ccf69cf17c850ceb99ab8b71218cb325c5142961949b25ab648975b4d9349dc8894347f807d37f196455bea62fc9352d18e2d7b553ba17322a74e02b90839af7ca2136d7fae08661f8922b9c2d25c2d48b2f13370965e349e2f89ba1c7a83d9a0420157670de92c37539'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading imdb-dataset-of-50k-movie-reviews, 26962657 bytes compressed
Downloaded and uncompressed: imdb-dataset-of-50k-movie-reviews
Data source import complete.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [3]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Lower Case

In [4]:
df['review'] = df['review'].str.lower()

# Remove HTML Tags

In [5]:
import re

def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

# Example usage
html_content = "<p>This is a <b>sample</b> text with <a href='#'>HTML tags</a>.</p>"
clean_text = remove_html_tags(html_content)
print(clean_text)


This is a sample text with HTML tags.


In [6]:
df['review'] = df['review'].apply(remove_html_tags)

In [7]:
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

# Remove URL's

In [8]:
import re

def remove_urls(text):
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    return re.sub(url_pattern, '', text)

# Example usage
text_with_urls = "Check out this link: https://example.com and also visit http://example.org."
clean_text = remove_urls(text_with_urls)
print(clean_text)


Check out this link:  and also visit 


In [9]:
df['review']  = df['review'].apply(remove_urls)

In [10]:
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

# Remove Punctuation

In [11]:
import string,time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
exclude = string.punctuation

In [13]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char,'')
    return text

text = 'string. with. punctuation ?'
print(remove_punc(text))

string with punctuation 


In [14]:
df['review'] = df['review'].apply(remove_punc)

- . Second Way to Remove Punctuation

In [15]:
def remove_punc1(text):
    return text.translate(str.maketrans('','',exclude))

In [16]:
start = time.time()
remove_punc1(text)
time = time.time() - start
print(f'{time :}')

0.00012969970703125


# Chat Word Treatment

In [17]:
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "LOL": "Laughing out loud",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing"
}


In [18]:
chat_words

{'AFAIK': 'As Far As I Know',
 'AFK': 'Away From Keyboard',
 'ASAP': 'As Soon As Possible',
 'ATK': 'At The Keyboard',
 'ATM': 'At The Moment',
 'A3': 'Anytime, Anywhere, Anyplace',
 'BAK': 'Back At Keyboard',
 'BBL': 'Be Back Later',
 'BBS': 'Be Back Soon',
 'BFN': 'Bye For Now',
 'B4N': 'Bye For Now',
 'BRB': 'Be Right Back',
 'BRT': 'Be Right There',
 'BTW': 'By The Way',
 'B4': 'Before',
 'CU': 'See You',
 'CUL8R': 'See You Later',
 'CYA': 'See You',
 'FAQ': 'Frequently Asked Questions',
 'FC': 'Fingers Crossed',
 'FWIW': "For What It's Worth",
 'FYI': 'For Your Information',
 'GAL': 'Get A Life',
 'GG': 'Good Game',
 'GN': 'Good Night',
 'GMTA': 'Great Minds Think Alike',
 'GR8': 'Great!',
 'G9': 'Genius',
 'IC': 'I See',
 'ICQ': 'I Seek you (also a chat program)',
 'ILU': 'I Love You',
 'IMHO': 'In My Honest/Humble Opinion',
 'IMO': 'In My Opinion',
 'IOW': 'In Other Words',
 'IRL': 'In Real Life',
 'KISS': 'Keep It Simple, Stupid',
 'LDR': 'Long Distance Relationship',
 'LMAO': 

In [19]:
def chat_conversation(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return ' '.join(new_text)

In [20]:
chat_conversation('BFF he is the best')

'Best friends forever he is the best'

In [21]:
df['review'] = df['review'].apply(chat_conversation)

# Spelling Correction

In [22]:
from textblob import TextBlob

In [23]:
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the same maner'

textblb = TextBlob(incorrect_text)
textblb.correct().string

'certain conditions during several generations are modified in the same manner'

In [24]:
from textblob import TextBlob

def correct_spelling_in_values(dataset):
    textblb = TextBlob(dataset)
    return textblb.correct().string

correct_spelling_in_values('gooog nighht')

'good night'

In [25]:
# df['review']= df['review'].apply(correct_spelling_in_values)

In [26]:
df['review'][0]

'one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they are right as this is exactly what happened with methe first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the wordit is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awayi would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare forget pretty pictur

# Removing Stop Words

In [30]:
from nltk.corpus import stopwords

In [33]:
import nltk
nltk.download('stopwords')

stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [34]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [35]:
remove_stopwords('probably my all-time favourite movew.')

'probably  all-time favourite movew.'

In [36]:
df['review'] = df['review'].apply(remove_stopwords)

# Remove Emojis with a meaning

In [36]:
import emoji

def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

text = "Hello 😊, how are you? 🌟"
clean_text = remove_emojis(text)
print(clean_text)  # Output: "Hello , how are you? "


# Remove emojis

In [37]:
import re

def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

text = "Hello 😊, how are you? 🌟"
clean_text = remove_emojis(text)
print(clean_text)  # Output: "Hello , how are you? "


Hello , how are you? 


# Tokenization

## 1. Using the split function

In [38]:
# word tokenization
sent1 = 'I am going to kahmir'
sent1.split()

['I', 'am', 'going', 'to', 'kahmir']

In [39]:
# sentence tokenization
sent2 = 'I live in kashmir.This place is heaven'
sent2.split(',')

['I live in kashmir.This place is heaven']

In [40]:
# problem with split function
sent3 = 'I am going to gulmarg'
sent3.split()

['I', 'am', 'going', 'to', 'gulmarg']

In [42]:
sent4 = "where do you think i should go? I have 3 day holiday"
sent4.split('.')

['where do you think i should go? I have 3 day holiday']

## 2. Regular Expression

In [43]:
from ssl import SSL_ERROR_WANT_X509_LOOKUP
import re
sent5 = 'I am going to tosamaidan'
tokens = re.findall("[\w']+",sent5)
tokens

['I', 'am', 'going', 'to', 'tosamaidan']

## 3. NLTK

In [46]:
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [47]:
sent1 = 'I am going to delhi!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'delhi', '!']

In [48]:
sent2 = "Sentence: Good morning! ☀️ Let's grab some coffee ☕ and get started with our work 💼."
sent_tokenize(sent2)

['Sentence: Good morning!',
 "☀️ Let's grab some coffee ☕ and get started with our work 💼."]

# some problem are there in nltk

## 4. Spacy

In [49]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [50]:
doc1 = nlp(sent1)
doc2 = nlp(sent2)
doc3 = nlp(sent3)
doc4 = nlp(sent4)
doc5 = nlp(sent5)

In [52]:
for token in doc2:
    print(token)

Sentence
:
Good
morning
!
☀
️
Let
's
grab
some
coffee
☕
and
get
started
with
our
work
💼
.


# Stemming


### in grammer inflection is the modification of as word to express different grammetical categories such as tense,case,voice,aspect,person,number,gender,and mood

### Stemming is the process of reducing inflection in words to their root forms such as mapping a group of words to the same stem even if the stem itself is not a valid word in the language

In [55]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [56]:
text = "The runners were running through the forest, chasing after their dreams. Each step they took brought them closer to their goals, yet the journey was filled with challenges. They struggled with their fears and overcame obstacles, knowing that perseverance would lead to success. The beauty of nature inspired them, and the chirping of birds filled their hearts with hope."

In [57]:
print(text)

The runners were running through the forest, chasing after their dreams. Each step they took brought them closer to their goals, yet the journey was filled with challenges. They struggled with their fears and overcame obstacles, knowing that perseverance would lead to success. The beauty of nature inspired them, and the chirping of birds filled their hearts with hope.


In [58]:
stem_words(text)

'the runner were run through the forest, chase after their dreams. each step they took brought them closer to their goals, yet the journey wa fill with challenges. they struggl with their fear and overcam obstacles, know that persever would lead to success. the beauti of natur inspir them, and the chirp of bird fill their heart with hope.'

# Lemitization

### lemitization: unlike stemming reduces the inflected words properly ensuring that the root word belongs to the language.In lemmatization root word is called lemma.A lemma (plural lemmas or lemmata) is the canonical form,dictionary form,or citation form of a set of words.

In [59]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Download necessary resources
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# Function to lemmatize text
def lemmatize_text(paragraph):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(paragraph)
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    return ' '.join(lemmatized_tokens)

# Example paragraph
paragraph = ("The runners were running through the forest, chasing after their dreams. "
             "Each step they took brought them closer to their goals, yet the journey was filled "
             "with challenges. They struggled with their fears and overcame obstacles, knowing "
             "that perseverance would lead to success. The beauty of nature inspired them, and "
             "the chirping of birds filled their hearts with hope.")

# Lemmatize the paragraph
lemmatized_paragraph = lemmatize_text(paragraph)
print(lemmatized_paragraph)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


The runner be run through the forest , chase after their dream . Each step they take brought them closer to their goal , yet the journey be fill with challenge . They struggle with their fear and overcame obstacle , know that perseverance would lead to success . The beauty of nature inspire them , and the chirp of bird fill their heart with hope .


In [63]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lematizer = WordNetLemmatizer()

sentence = 'He was running and eating ata same time.He has bad habits of swimming after playing long hours in the sun'

punctuation = '?:!.,;'
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuation:
        sentence_words.remove(word)
sentence_words
print("{0:20}{1:20}".format('Word','Lemma'))
for word in sentence_words:
    print("{0:20}{1:20}".format(word,wordnet_lematizer.lemmatize(word,pos = 'v')))

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
ata                 ata                 
same                same                
time.He             time.He             
has                 have                
bad                 bad                 
habits              habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
sun                 sun                 
