# Natural Language processing - Example

In [1]:
# Import the important libraries
import nltk
import string
import re

# text_lowercase :

In [2]:
# We lowercase the text to reduce the size of the vocabulary of our text data.
def text_lowercase(text):
    """Return a lowercase copy of ``text``.

    Folding case means variants such as "Hey" and "hey" end up as the same
    vocabulary entry.
    """
    lowered = text.lower()
    return lowered

input_str = "Hey, did you know that the summer break is coming? Amazing right !! It's only 5 more days !!"
text_lowercase(input_str)

"hey, did you know that the summer break is coming? amazing right !! it's only 5 more days !!"

# remove_numbers : 

In [3]:
# We can either remove numbers or convert the numbers into their textual representations.
# We can use regular expressions to remove the numbers.
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted.

    Only the digits themselves are removed; the whitespace around them is
    kept, so "are 3 balls" becomes "are  balls" (two spaces).
    """
    # Raw string so the backslash reaches the regex engine untouched;
    # \d+ matches one or more consecutive digit characters.
    return re.sub(r'\d+', '', text)

input_str = "There are 3 balls in this bag, and 12 in the other one."
remove_numbers(input_str)

'There are  balls in this bag, and  in the other one.'

# Convert numbers into words :

In [13]:
# We can also convert the numbers into words. This can be done by using the inflect library.

import inflect
p= inflect.engine()


def convert_num_in_words(text):
    """Spell out the digit-only tokens of ``text`` as words.

    The text is split on whitespace. Each token made up solely of digits is
    passed through the module-level inflect engine ``p``; every other token
    (including a number with punctuation attached, such as "12,") is kept
    as is. The tokens are then re-joined with single spaces.
    """
    return ' '.join(
        p.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    )

input_str = 'There are 3 balls in this bag, and 12 in the other one.'
convert_num_in_words(input_str)

        

'There are three balls in this bag, and twelve in the other one.'

# Remove punctuation :

In [14]:
# We remove punctuation so that we don’t have different forms of the same word. If we don’t remove the punctuation,
# then "been.", "been," and "been!" will be treated as separate words.

def remove_punctuation(text):
    """Return ``text`` with all punctuation characters deleted.

    The characters removed are exactly those in ``string.punctuation``.
    Whitespace is untouched, so the gaps left by removed marks remain.
    """
    # Map each punctuation code point to None, which str.translate treats as
    # "drop this character".
    deletions = {ord(mark): None for mark in string.punctuation}
    return text.translate(deletions)

input_str = "Hey, did you know that the summer break is coming? Amazing right !! It's only 5 more days !!"
remove_punctuation(input_str)

'Hey did you know that the summer break is coming Amazing right  Its only 5 more days '

# Remove whitespaces :

In [16]:
def remove_whitespace(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Leading and trailing whitespace is dropped as well.
    """
    words = text.split()
    return " ".join(words)

input_str = "   we    don't need   the     given     questions"
remove_whitespace(input_str)

"we don't need the given questions"

# Remove default stopwords :

In [None]:
import nltk
nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
    """Tokenize ``text`` and return the tokens that are not stop words.

    Args:
        text: The string to filter.

    Returns:
        A list of the tokens from ``word_tokenize(text)``, in their original
        order, excluding those found in NLTK's English stop-word list.
        Matching is case-sensitive: tokens are compared exactly as the
        tokenizer produced them.
    """
    # The corpus file id is the lowercase "english"; a capitalised id is not
    # found on case-sensitive file systems.
    stop_words = set(stopwords.words("english"))
    return [token for token in word_tokenize(text) if token not in stop_words]

example_text = "This is a sample sentence and we are going to remove the stopwords from this."
remove_stopwords(example_text)
    

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


# Stemming :

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()  # shared stemmer instance used by stem_words


def stem_words(text):
    """Tokenize ``text`` and return the Porter stem of each token.

    Args:
        text: The string to process.

    Returns:
        A list with one stem per token from ``word_tokenize(text)``, in the
        order the tokens appear.
    """
    return [stemmer.stem(token) for token in word_tokenize(text)]

text = 'data science uses scientific methods algorithms and many types of processes'
stem_words(text)

# Lemmatization :
     Like stemming, lemmatization converts a word to its root form. The difference is that lemmatization ensures that the root
     word belongs to the language, so we always get valid words when we use lemmatization.

In [None]:
import nltk
nltk.download()

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer  = WordNetLemmatizer()

def lemmatize_word(text):
    """Tokenize ``text`` and return the lemma of each token.

    Every token is lemmatized as a verb (``pos='v'``) with the module-level
    ``lemmatizer``.
    """
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return lemmas
text = 'data science uses scientific methods algorithms and many types of processes'
lemmatize_word(text)


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
