In [None]:
# NLP_PRACTICAL_5_TEXT_PROCESSING_TASKS
! pip install nltk

In [None]:
import nltk

In [None]:
from nltk.corpus import gutenberg

In [None]:
nltk.download('all')

In [None]:
# Check whether the gutenberg corpus is uploaded
nltk.corpus.gutenberg

In [None]:
# Determine the contents of gutenberg copora using fileids()
nltk.corpus.gutenberg.fileids()

In [None]:
# To know how many txt files in gutenberg
len(nltk.corpus.gutenberg.fileids())

### Open the three files for demonstration of text processing tasks and read the text using readlines()

In [None]:
# Create the objects to access the contents of txt files present in gutenberg
blake = gutenberg.open('blake-poems.txt')
bible = gutenberg.open('bible-kjv.txt')
hamlet = gutenberg.open('shakespeare-hamlet.txt')


In [None]:
blake=blake.readlines()
bible=bible.readlines()
hamlet=hamlet.readlines()

In [None]:
# Know the no. of lines in each
len(blake), len(bible), len(hamlet)

### We consider 'blake' for learning the text processing.

In [None]:
blake

In [None]:
# Check the no.of lines in blake
len(blake)

### Select one of the text say blake, which consists of poems, songs...etc

### Task 1: Basic preprocessing - remove all the empty newlines in the corpus and strip any newline characters from other lines using strip()

In [None]:
# To check for the blank lines in the text
for text in blake:
  print(text)
# Observe that len(blake) originally has no. of lines 1441 which includes balnk lines as well as empty strings

### Observe the large no. of blank lines in the text. Use strip() to remove them

### strip('\n') method is used to remove any newline characters (\n) from the beginning and end of the string.
### text.strip('\n') results in a new list where each element is a line from blake but without trailing newline characters.

In [None]:
for text in blake:
  print(text.strip('\n'))

### The filter(None, ...) function is applied to the list created by the list comprehension.
### Argument None used in filter indicates it will remove any elements that evaluate to False in a boolean context. In Python, empty strings ('') are considered to be False.
### Therefore, this step effectively removes any blank lines (empty strings) from the list.

### This is typically done in text processing to clean up the data and remove unnecessary whitespace or blank lines. By removing blank lines, it focuses the subsequent analysis on the actual content of the text. This is required for frequency analysis, tokenization, and other NLP operations.

In [None]:
blake = list(filter(None, [text.strip('\n') for text in blake]))

In [None]:
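# Observe that blank lines (which became empty strings after strip('\n')) have been removed;
# a line containing only spaces would still remain, because strip('\n') strips newlines only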
blake[0:20]

In [None]:
print(blake)

In [None]:
# Before strting with the next task let us know the length of each sentence(no. of characters in it)
for sentence in blake:
  print(sentence,len(sentence))

In [None]:
# Check length now after removing blank and \n -
# It has been reduced
len(blake)
# Observe it is less than the original length of 1441

In [None]:
# Try with other texts: bible, hamlet
hamlet = list(filter(None, [text.strip('\n') for text in hamlet]))
bible = list(filter(None, [text.strip('\n') for text in bible]))

In [None]:
len(blake),len(hamlet),len(bible)

### Task 2: Basic frequency analysis on the corpus now.
### Task 2.1: Computing the length of each sentence and then visualize this using a histogram.
### Task 2.2: Visualize the overall distribution of typical sentence or line lengths across the file blake/hamlet/bible.


In [None]:
# (2.1) Find the length of each sentence in the text - blake and make a list of all lengths
line_lengths_blake = [len(sentence) for sentence in blake]
print(line_lengths_blake)
# Observe that most of the sentences have around 17-42 characters and heighest no. of characters is from 2 - 8 .

In [None]:
# To visualize the overall distribution of typical sentence or line lengths across the text blake/hamlet/bible.
# import matplotlib.pyplot This line imports the `pyplot` module from the `matplotlib` library and assigns it the alias `plt`.
# This module provides functions for creating various types of plots, including histograms.
import matplotlib.pyplot as plt

In [None]:
# Draw the histogram to visualize the distribution of words.
plt.hist(line_lengths_blake)

### line_lengths_blake is a variable which contains a list or array of numbers, where each number represents the length of a line in the 'blake' text.
### plt.hist() function from the matplotlib.pyplot module is used to create a histogram. It takes the data (line_lengths_blake in this case) as input and automatically bins the data into different intervals (or "bins").
### It shows you how many lines in the "blake" text are of different lengths. The taller bars represent the line lengths that occur more frequently and shorter bars for less frequent line lengths.
### The histogram plot has
### X-axis: Represents the line lengths (or binned intervals of line lengths).
### Y-axis: Represents the frequency or count of lines falling within each bin.
### Bars: The height of each bar indicates the number of lines with lengths falling within that particular bin.

In [None]:
h_blake = plt.hist(line_lengths_blake,color='green')
# color='green'` sets the color of the histogram bars to green.

In [None]:
# Text Analysis for bible
line_lengths_bible = [len(sentence) for sentence in bible]
print(line_lengths_bible)
h_bible = plt.hist(line_lengths_bible,color='yellow')

### Task 2.3: Tokenize each sentence by splitting it into words and compute the length of each sentence to get the total words per sentence.

In [None]:
for sentence in blake:
  print(sentence.split(),len(sentence.split()))

In [None]:
# Tokenize all the senetnces
lines_blake = [sentence.split() for sentence in blake]

In [None]:
type(lines_blake)

In [None]:
lines_blake[:5]

In [None]:
lines_blake[0],lines_blake[1],lines_blake[2],lines_blake[3], lines_blake[4]

In [None]:
# Draw histogram again
h_blake_new= plt.hist(lines_blake[0:1])

In [None]:
tokens_per_line_blake = [len(sentence.split()) for sentence in blake]
print(tokens_per_line_blake)

In [None]:
type(tokens_per_line_blake)

### Based on the 2nd visualization below, we conclude that most sentences in blake have roughly 3 - 8 words, or tokens.

In [None]:
# Draw histogram again
h_blake_new= plt.hist(tokens_per_line_blake, color='orange')

In [None]:
# Tokenize all the senetnces
tokens_blake = [words.split() for words in blake]

In [None]:
type(tokens_blake)

In [None]:
tokens_blake[:5]

### Task 3: To determine the most common words in the blake corpus. We already have our sentences tokenized into words (lists of words).
### Task 3.1: The first step involves flattening this big list of lists (each list is a tokenized sentence of words) into one big list of words.
### Observe that empty strings are eliminated automatically

In [None]:
for sentence in tokens_blake:
  for word in sentence:
    print(word)

In [None]:
# Extract all the words and put it in a list
words_blake = [word for sentence in tokens_blake for word in sentence]

In [None]:
type(words_blake)

In [None]:
len(words_blake)

In [None]:
print(words_blake[:10])

### Task 3.2: Find the most frequent words invoke the counter from the collections - module

In [None]:
# To determine the most frequent words
from collections import Counter

In [None]:
# convert all the owrds into lower case
words_blake = [word.lower() for word in words_blake]

In [None]:
# Counter() creates a dictionary-like object with words as keys and their frequencies as values
c_blake = Counter(words_blake)

In [None]:
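# The printed representation of a Counter lists entries from most to least frequent
# (the Counter itself is not stored in sorted order; use most_common() to get a sorted list)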
print(c_blake)

In [None]:
c_blake.most_common(10)

### Task 4: To remove unwanted symbols and special characters in some of the words, use "re.sub"

In [None]:
# Import the regular expressopn modeule
import re

### re.sub() function is used for replacing occurrences of a pattern in a string with another string.
### It has three main arguments:
#### (1) pattern: The regular expression pattern to search for.
#### (2) replacement: The string to replace the matched pattern with.
##### (3) string: The input string on which the substitution is performed.
### where
### r'[^A-Za-z]' is the regular expression pattern.
#### r'' denotes a raw string, which is commonly used for regular expressions to avoid escaping special characters.
#### [^A-Za-z] is the actual pattern.
#### [] defines a character set.
#### ^ inside the character set means negation, i.e., match any character that is not in the set.
#### A-Za-z specifies all uppercase and lowercase letters, which helps to match any character that is not an alphabet (A-Z or a-z).
#### "": is the replacement string.
### In this case, it is an empty string. This means that any matched character (any non-alphabet) will be replaced with nothing, effectively removing it from the string.
#### word: This is the input string on which the substitution is applied. It represents the word that you want to process.


In [None]:
# Example
word="@ B. K. Birla College!!"
re.sub(r'[^A-Za-z]', "", word)
# It has removed blank space, ., @, "", and !!

In [None]:
for word in words_blake:
  print(re.sub(r'[^A-Za-z]', "", word))

#### Actual removal of the blank spaces, and special charcters.......

In [None]:
words_blake = list(filter(None, [re.sub(r'[^A-Za-z]', "", word) for word in words_blake]))

In [None]:
print(words_blake)

In [None]:
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
stopwords

In [None]:
words_blake = [word.lower() for word in words_blake if word.lower() not in stopwords]

In [None]:
words_blake[:20]

In [None]:
c_blake = Counter(words_blake)
c_blake.most_common(10)

###Task 5: TEXT WRANGLING


In [None]:
# TASK 5.1: WEB SCRAPPING
# leverage requests and retrieve the contents of this web page in Python.This is known as web scraping

In [None]:
import requests
# Fetch the raw HTML of the Project Gutenberg page.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be processed as if it were the book.
data = requests.get('http://www.gutenberg.org/cache/epub/8001/pg8001.html', timeout=30)
data.raise_for_status()
# .content is the undecoded response body (bytes)
content = data.content
print(content[1000:2200])

In [None]:
content[0:1000]

###it is extremely difficult to decipher the actual textual content in the web page, due to all the unnecessary HTML tags.
### We need to remove those tags. The BeautifulSoup library provides functions that help us remove these unnecessary tags with ease

In [None]:
# Removign HTML Tags. Import re
import re

In [None]:
# import BeautifulSoup from the bs4 library
from bs4 import BeautifulSoup

### Define the function for removing html tags

In [None]:
def strip_html_tags(text):
  """Return the visible text of an HTML document.

  Parameters:
    text: HTML markup (str or bytes) accepted by BeautifulSoup.

  Returns:
    The document text with <iframe> and <script> elements removed and every
    run of carriage returns / line feeds collapsed into a single newline.
  """
  soup = BeautifulSoup(text, "html.parser")
  # Detach embedded frames and scripts so their contents do not leak into the text
  for element in soup(['iframe', 'script']):
    element.extract()
  stripped_text = soup.get_text()
  # Inside a character class '|' is a literal pipe, not alternation, so the
  # class must list only the line-break characters; otherwise pipes in the
  # text would be turned into newlines as well.
  stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
  return stripped_text

In [None]:
clean_content = strip_html_tags(content)
print(clean_content[0:500])


### Compare this output with the raw web page content and see that we have successfully removed the unnecessary HTML tags. We now have a clean body of text which is easier to interpret and understand.

In [None]:
# TASK 5.2: TOKENIZATION : demonstartion of word and sentence tokenization for
# (i) inbuilt text (ii) sample text (iii) text from some other language

In [None]:
import nltk
from nltk.corpus import gutenberg
from pprint import pprint
import numpy as np

In [None]:
# (i) loading text corpora
alice = gutenberg.raw(fileids='carroll-alice.txt')

In [None]:
# To find total characters alice
print(len(alice))

In [None]:
# get first 100 characters
print(alice[0:100])

In [None]:
# get default sentence tokenizer
# Keep a reference to the function itself (no parentheses): the following cells
# call it as sent_df(text=...). Calling nltk.sent_tokenize() here with no
# arguments raises a TypeError because the required 'text' argument is missing.
sent_df=nltk.sent_tokenize

In [None]:
type(sent_df)

In [None]:
sent_df

In [None]:
# extract senetnces from alice
alice_sentences=sent_df(text=alice)

In [None]:
alice_sentences[:50]

In [None]:
print('Total sentences in alice text :',len(alice_sentences))
print('Sample alice text sentences :')
pprint(alice_sentences[0:5])


In [None]:
# (ii) Take some sample text
sample_text='We will discuss briefly about the basic syntax,structure and design philosphies. There is defined hierrachical syntax for Python code which you should remember when writing code! Python is really powerful programming language !'

In [None]:
type(sample_text)

In [None]:
# tokenizing with default toenizer 'sent_df'
sample_sentences=sent_df(text=sample_text)

In [None]:
print('Total sentences in sample text :',len(sample_sentences))
print('Sample text sentences :')
pprint(sample_sentences)

In [None]:
# (iii) Processing some other langauge text say German
from nltk.corpus import europarl_raw

In [None]:
nltk.download('all')

In [None]:
nltk.corpus.europarl_raw

In [None]:
german_text=europarl_raw.german.raw(fileids='ep-00-01-17.de')

In [None]:
print(len(german_text))

In [None]:
print(german_text[0:100])

In [None]:
german_sentences_def=sent_df(text=german_text,language='german')

In [None]:
german_sentences_def

In [None]:
# loading the german text tokenizer
german_tokenizer=nltk.data.load(resource_url='tokenizers/punkt/german.pickle')

In [None]:
# To verify the invoking of pretrained tokenizer
print (type(german_tokenizer))

In [None]:
# Apply the pretrained tokenizer on the german_text
german_sentences=german_tokenizer.tokenize(german_text)

In [None]:
german_sentences

In [None]:
# To verify whether the sentences obtained from the default tokenizer 'sent_df'
# are same as the sentences obtained from the pre-trained tokenizer
print(german_sentences_def==german_sentences)

In [None]:
for sent in german_sentences[0:10]:
  print(sent)

### Verify for the sample text generated above

In [None]:
# tokenizing sample text with default toenizer 'sent_df'
sample_sentences_def=sent_df(text=sample_text)

In [None]:
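# tokenizing sample text with a PunktSentenceTokenizer() instance
# NOTE: created without training text, this instance uses Punkt's default (untrained)
# parameters, so its output may differ from sent_df, which loads the pre-trained English model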
punkt_pretrained=nltk.tokenize.PunktSentenceTokenizer()

In [None]:
sample_sentences=punkt_pretrained.tokenize(sample_text)

In [None]:
# To verify
print(sample_sentences_def==sample_sentences)

In [None]:
pprint(sample_sentences)

#### Task 5.3 Word Tokenization verfication for both default and pre-trained Word Tokenizer(WT)

In [None]:
sentence="The brown fox wasn't quick and he couldn't win the race"

In [None]:
# Invoke default word tokenizer (wt)
default_wt=nltk.word_tokenize

In [None]:
# Create words through default wt
words_df=default_wt(sentence)
print(words_df)

### Using the Treebank Tokenizer
#### TreebankWordTokenizer is a specific class within NLTK that provides a rule-based word tokenizer: it uses regular expressions to tokenize text following the conventions of the Penn Treebank corpus (it is not a trained model). The Penn Treebank is a large annotated corpus of English text commonly used in natural language processing research.
#### Here we can define
#### treebank_wt, a variable that will hold the instance of the TreebankWordTokenizer class and use this variable to tokenize text using the Treebank tokenizer.


In [None]:
# Use the pre- trained tokenizer say TreebankTokenizer
treebank_wt=nltk.TreebankWordTokenizer()

In [None]:
# Create words using treebank_wt
words_treebank=treebank_wt.tokenize(sentence)
print(words_treebank)

In [None]:
# To verify the words of default and pretrained tokenizer
print(words_df==words_treebank)

In [None]:
# To remove the white spaces say space between could and n't
whitespaces_wt=nltk.WhitespaceTokenizer()

In [None]:
# Create words using whitespaces_wt
words_white=whitespaces_wt.tokenize(sentence)
print(words_white)

#### Task 6: Text Normalization

In [None]:
import nltk
nltk.download('punkt')
import re
import string
from pprint import pprint

In [None]:
# Creating our own corpus
my_corpus=["The brown fox wasn't quick and he couldn't win the race.",
"Hey it was a great cricket match @yesterday!!",
"I just bought a @new mobile for me at $1000.",
"Python NLP is really ****amazing*****!!@@@"]

In [None]:
type(my_corpus)

In [None]:
my_corpus[0], my_corpus[1], my_corpus[2], my_corpus[3]

In [None]:
# Tokenize the my_corpus
def tokenize_text(text):
  """Tokenize a text into sentences and each sentence into words.

  Returns a list with one entry per sentence; each entry is the list of
  word tokens produced by nltk.word_tokenize for that sentence.
  """
  word_tokens=[]
  for sentence in nltk.sent_tokenize(text):
    word_tokens.append(nltk.word_tokenize(sentence))
  return word_tokens

In [None]:
# Get the token list from the above def
token_list=[tokenize_text(text) for text in my_corpus]
print(token_list)

### Take the example of you choice and demonstrate the removal Special  Characters
### An important task in text normalization involves removing unnecessary and special characters.
### These may be special symbols or even punctuation that occurs in sentences.
### This step is often performed before or after tokenization. The main reason for doing so is because often punctuation or special characters do not have much significance when we analyze the text and utilize it for extracting features or information based on NLP and ML. It is possible to implement both types of special characters removal, before and after tokenization.

In [None]:
# Task 5.3: Removing special characters for any given sentence
def remove_special_characters(text, remove_digits=False):
  """Strip everything except letters, digits and whitespace from text.

  Parameters:
    text: the input string.
    remove_digits: when True, digits are removed as well (default False).

  Returns:
    The filtered string.
  """
  # The upper-case range must be A-Z: the range A-z also spans the ASCII
  # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
  pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
  return re.sub(pattern, '', text)

In [None]:
remove_special_characters("I just bought a @new mobile for me at $1000 dollars\n"
"Python NLP is really ****amazing*****!!@@@")

In [None]:
remove_special_characters('Mary had a @@@ little lamb !!#$%&*')

In [None]:
# Removing special characters before and after tokenization
import re
import string
from pprint import pprint

In [None]:
# To remove special characters  after  tokenization use re.compile, re.escape and format
def remove_characters_after_tokenization(tokens):
    """Remove punctuation characters from every token and drop emptied tokens.

    Each character listed in string.punctuation is deleted from each token.
    Returns a lazy filter object over the cleaned tokens; tokens that become
    empty strings are skipped. Wrap the result in list() to materialise it.
    """
    punctuation_class = '[{}]'.format(re.escape(string.punctuation))
    strip_punctuation = re.compile(punctuation_class).sub
    cleaned = []
    for token in tokens:
        cleaned.append(strip_punctuation('', token))
    return filter(None, cleaned)

### my_corpus=["The brown fox wasn't quick and he couldn't win the race.","Hey it was a great cricket match @yesterday!!", "I just bought a @new mobile for me at $1000.", "Python NLP is really ****amazing*****!!@@@"]

In [None]:
filtered_list_1=[list(filter(None,[list(remove_characters_after_tokenization(tokens))for tokens in sentence_tokens]))for sentence_tokens in token_list]
#token_list=[tokenize_text(text) for text in my_corpus]
print(filtered_list_1)

In [None]:
# Removing special characters before tokenization
def remove_characters_before_tokenization(sentence,keep_apostrophes=False):
  """Remove special characters from a raw sentence before it is tokenized.

  Leading and trailing whitespace is stripped first. With
  keep_apostrophes=True only the symbols listed in the character class are
  removed (apostrophes survive); otherwise every character that is not an
  ASCII letter, digit or space is removed.
  """
  if keep_apostrophes:
    # The '|' separators sit inside a character class, so they are literal:
    # pipe characters are removed along with the other listed symbols.
    unwanted = r'[?|$|&|*|%|@|(|)|~]' # add other characters here to remove them
  else:
    unwanted = r'[^a-zA-Z0-9 ]' # only extract alpha-numeric characters
  return re.sub(unwanted, r'', sentence.strip())

In [None]:
filtered_list_2 = [remove_characters_before_tokenization(sentence)
for sentence in my_corpus]
pprint(filtered_list_2)

In [None]:
cleaned_my_corpus=[remove_characters_before_tokenization(sentence,keep_apostrophes=True)
for sentence in my_corpus]
pprint(cleaned_my_corpus)

In [None]:
!pip install contractions

In [None]:
import contractions
import re

In [None]:
contraction_mapping=contractions.contractions_dict
print(contraction_mapping)

In [None]:
# Replacing contractions with expanded form
def expand_contractions(sentence, contraction_mapping):
  """Replace every contraction found in sentence with its expanded form.

  Parameters:
    sentence: the input string.
    contraction_mapping: dict mapping contraction -> expansion.

  Returns:
    The sentence with all matched contractions expanded. Matching is
    case-insensitive; the expansion is inserted exactly as stored in the
    mapping, so the original capitalisation of the contraction is not kept.
  """
  if not contraction_mapping:
    return sentence

  # Case-insensitive fallback table built from the mapping that was passed in
  # (not from a module-level global), keeping the first entry per lowercase key.
  lowercase_lookup = {}
  for key, value in contraction_mapping.items():
    lowercase_lookup.setdefault(key.lower(), value)

  # Longest keys first so that e.g. "I'd've" is tried before its prefix "I'd";
  # re.escape so that characters such as '.' in a key are matched literally.
  alternatives = sorted(contraction_mapping.keys(), key=len, reverse=True)
  contractions_pattern = re.compile(
      '({})'.format('|'.join(re.escape(key) for key in alternatives)),
      flags=re.IGNORECASE|re.DOTALL)

  def _expand(found):
    # Prefer an exact-case hit, then a case-insensitive one; if neither exists
    # keep the original text rather than returning None (which re.sub would
    # treat as an empty replacement, deleting the word).
    text = found.group(0)
    expansion = contraction_mapping.get(text)
    if expansion is None:
      expansion = lowercase_lookup.get(text.lower())
    return text if expansion is None else expansion

  expanded_sentence = contractions_pattern.sub(_expand, sentence)
  return expanded_sentence

# Matching the contraction in the sentence and reaplcing it with its expanded form
def expand_match(contraction):
  """Replacement callback for re.sub: return the expansion of a matched contraction.

  Parameters:
    contraction: the regex match object for one contraction.

  Returns:
    The expansion looked up in the module-level contraction_mapping, trying
    the exact matched text first and its lowercase form second. If neither
    is a key, the matched text is returned unchanged.
  """
  match = contraction.group(0)
  expanded_contraction = contraction_mapping.get(match)
  if expanded_contraction:
    return expanded_contraction
  expanded_contraction = contraction_mapping.get(match.lower())
  if expanded_contraction:
    return expanded_contraction
  # Returning None here would make re.sub substitute an empty string and
  # silently delete the word, so fall back to the original text instead.
  return match
# Demo: build the contraction-matching pattern at module level and apply it to one sentence.
# contraction_mapping is bound again to the dictionary shipped with the 'contractions' package.
contraction_mapping = contractions.contractions_dict  # Assuming 'contractions' is already imported
# NOTE(review): the input here is an empty string, so there is nothing to expand
# and the printed result is an empty string; put a real sentence here to see an effect.
sentence = ""
# Define contractions_pattern outside the function
# All dictionary keys are joined with '|' into a single alternation; the keys are not
# regex-escaped. IGNORECASE makes the match case-insensitive.
contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE | re.DOTALL)
# expand_match is called once per match and supplies the replacement text
expanded_sentence = contractions_pattern.sub(expand_match, sentence)
pprint(expanded_sentence)

In [None]:
expanded_my_corpus=[expand_contractions(sentence,contraction_mapping) for sentence in cleaned_my_corpus]
sentences = str(cleaned_my_corpus)
for contraction, expanded_form in contraction_mapping.items():
    sentences = sentences.replace(contraction, expanded_form)
# Print the expanded sentence
pprint(sentences)

In [None]:
# alternate definition of expand_match()
def expand_match(contraction):
    """Replacement callback for re.sub: return the expansion of a matched contraction.

    Parameters:
        contraction: the regex match object for one contraction.

    Returns:
        The expansion looked up in the module-level contraction_mapping,
        trying the exact matched text first and its lowercase form second.
        If neither is a key, the matched text is returned unchanged.
    """
    match = contraction.group(0)
    expanded_contraction = contraction_mapping.get(match)
    if expanded_contraction:
        return expanded_contraction
    expanded_contraction = contraction_mapping.get(match.lower())
    if expanded_contraction:
        return expanded_contraction
    # Returning None would make re.sub insert an empty string and delete the
    # word, so keep the original text when no expansion is known.
    return match

In [None]:
sentence="I wasn't getting the right code."
for contraction, expanded_form in contraction_mapping.items():
    sentence = sentence.replace(contraction, expanded_form)
# Print the expanded sentence
print(sentence)

In [None]:
expanded_my_corpus=[expand_contractions(sentence,contraction_mapping) for sentence in cleaned_my_corpus]
print(expanded_my_corpus)

# Case Conversion

In [None]:
my_corpus[0]

In [None]:
print(my_corpus[0].lower())

In [None]:
print(my_corpus[0].upper())

# Removing Stopwords
# Stopwords are words that have little or no significance.
# They are usually removed from text during processing so as to retain words having maximum significance and context.
# Stopwords are usually words that end up occurring the most if you aggregated any corpus of text based on singular tokens and checked their frequencies.
# Words like  a, the ,  me , and so on are stopwords.
# There is no universal or exhaustive list of stopwords.Each domain or language may have its own set of stopwords.

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
from nltk.tokenize.toktok import ToktokTokenizer

In [None]:
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
print(stopword_list[:10])

In [None]:
def remove_stopwords(tokens, is_lower_case=False):
  """Remove English stopwords from a token list or from a raw string.

  Parameters:
    tokens: either an iterable of word tokens, or a single string. A string
      is first split into tokens with the notebook's `tokenizer`
      (the ToktokTokenizer instance created in an earlier cell); without
      this, iterating the string would filter individual characters.
    is_lower_case: set True when the tokens are already lower-cased, to
      compare them as-is. By default each token is lower-cased for the
      comparison so that capitalised stopwords such as "The" are removed too.

  Returns:
    A list of the remaining tokens when given tokens, or the remaining
    tokens joined by single spaces when given a string.
  """
  # A set gives constant-time membership tests
  stopword_set = set(nltk.corpus.stopwords.words('english'))
  is_text = isinstance(tokens, str)
  if is_text:
    tokens = [token.strip() for token in tokenizer.tokenize(tokens)]
  if is_lower_case:
    filtered_tokens = [token for token in tokens if token not in stopword_set]
  else:
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]
  if is_text:
    return ' '.join(filtered_tokens)
  return filtered_tokens

In [None]:
remove_stopwords("The, and, if are stopwords, computer is not")

# Correcting Words

In [None]:
import nltk

In [None]:
nltk.download('wordnet')

In [None]:
from nltk.corpus import wordnet

In [None]:
wordnet.fileids()

In [None]:
# Correct a word with repeated characters (e.g. 'finalllyyy') by removing one
# repeated character at a time until WordNet recognises the word.
old_word='finalllyyy'
# (\w)\2 matches a character immediately followed by itself; the substitution
# keeps the surrounding text and only one copy of that character.
repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
match_substitution = r'\1\2\3'
step = 1
while True:
  # check for semantically correct word
  if wordnet.synsets(old_word):
    print('Final correct word: ',old_word)
    # stop only once a valid word has been found (break belongs inside the if;
    # at loop level it ended the loop on the very first iteration)
    break
  # remove one repeated character
  new_word = repeat_pattern.sub(match_substitution,old_word)
  if new_word != old_word:
    print('Step: {} Word: {}'.format(step, new_word))
    # update step and update old word to last substituted state
    step=step+1
    old_word = new_word
    continue
  else:
    # nothing left to remove and still not a known word: give up
    print('Final word: ', new_word)
    break