In [1]:
from urllib import request 
import string
import re

import pandas as pd

from bs4 import BeautifulSoup
from nltk.corpus import stopwords

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Tasks to do here:
#   - Lowercase the text data in a pandas DataFrame.
#   - Remove stop words.
#   - Remove punctuation.

In [3]:
# Sample corpus: eight short sentences. The strings (typos and the trailing
# space in the second item included) are kept verbatim as raw input data.
text = [
    'This is introduction to NLP',
    'It is likely to be useful, to people ',
    'Machine learning is the new electrcity',
    'There would be less hype around AI and more action going forward',
    'python is the best tool!',
    'R is good langauage',
    'I like this book',
    'I want more books like this',
]

In [4]:
# Wrap the sentences in a one-column DataFrame; the column is named 'tweet'.
data = pd.DataFrame(text, columns=['tweet'])
print(data)

                                               tweet
0                        This is introduction to NLP
1              It is likely to be useful, to people 
2             Machine learning is the new electrcity
3  There would be less hype around AI and more ac...
4                           python is the best tool!
5                                R is good langauage
6                                   I like this book
7                        I want more books like this


In [5]:
# Lower-case every tweet. Splitting on whitespace and re-joining with a single
# space also collapses repeated spaces and trims leading/trailing whitespace.
data['tweet'] = data['tweet'].apply(lambda tweet: " ".join(tweet.lower().split()))
print(data)

                                               tweet
0                        this is introduction to nlp
1               it is likely to be useful, to people
2             machine learning is the new electrcity
3  there would be less hype around ai and more ac...
4                           python is the best tool!
5                                r is good langauage
6                                   i like this book
7                        i want more books like this


In [6]:
# Remove punctuation: delete every character listed in string.punctuation
# in a single pass using a translation table.
sentence = "John has been selected for the trial phase this time. Congrats!!"
sentence = sentence.translate(str.maketrans('', '', string.punctuation))
print(sentence)

John has been selected for the trial phase this time Congrats


In [7]:
# Same clean-up with a regular expression: drop every character that is
# neither a word character (\w) nor whitespace (\s).
sentence_new = re.compile(r'[^\w\s]').sub('', sentence)
sentence_new

'John has been selected for the trial phase this time Congrats'

In [8]:
# Strip punctuation from every tweet: remove each character that is neither a
# word character (\w) nor whitespace (\s).
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave all punctuation in place.
# The raw string avoids the invalid "\w" / "\s" escape-sequence warning.
data['tweet'] = data['tweet'].str.replace(r'[^\w\s]', '', regex=True)
print(data)

                                               tweet
0                        this is introduction to nlp
1                it is likely to be useful to people
2             machine learning is the new electrcity
3  there would be less hype around ai and more ac...
4                            python is the best tool
5                                r is good langauage
6                                   i like this book
7                        i want more books like this


In [9]:
# Remove stop words from data :
# `stop` keeps the list returned by NLTK; `stop_set` is a set built from it so
# that the per-token membership test does not scan the whole list each time.
stop = stopwords.words('english')
stop_set = set(stop)
data['tweet'] = data['tweet'].apply(
    lambda tweet: " ".join(word for word in tweet.split() if word not in stop_set)
)
print(data)

                                            tweet
0                                introduction nlp
1                            likely useful people
2                 machine learning new electrcity
3  would less hype around ai action going forward
4                                python best tool
5                                r good langauage
6                                       like book
7                                 want books like


## Text Search Using Regular Expressions

Regular expressions are a very useful means of searching for a particular
pattern or set of words in a given text. A regular expression (RE)
specifies a set of strings that match it.

In [10]:
# Search a lower-cased copy of the sentence for any of the candidate words.
sentence = 'I attended a Very nice lecture last year'
words = ['very', 'nice', 'lecture', 'day', 'moon']
# Alternation pattern: the candidates joined with "|".
expression = "|".join(words)
pattern = re.compile(expression, re.MULTILINE)
pattern.findall(sentence.lower())

['very', 'nice', 'lecture']

## Text to List
You can read a text file and convert it into a list of words or list of sentences,
according to your needs.

In [11]:
PATH = "data/data.txt"

# Method 1: read the whole file, then split it on whitespace into tokens.
with open(PATH) as f:
    contents = f.read()
words = contents.split()
print(words)

['Are', 'you', 'sure', 'moving', 'ahead', 'on', 'this', 'route', 'is', 'the', 'right', 'thing', 'for', 'you', '?']


In [12]:
# Method-2 : one list element per line of the file (a file holding a single
# line therefore yields a single-element list).
# The `with` block closes the file handle as soon as the lines are read.
with open(PATH) as f:
    words_ = f.readlines()
print(words_)

['Are you sure moving ahead on this route is the right thing for you ?']


Common text-manipulation operations include replacing one word with another, removing or adding some specific type of words, etc.

In [13]:
# Example sentence, lower-cased in the same statement.
sentence = 'John has been selected for the trail phase this time. Congrats!!'.lower()

In [14]:
# Remove punctuation by keeping only the characters that are not listed in
# string.punctuation.
# A single known symbol could also be dropped directly:
# >>> sentence.replace('!', '')

sentence = "".join(ch for ch in sentence if ch not in string.punctuation)
sentence

'john has been selected for the trail phase this time congrats'

In [15]:
# Hand-written sentiment lexicons.
positive_words = ['awesome', 'good', 'nice', 'super', 'fun', 'delightful','congrats']
negative_words = ['awful', 'lame', 'horrible', 'bad']

# Tokens of the sentence that are NOT in the positive lexicon.
words = sentence.split()
result = set(words).difference(positive_words)
print(result)

{'has', 'this', 'the', 'time', 'phase', 'trail', 'been', 'for', 'selected', 'john'}


In [16]:
# Tokens of the sentence that appear in the positive lexicon.
res = set(words) & set(positive_words)
res

{'congrats'}

In [17]:
# Tokens of the sentence that appear in the negative lexicon.
res = set(words) & set(negative_words)
res

set()

## Accessing text from web :

In [18]:
# Address of the HTML page that is fetched and parsed below.
URL = 'https://www.gutenberg.org/files/67446/67446-h/67446-h.htm'

In [19]:
# Download the page and parse it.
# The context manager closes the HTTP response even if the read fails, and the
# timeout keeps the cell from hanging indefinitely on an unresponsive server.
with request.urlopen(URL, timeout=30) as response:
    html_doc = response.read()
soup = BeautifulSoup(html_doc, 'html.parser')
# Re-indented HTML source of the parsed document.
# NOTE(review): this rebinds `text`, which an earlier cell uses for the list of
# sample sentences — consider a distinct name if both are needed.
text = soup.prettify()

In [20]:
# Show the string of the first <title> element, then of the first <a> element.
for tag in (soup.title, soup.a):
    print(tag.string)


      The Project Gutenberg eBook of Modern Whaling & Bear-Hunting, by W. G. Burn Murdoch.
    
www.gutenberg.org


## Extracting all text of a particular tag :

In [21]:
# Print the text content of every <p> element, one element per print call.
for x in soup.find_all('p'):
    print(x.get_text())

The Project Gutenberg eBook of Modern Whaling & Bear-Hunting, by W. G. Burn Murdoch
Title: Modern Whaling & Bear-Hunting
A record of present-day whaling with up-to-date appliances in many parts of the world, and of bear and seal hunting in the arctic regions
Author: W. G. Burn Murdoch
Release Date: February 19, 2022 [eBook #67446]
Language: English
Produced by: deaurider and the Online Distributed Proofreading Team at https://www.pgdp.net (This file was produced from images generously made available by The Internet Archive)
Lancing a Whale.
An eighteen-foot spear is the lance—half iron half wood. The pram is
swung out; and Jensen is handed the lance. We reach the whale and Jensen
makes a lunge, and the spear goes in five feet and is twisted out of his
hand; the vast body rolls over, the tail rises up and up and comes down
in a sea of foam.
[1]
MODERN WHALING
&
BEAR-HUNTING
A RECORD OF PRESENT-DAY WHALING WITH
UP-TO-DATE APPLIANCES IN MANY PARTS
OF THE WORLD, AND OF BEAR
AND SEAL HUNTIN

In [4]:
# A two-line string written with triple double quotes; the line break inside
# the literal is part of the string, and the apostrophe needs no escaping.
couplet = """Rough winds do shake the darling buds of May,
And Summer's lease hath all too short a date"""
print(couplet)

Rough winds do shake the darling buds of May,
And Summer's lease hath all too short a date


In [5]:
# A two-line string written with triple single quotes; the apostrophe in
# "Summer's" does not end the literal, so it needs no escaping.
# NOTE(review): this version has no comma after "May" — confirm that is intended.
couplet = '''Rough winds do shake the darling buds of May
And Summer's lease hath all too short a date'''
print(couplet)

Rough winds do shake the darling buds of May
And Summer's lease hath all too short a date
