# Experiment - 7: Perform the steps involved in Text Analytics in Python & R

**Lab Outcomes (LO):**
Design a Text Analytics application on a given data set. (LO4)


**Tokenization**


In [5]:
import nltk

# word_tokenize() relies on the Punkt sentence-tokenizer data. Recent NLTK
# releases (3.9 and later) load it from the `punkt_tab` package, while older
# releases load the `punkt` package. Fetch both so the tokenization cells work
# with whichever NLTK version is installed.
nltk.download('punkt_tab')
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# NOTE(review): no cell visible in this notebook imports `tokenizers`; every
# tokenization step uses nltk's word_tokenize. This install looks unnecessary --
# confirm before removing.
!pip install tokenizers



In [None]:
# No `pip install nltk` here: the first cell already imports nltk, so by the time
# this cell runs the package is necessarily installed and a second install is a
# no-op. Any install step belongs in its own cell before the first import.
import nltk
from nltk.tokenize import word_tokenize

# Sample sentence for the tokenization demo. `tokens` is reused by the
# frequency-distribution cell further down.
text = "Tokenization is the process of breaking down text into smaller units called tokens."

# Split the sentence into word and punctuation tokens.
tokens = word_tokenize(text)

# Show the resulting token list.
print(tokens)

['Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'text', 'into', 'smaller', 'units', 'called', 'tokens', '.']


**Frequency Distribution**

In [13]:
from nltk import FreqDist


In [14]:
# Count how often each token produced by the tokenization cell occurs.
freq_dist = FreqDist(tokens)

# most_common() without an argument returns every (token, count) pair.
token_counts = freq_dist.most_common()
print(token_counts)

[('Tokenization', 1), ('is', 1), ('the', 1), ('process', 1), ('of', 1), ('breaking', 1), ('down', 1), ('text', 1), ('into', 1), ('smaller', 1), ('units', 1), ('called', 1), ('tokens', 1), ('.', 1)]


**Remove stopwords & punctuations**

In [19]:
# Imports for the stop-word and punctuation filtering step.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# Fetch the stop-word lists read by stopwords.words('english') in the filtering cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [20]:
text = "Tokenization is the process of breaking down text into smaller units called tokens,It is important in natural language processing."

# Tokenize the text
tokens = word_tokenize(text)

# Remove stopwords (compared in lower case against NLTK's English list)
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Remove punctuation.
# `word not in string.punctuation` is a substring test against one long string,
# so it only drops tokens that appear contiguously inside it. Multi-character
# punctuation tokens such as '...', '--' or the `` and '' quote tokens emitted by
# word_tokenize would slip through. Drop every token made up solely of
# punctuation characters instead; single-character tokens behave as before.
punctuation_chars = set(string.punctuation)
filtered_tokens = [
    word for word in filtered_tokens
    if not all(char in punctuation_chars for char in word)
]

# Print filtered tokens
print(filtered_tokens)

['Tokenization', 'process', 'breaking', 'text', 'smaller', 'units', 'called', 'tokens', 'important', 'natural', 'language', 'processing']


**Lexicon Normalization (Stemming, Lemmatization)**

In [21]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Three surface forms built on "run", to show what the Porter stemmer does to each.
text = "running runner runs"
tokens = word_tokenize(text)

# Stem every token, preserving order.
stemmer = PorterStemmer()
stems = list(map(stemmer.stem, tokens))
print(stems)

['run', 'runner', 'run']


In [25]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# The lemmatizer looks words up in WordNet, so make sure the corpus is available.
nltk.download('wordnet')

# Sentence to lemmatize, split into tokens.
text = "The cats are running and jumping on the beds."
tokens = word_tokenize(text)

# Lemmatize token by token. No part-of-speech argument is passed, so every token
# is handled with lemmatize()'s default POS setting.
lemmatizer = WordNetLemmatizer()
lemmas = list(map(lemmatizer.lemmatize, tokens))

# Show the lemmatized tokens.
print(lemmas)

['The', 'cat', 'are', 'running', 'and', 'jumping', 'on', 'the', 'bed', '.']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Part of Speech tagging**

In [30]:
import nltk
from nltk.tokenize import word_tokenize

# nltk.pos_tag needs the averaged-perceptron tagger model. Download it here,
# before the first pos_tag call, so the cell works when the notebook is run top
# to bottom on a fresh kernel. Older NLTK releases load
# `averaged_perceptron_tagger`; NLTK 3.9+ loads `averaged_perceptron_tagger_eng`,
# so request both.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Define the input explicitly instead of relying on whatever `text` an earlier
# cell happened to leave in the kernel.
text = "The cats are running and jumping on the beds."
tokens = word_tokenize(text)

# Perform POS tagging
pos_tags = nltk.pos_tag(tokens)

# Print POS tagged tokens
print(pos_tags)

[('The', 'DT'), ('cats', 'NNS'), ('are', 'VBP'), ('running', 'VBG'), ('and', 'CC'), ('jumping', 'VBG'), ('on', 'IN'), ('the', 'DT'), ('beds', 'NNS'), ('.', '.')]


In [29]:
# Fetch the averaged-perceptron tagger model used by nltk.pos_tag.
# NOTE(review): this cell sits below the first nltk.pos_tag call in the notebook
# and carries an earlier execution count, so it was evidently run out of order.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

**Named Entity Recognition**

In [34]:
# Resources for nltk.ne_chunk: the pre-trained MaxEnt named-entity chunker and the
# `words` corpus. NLTK 3.9+ loads the chunker from `maxent_ne_chunker_tab`, while
# older releases load `maxent_ne_chunker`, so request both.
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [38]:
import nltk
from nltk.tokenize import word_tokenize

# Sentence containing a few candidate named entities.
text = "Apple is headquartered in Cupertino, California."

# Tokenize, POS-tag, then chunk the tagged tokens into named entities.
tokens = word_tokenize(text)
ne_tags = nltk.pos_tag(tokens)  # (word, POS tag) pairs fed to the chunker
ne_chunks = nltk.ne_chunk(ne_tags)

# Only entity subtrees expose label(); plain (word, tag) tuples are skipped.
for node in ne_chunks:
    if not hasattr(node, 'label'):
        continue
    entity_text = ' '.join(leaf[0] for leaf in node)
    print(node.label(), entity_text)

GPE Apple
GPE Cupertino
GPE California


**Scrape data from a website**

In [40]:
import requests
from bs4 import BeautifulSoup

# URL of the website to scrape
url = "https://en.wikipedia.org/wiki/Main_Page"

# Send a GET request to the URL. requests applies no timeout by default, so an
# unresponsive server would hang the cell indefinitely; give up after 10 seconds.
response = requests.get(url, timeout=10)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the webpage
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find and extract the desired data from the webpage
    # For example, if you want to scrape all the links on the webpage:
    links = soup.find_all('a')

    # Print the extracted links
    for link in links:
        href = link.get('href')
        # Anchors without an href attribute yield None; skip them rather than
        # printing the literal text "None".
        if href is not None:
            print(href)
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)


#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Main_Page
/wiki/Special:Search
/w/index.php?title=Special:CreateAccount&returnto=Main+Page
/w/index.php?title=Special:UserLogin&returnto=Main+Page
/w/index.php?title=Special:CreateAccount&returnto=Main+Page
/w/index.php?title=Special:UserLogin&returnto=Main+Page
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
/wiki/Main_Page
/wiki/Talk:Main_Page
/wiki/Main_Page
/w/index.php?title=Main_Page&action=edit
/w/index.php?title=Main_Page&action=history
/wiki/Main_Page
/w/index.php?title=Main_Page&action=edit
/w/ind

# Conclusion: We have successfully performed text analysis.