In [1]:
# Open the file in read mode
with open('sample.txt', 'r') as file:
    # Read the contents of the file into a string
    text = file.read()

# Print the contents of the string
print(text)

Text analytics, also known as text mining, is the process of deriving meaningful insights and patterns from unstructured text data. With the rise of the internet and social media, the amount of text data generated daily has skyrocketed, making text analytics an essential tool for businesses and organizations seeking to extract insights from this data.In this essay, we will explore the different techniques used in text analytics, the benefits of text analytics, and the challenges that come with implementing text analytics.Text analytics can be broadly divided into three main techniques: text classification, sentiment analysis, and topic modeling.Text classification involves categorizing text data into predefined categories. This can be useful for automating tasks such as spam detection, content filtering, and customer feedback analysis. For example, a company might use text classification to automatically route customer complaints to the appropriate department.Sentiment analysis, also k

In [23]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [9]:
# TOKENIZATION 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(text)


#STOP WORDS REMOVAL
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
filtered_sentence = []
for w in word_tokens:
	if w not in stop_words:
		filtered_sentence.append(w)

print(word_tokens)

['Text', 'analytics', ',', 'also', 'known', 'as', 'text', 'mining', ',', 'is', 'the', 'process', 'of', 'deriving', 'meaningful', 'insights', 'and', 'patterns', 'from', 'unstructured', 'text', 'data', '.', 'With', 'the', 'rise', 'of', 'the', 'internet', 'and', 'social', 'media', ',', 'the', 'amount', 'of', 'text', 'data', 'generated', 'daily', 'has', 'skyrocketed', ',', 'making', 'text', 'analytics', 'an', 'essential', 'tool', 'for', 'businesses', 'and', 'organizations', 'seeking', 'to', 'extract', 'insights', 'from', 'this', 'data.In', 'this', 'essay', ',', 'we', 'will', 'explore', 'the', 'different', 'techniques', 'used', 'in', 'text', 'analytics', ',', 'the', 'benefits', 'of', 'text', 'analytics', ',', 'and', 'the', 'challenges', 'that', 'come', 'with', 'implementing', 'text', 'analytics.Text', 'analytics', 'can', 'be', 'broadly', 'divided', 'into', 'three', 'main', 'techniques', ':', 'text', 'classification', ',', 'sentiment', 'analysis', ',', 'and', 'topic', 'modeling.Text', 'class

In [24]:
# POS TAGGING
pos_tagged = nltk.pos_tag(word_tokens)
print(pos_tagged)

[('Text', 'NN'), ('analytics', 'NNS'), (',', ','), ('also', 'RB'), ('known', 'VBN'), ('as', 'IN'), ('text', 'NN'), ('mining', 'NN'), (',', ','), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('deriving', 'VBG'), ('meaningful', 'JJ'), ('insights', 'NNS'), ('and', 'CC'), ('patterns', 'NNS'), ('from', 'IN'), ('unstructured', 'JJ'), ('text', 'NN'), ('data', 'NNS'), ('.', '.'), ('With', 'IN'), ('the', 'DT'), ('rise', 'NN'), ('of', 'IN'), ('the', 'DT'), ('internet', 'NN'), ('and', 'CC'), ('social', 'JJ'), ('media', 'NNS'), (',', ','), ('the', 'DT'), ('amount', 'NN'), ('of', 'IN'), ('text', 'NN'), ('data', 'NNS'), ('generated', 'VBD'), ('daily', 'RB'), ('has', 'VBZ'), ('skyrocketed', 'VBN'), (',', ','), ('making', 'VBG'), ('text', 'JJ'), ('analytics', 'NNS'), ('an', 'DT'), ('essential', 'JJ'), ('tool', 'NN'), ('for', 'IN'), ('businesses', 'NNS'), ('and', 'CC'), ('organizations', 'NNS'), ('seeking', 'VBG'), ('to', 'TO'), ('extract', 'VB'), ('insights', 'NNS'), ('from', 'IN'), 

In [10]:
print(filtered_sentence)

['Text', 'analytics', ',', 'also', 'known', 'text', 'mining', ',', 'process', 'deriving', 'meaningful', 'insights', 'patterns', 'unstructured', 'text', 'data', '.', 'With', 'rise', 'internet', 'social', 'media', ',', 'amount', 'text', 'data', 'generated', 'daily', 'skyrocketed', ',', 'making', 'text', 'analytics', 'essential', 'tool', 'businesses', 'organizations', 'seeking', 'extract', 'insights', 'data.In', 'essay', ',', 'explore', 'different', 'techniques', 'used', 'text', 'analytics', ',', 'benefits', 'text', 'analytics', ',', 'challenges', 'come', 'implementing', 'text', 'analytics.Text', 'analytics', 'broadly', 'divided', 'three', 'main', 'techniques', ':', 'text', 'classification', ',', 'sentiment', 'analysis', ',', 'topic', 'modeling.Text', 'classification', 'involves', 'categorizing', 'text', 'data', 'predefined', 'categories', '.', 'This', 'useful', 'automating', 'tasks', 'spam', 'detection', ',', 'content', 'filtering', ',', 'customer', 'feedback', 'analysis', '.', 'For', 'e

In [15]:
#STEMMING
from nltk.stem import PorterStemmer
ps = PorterStemmer()

stemmed = []

for w in filtered_sentence:
    stemmed.append(ps.stem(w))
    print(w, " : ", ps.stem(w))

Text  :  text
analytics  :  analyt
,  :  ,
also  :  also
known  :  known
text  :  text
mining  :  mine
,  :  ,
process  :  process
deriving  :  deriv
meaningful  :  meaning
insights  :  insight
patterns  :  pattern
unstructured  :  unstructur
text  :  text
data  :  data
.  :  .
With  :  with
rise  :  rise
internet  :  internet
social  :  social
media  :  media
,  :  ,
amount  :  amount
text  :  text
data  :  data
generated  :  gener
daily  :  daili
skyrocketed  :  skyrocket
,  :  ,
making  :  make
text  :  text
analytics  :  analyt
essential  :  essenti
tool  :  tool
businesses  :  busi
organizations  :  organ
seeking  :  seek
extract  :  extract
insights  :  insight
data.In  :  data.in
essay  :  essay
,  :  ,
explore  :  explor
different  :  differ
techniques  :  techniqu
used  :  use
text  :  text
analytics  :  analyt
,  :  ,
benefits  :  benefit
text  :  text
analytics  :  analyt
,  :  ,
challenges  :  challeng
come  :  come
implementing  :  implement
text  :  text
analytics.Text  :

In [25]:
#LEMMATIZATION
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

lemmatized = []

for w in filtered_sentence:
    tp = lemmatizer.lemmatize(w,pos="a")
    lemmatized.append(tp)
    print(w,": ",tp)


Text :  Text
analytics :  analytics
, :  ,
also :  also
known :  known
text :  text
mining :  mining
, :  ,
process :  process
deriving :  deriving
meaningful :  meaningful
insights :  insights
patterns :  patterns
unstructured :  unstructured
text :  text
data :  data
. :  .
With :  With
rise :  rise
internet :  internet
social :  social
media :  media
, :  ,
amount :  amount
text :  text
data :  data
generated :  generated
daily :  daily
skyrocketed :  skyrocketed
, :  ,
making :  making
text :  text
analytics :  analytics
essential :  essential
tool :  tool
businesses :  businesses
organizations :  organizations
seeking :  seeking
extract :  extract
insights :  insights
data.In :  data.In
essay :  essay
, :  ,
explore :  explore
different :  different
techniques :  techniques
used :  used
text :  text
analytics :  analytics
, :  ,
benefits :  benefits
text :  text
analytics :  analytics
, :  ,
challenges :  challenges
come :  come
implementing :  implementing
text :  text
analytics.

In [32]:
#TERM FREQUENCY
total_frequency = len(lemmatized)
freq_dict = {}

for word in lemmatized:
    # If the word is already in the dictionary, increment its count
    if word in freq_dict:
        freq_dict[word] += 1
    # Otherwise, add the word to the dictionary with a count of 1
    else:
        freq_dict[word] = 1
print(freq_dict)
for key,value in freq_dict.items():
    value = float(value)/float(total_frequency)
    freq_dict[key] = value
print(freq_dict)

{'Text': 1, 'analytics': 14, ',': 38, 'also': 3, 'known': 2, 'text': 23, 'mining': 2, 'process': 2, 'deriving': 1, 'meaningful': 1, 'insights': 5, 'patterns': 1, 'unstructured': 1, 'data': 9, '.': 17, 'With': 1, 'rise': 1, 'internet': 1, 'social': 5, 'media': 5, 'amount': 1, 'generated': 2, 'daily': 1, 'skyrocketed': 1, 'making': 1, 'essential': 1, 'tool': 2, 'businesses': 6, 'organizations': 5, 'seeking': 2, 'extract': 1, 'data.In': 1, 'essay': 1, 'explore': 1, 'different': 1, 'techniques': 2, 'used': 1, 'benefits': 3, 'challenges': 4, 'come': 2, 'implementing': 3, 'analytics.Text': 1, 'broadly': 1, 'divided': 1, 'three': 1, 'main': 2, ':': 1, 'classification': 3, 'sentiment': 4, 'analysis': 5, 'topic': 2, 'modeling.Text': 1, 'involves': 3, 'categorizing': 1, 'predefined': 1, 'categories': 1, 'This': 3, 'useful': 3, 'automating': 1, 'tasks': 1, 'spam': 1, 'detection': 1, 'content': 1, 'filtering': 1, 'customer': 9, 'feedback': 3, 'For': 5, 'example': 5, 'company': 4, 'might': 5, 'use'

In [37]:
#IDF INVERSE DOCUMENT FREQUENCY
# Split main para into 3 separate documents
import math

length = len(lemmatized)

# Calculate the approximate length of each substring
substring_length = length // 3

# Split the array into three substrings
substring1 = lemmatized[:substring_length]
substring2 = lemmatized[substring_length:2*substring_length]
substring3 = lemmatized[2*substring_length:]

idf_dict={}
N = 3

for word in lemmatized:
    count=0
    if word in substring1:
        count+=1
    if word in substring2:
        count+=1
    if word in substring3:
        count+=1
    tp = float(N/count)
    ans = math.log10(tp)
    if word in idf_dict:
        pass
    else:
        idf_dict[word]=ans


In [38]:
idf_dict

{'Text': 0.47712125471966244,
 'analytics': 0.0,
 ',': 0.0,
 'also': 0.17609125905568124,
 'known': 0.47712125471966244,
 'text': 0.0,
 'mining': 0.47712125471966244,
 'process': 0.17609125905568124,
 'deriving': 0.47712125471966244,
 'meaningful': 0.47712125471966244,
 'insights': 0.0,
 'patterns': 0.47712125471966244,
 'unstructured': 0.47712125471966244,
 'data': 0.0,
 '.': 0.0,
 'With': 0.47712125471966244,
 'rise': 0.47712125471966244,
 'internet': 0.47712125471966244,
 'social': 0.0,
 'media': 0.0,
 'amount': 0.47712125471966244,
 'generated': 0.17609125905568124,
 'daily': 0.47712125471966244,
 'skyrocketed': 0.47712125471966244,
 'making': 0.47712125471966244,
 'essential': 0.47712125471966244,
 'tool': 0.17609125905568124,
 'businesses': 0.0,
 'organizations': 0.0,
 'seeking': 0.17609125905568124,
 'extract': 0.47712125471966244,
 'data.In': 0.47712125471966244,
 'essay': 0.47712125471966244,
 'explore': 0.47712125471966244,
 'different': 0.47712125471966244,
 'techniques': 0.