## Text Analytics

1) Extract a sample document and apply the following document preprocessing methods: tokenization, POS tagging, stop-word removal, stemming, and lemmatization.
2) Create a representation of the document by calculating Term Frequency and Inverse Document Frequency.

In [72]:
# Import Libraries

import nltk
import string
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize

In [73]:
# Download the NLTK resources that the cells below rely on.
# Each call is a no-op (apart from a log line) when the package is already
# present locally.

nltk.download('stopwords')                       # word list behind stopwords.words('english')
nltk.download('wordnet')                         # dictionary data used by WordNetLemmatizer
nltk.download('punkt_tab')                       # tokenizer models used by word_tokenize
# nltk.pos_tag needs its tagger model as well; NLTK releases that ship
# 'punkt_tab' look the model up under this name. Without it the POS-tagging
# cell raises LookupError on a machine that has never fetched the model.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [74]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

### Read Text

In [75]:
# Read the whole sample document into memory. The context manager closes the
# file handle as soon as the read finishes, and `text` is only ever bound to
# the string contents (never to the file object).
with open('../Datasets/SampleText.txt', 'r') as sample_file:
    text = sample_file.read()

# NOTE(review): the printed text shows a literal "\n\n" between the two
# paragraphs, which later surfaces as the single token 'java.\\n\\npython'.
# Looks like the data file stores backslash-n characters instead of real line
# breaks -- confirm and clean the file if that is unintended.
print(text)

python is a high-level, interpreted programming language created by guido van rossum and first released in 1991. it is designed with an emphasis on code readability, and its syntax allows programmers to express concepts in fewer lines of code than would be possible in languages such as c++ or java.\n\npython supports multiple programming paradigms, including procedural, object-oriented, and functional programming. in simpler terms, this means its flexible and allows you to write code in different ways, whether that's like giving the computer a to-do list (procedural), creating digital models of things or concepts (object-oriented), or treating your code like a math problem (functional).


### Tokenization
- Breaking a text into smaller units, typically words or phrases, to facilitate further analysis.
- Word Tokenization: splits a piece of text into individual words based on a delimiter (this is what `word_tokenize` does below).
- Character Tokenization: splits a piece of text into individual characters.

In [76]:
# Split the raw document into word-level tokens and show them.
# Punctuation marks come out as tokens of their own.
tokens = nltk.word_tokenize(text)
print(tokens)

['python', 'is', 'a', 'high-level', ',', 'interpreted', 'programming', 'language', 'created', 'by', 'guido', 'van', 'rossum', 'and', 'first', 'released', 'in', '1991.', 'it', 'is', 'designed', 'with', 'an', 'emphasis', 'on', 'code', 'readability', ',', 'and', 'its', 'syntax', 'allows', 'programmers', 'to', 'express', 'concepts', 'in', 'fewer', 'lines', 'of', 'code', 'than', 'would', 'be', 'possible', 'in', 'languages', 'such', 'as', 'c++', 'or', 'java.\\n\\npython', 'supports', 'multiple', 'programming', 'paradigms', ',', 'including', 'procedural', ',', 'object-oriented', ',', 'and', 'functional', 'programming', '.', 'in', 'simpler', 'terms', ',', 'this', 'means', 'its', 'flexible', 'and', 'allows', 'you', 'to', 'write', 'code', 'in', 'different', 'ways', ',', 'whether', 'that', "'s", 'like', 'giving', 'the', 'computer', 'a', 'to-do', 'list', '(', 'procedural', ')', ',', 'creating', 'digital', 'models', 'of', 'things', 'or', 'concepts', '(', 'object-oriented', ')', ',', 'or', 'treating

### POS (Parts of Speech) Tagging
- Assigning grammatical categories (like noun, verb, adjective) to each word in a text based on its context and meaning.

In [77]:
# Label every token with its part of speech; the result is a list of
# (token, tag) pairs in the same order as `tokens`.
pos_tags = nltk.tag.pos_tag(tokens)
print(pos_tags)

[('python', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('high-level', 'JJ'), (',', ','), ('interpreted', 'JJ'), ('programming', 'NN'), ('language', 'NN'), ('created', 'VBN'), ('by', 'IN'), ('guido', 'NN'), ('van', 'NN'), ('rossum', 'NN'), ('and', 'CC'), ('first', 'JJ'), ('released', 'VBN'), ('in', 'IN'), ('1991.', 'CD'), ('it', 'PRP'), ('is', 'VBZ'), ('designed', 'VBN'), ('with', 'IN'), ('an', 'DT'), ('emphasis', 'NN'), ('on', 'IN'), ('code', 'NN'), ('readability', 'NN'), (',', ','), ('and', 'CC'), ('its', 'PRP$'), ('syntax', 'NN'), ('allows', 'VBZ'), ('programmers', 'NNS'), ('to', 'TO'), ('express', 'VB'), ('concepts', 'NNS'), ('in', 'IN'), ('fewer', 'JJR'), ('lines', 'NNS'), ('of', 'IN'), ('code', 'NN'), ('than', 'IN'), ('would', 'MD'), ('be', 'VB'), ('possible', 'JJ'), ('in', 'IN'), ('languages', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('c++', 'NN'), ('or', 'CC'), ('java.\\n\\npython', 'NN'), ('supports', 'NNS'), ('multiple', 'JJ'), ('programming', 'VBG'), ('paradigms', 'NN'), (',', ','), ('i

### Stop Words Removal
- Eliminating common words (like "the", "is", "and") from a text.
- These words don't carry significant meaning for analysis purposes.

In [78]:
# English stop words as a set, so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

# Keep a token (in its original casing) only when its lower-cased form is not
# a stop word.
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

print("Filtered Tokens:", filtered_tokens)

Filtered Tokens: ['python', 'high-level', ',', 'interpreted', 'programming', 'language', 'created', 'guido', 'van', 'rossum', 'first', 'released', '1991.', 'designed', 'emphasis', 'code', 'readability', ',', 'syntax', 'allows', 'programmers', 'express', 'concepts', 'fewer', 'lines', 'code', 'would', 'possible', 'languages', 'c++', 'java.\\n\\npython', 'supports', 'multiple', 'programming', 'paradigms', ',', 'including', 'procedural', ',', 'object-oriented', ',', 'functional', 'programming', '.', 'simpler', 'terms', ',', 'means', 'flexible', 'allows', 'write', 'code', 'different', 'ways', ',', 'whether', "'s", 'like', 'giving', 'computer', 'to-do', 'list', '(', 'procedural', ')', ',', 'creating', 'digital', 'models', 'things', 'concepts', '(', 'object-oriented', ')', ',', 'treating', 'code', 'like', 'math', 'problem', '(', 'functional', ')', '.']


In [79]:
# Punctuation Removal
# Drop every token that consists solely of punctuation characters.
#
# `token in string.punctuation` would be a *substring* test against one long
# string, so multi-character punctuation tokens such as '...', '``' or "''"
# would slip through. Checking character by character against a set handles
# punctuation tokens of any length, while tokens that merely contain
# punctuation ('c++', 'high-level', "'s") are still kept.
punctuation_chars = set(string.punctuation)

new_filtered_tokens = [
    token
    for token in filtered_tokens
    if not all(char in punctuation_chars for char in token)
]

print(new_filtered_tokens)

['python', 'high-level', 'interpreted', 'programming', 'language', 'created', 'guido', 'van', 'rossum', 'first', 'released', '1991.', 'designed', 'emphasis', 'code', 'readability', 'syntax', 'allows', 'programmers', 'express', 'concepts', 'fewer', 'lines', 'code', 'would', 'possible', 'languages', 'c++', 'java.\\n\\npython', 'supports', 'multiple', 'programming', 'paradigms', 'including', 'procedural', 'object-oriented', 'functional', 'programming', 'simpler', 'terms', 'means', 'flexible', 'allows', 'write', 'code', 'different', 'ways', 'whether', "'s", 'like', 'giving', 'computer', 'to-do', 'list', 'procedural', 'creating', 'digital', 'models', 'things', 'concepts', 'object-oriented', 'treating', 'code', 'like', 'math', 'problem', 'functional']


### Stemming
- Reducing words to their base or root form, typically by removing suffixes, to normalize variations of words.




In [80]:
# Reduce each cleaned token to its Porter stem. Stems are not guaranteed to be
# dictionary words (e.g. 'language' becomes 'languag').
porter = PorterStemmer()
stemmed_words = [porter.stem(token) for token in new_filtered_tokens]

print(stemmed_words)

['python', 'high-level', 'interpret', 'program', 'languag', 'creat', 'guido', 'van', 'rossum', 'first', 'releas', '1991.', 'design', 'emphasi', 'code', 'readabl', 'syntax', 'allow', 'programm', 'express', 'concept', 'fewer', 'line', 'code', 'would', 'possibl', 'languag', 'c++', 'java.\\n\\npython', 'support', 'multipl', 'program', 'paradigm', 'includ', 'procedur', 'object-ori', 'function', 'program', 'simpler', 'term', 'mean', 'flexibl', 'allow', 'write', 'code', 'differ', 'way', 'whether', "'s", 'like', 'give', 'comput', 'to-do', 'list', 'procedur', 'creat', 'digit', 'model', 'thing', 'concept', 'object-ori', 'treat', 'code', 'like', 'math', 'problem', 'function']


### Lemmatization
- Similar to stemming but aims to return the base or dictionary form of a word (lemma), considering its morphological variations.

In [81]:
# Map each cleaned token to its WordNet lemma. No part-of-speech argument is
# passed, so the lemmatizer's default POS is applied to every token.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(token) for token in new_filtered_tokens]

print(lemmatized_words)

['python', 'high-level', 'interpreted', 'programming', 'language', 'created', 'guido', 'van', 'rossum', 'first', 'released', '1991.', 'designed', 'emphasis', 'code', 'readability', 'syntax', 'allows', 'programmer', 'express', 'concept', 'fewer', 'line', 'code', 'would', 'possible', 'language', 'c++', 'java.\\n\\npython', 'support', 'multiple', 'programming', 'paradigm', 'including', 'procedural', 'object-oriented', 'functional', 'programming', 'simpler', 'term', 'mean', 'flexible', 'allows', 'write', 'code', 'different', 'way', 'whether', "'s", 'like', 'giving', 'computer', 'to-do', 'list', 'procedural', 'creating', 'digital', 'model', 'thing', 'concept', 'object-oriented', 'treating', 'code', 'like', 'math', 'problem', 'functional']


## Representation of Document

#### Term Frequency (TF)
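Measuring how frequently a term occurs in a document. The cell below reports the raw number of occurrences of each term; dividing each count by the total number of terms in the document gives the relative (normalised) term frequency.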

In [87]:
from nltk.probability import FreqDist

# Count how many times each cleaned token occurs in the document.
tf = FreqDist(new_filtered_tokens)
print("Term Frequency:", tf)

# Display Output: one "term: count" line per distinct token.
print("Term Frequency:")
for term, count in tf.items():
    print(term, count, sep=": ")


Term Frequency: <FreqDist with 56 samples and 67 outcomes>
Term Frequency:
python: 1
high-level: 1
interpreted: 1
programming: 3
language: 1
created: 1
guido: 1
van: 1
rossum: 1
first: 1
released: 1
1991.: 1
designed: 1
emphasis: 1
code: 4
readability: 1
syntax: 1
allows: 2
programmers: 1
express: 1
concepts: 2
fewer: 1
lines: 1
would: 1
possible: 1
languages: 1
c++: 1
java.\n\npython: 1
supports: 1
multiple: 1
paradigms: 1
including: 1
procedural: 2
object-oriented: 2
functional: 2
simpler: 1
terms: 1
means: 1
flexible: 1
write: 1
different: 1
ways: 1
whether: 1
's: 1
like: 2
giving: 1
computer: 1
to-do: 1
list: 1
creating: 1
digital: 1
models: 1
things: 1
treating: 1
math: 1
problem: 1


#### Inverse Document Frequency (IDF)
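Measuring the rarity or commonness of a term across all documents in a corpus. Note that the corpus used here contains a single document, so every term receives the same IDF and the TF-IDF score is simply a scaled term frequency; IDF only becomes informative with several documents.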

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The corpus holds just the one sample document.
corpus = [text]

# Tokenise with the same NLTK tokenizer (and the same casing) that produced
# the keys of `tf`, so that every TF term is guaranteed to be present in the
# vectorizer's vocabulary. The default scikit-learn token pattern would split
# or drop tokens such as 'high-level', 'c++' or "'s".
# token_pattern=None signals that the regex pattern is intentionally unused.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=word_tokenize,
    token_pattern=None,
    lowercase=False,
)
tfidf_vectorizer.fit(corpus)

idf = tfidf_vectorizer.idf_                 # one IDF weight per vocabulary entry
vocabulary = tfidf_vectorizer.vocabulary_   # term -> column index into `idf`

# TF-IDF = raw term count * IDF of that term.
# With a single-document corpus every term has document frequency 1, so the
# smoothed IDF is 1.0 for all terms and the score equals the raw count.
tfidf = {}

for word, freq in tf.items():
    tfidf_value = freq * idf[vocabulary[word]]
    tfidf[word] = float(tfidf_value)

print("TF-IDF:", tfidf)

TF-IDF: {'python': 1.0, 'high-level': 1.0, 'interpreted': 1.0, 'programming': 3.0, 'language': 1.0, 'created': 1.0, 'guido': 1.0, 'van': 1.0, 'rossum': 1.0, 'first': 1.0, 'released': 1.0, '1991.': 1.0, 'designed': 1.0, 'emphasis': 1.0, 'code': 4.0, 'readability': 1.0, 'syntax': 1.0, 'allows': 2.0, 'programmers': 1.0, 'express': 1.0, 'concepts': 2.0, 'fewer': 1.0, 'lines': 1.0, 'would': 1.0, 'possible': 1.0, 'languages': 1.0, 'c++': 1.0, 'java.\\n\\npython': 1.0, 'supports': 1.0, 'multiple': 1.0, 'paradigms': 1.0, 'including': 1.0, 'procedural': 2.0, 'object-oriented': 2.0, 'functional': 2.0, 'simpler': 1.0, 'terms': 1.0, 'means': 1.0, 'flexible': 1.0, 'write': 1.0, 'different': 1.0, 'ways': 1.0, 'whether': 1.0, "'s": 1.0, 'like': 2.0, 'giving': 1.0, 'computer': 1.0, 'to-do': 1.0, 'list': 1.0, 'creating': 1.0, 'digital': 1.0, 'models': 1.0, 'things': 1.0, 'treating': 1.0, 'math': 1.0, 'problem': 1.0}
