In [None]:
# PS: Text Analytics
# 1. Extract a sample document and apply the following document preprocessing
#    methods: tokenization, POS tagging, stop-word removal, stemming and
#    lemmatization.
# 2. Create a representation of the document by calculating Term Frequency and
#    Inverse Document Frequency.

In [5]:
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK data packages this notebook relies on: the 'punkt'
# tokenizer models, the averaged-perceptron POS tagger, the stop-word lists
# and the WordNet corpus.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab') — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

ModuleNotFoundError: No module named 'spacy'

In [2]:
# Sample document used by every preprocessing step below.
# Adjacent string literals are joined by the parser, so this is one string.
text = (
    "Tokenization is the first step in text analytics. "
    "The process of breaking down a text paragraph into smaller chunks "
    "such as words or sentences is called Tokenization."
)
print(text)

Tokenization is the first step in text analytics. The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.


In [None]:
# Step 1 - Tokenization: split the sample text into a flat list of tokens.
tokens = nltk.word_tokenize(text)
print("\nTokenization:\n", tokens, sep=" ")

In [7]:
# Step 2 - POS tagging: pair every token with its part-of-speech tag.
pos_tags = nltk.pos_tag(tokens)
print("\nPOS Tagging:\n", pos_tags, sep=" ")


POS Tagging:
 [('Tokenization', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('first', 'JJ'), ('step', 'NN'), ('in', 'IN'), ('text', 'JJ'), ('analytics', 'NNS'), ('.', '.'), ('The', 'DT'), ('process', 'NN'), ('of', 'IN'), ('breaking', 'VBG'), ('down', 'RP'), ('a', 'DT'), ('text', 'NN'), ('paragraph', 'NN'), ('into', 'IN'), ('smaller', 'JJR'), ('chunks', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('words', 'NNS'), ('or', 'CC'), ('sentences', 'NNS'), ('is', 'VBZ'), ('called', 'VBN'), ('Tokenization', 'NN'), ('.', '.')]


In [8]:
# Step 3 - Stop-word removal.
# The comparison is done on the lower-cased token, but the token itself is
# kept in its original casing.
stop_words = set(stopwords.words('english'))
tokens_without_sw = list(
    filter(lambda tok: tok.lower() not in stop_words, tokens)
)
print("\nTokens after Stop Words Removal:\n", tokens_without_sw)


Tokens after Stop Words Removal:
 ['Tokenization', 'first', 'step', 'text', 'analytics', '.', 'process', 'breaking', 'text', 'paragraph', 'smaller', 'chunks', 'words', 'sentences', 'called', 'Tokenization', '.']


In [9]:
# Step 4 - Stemming: reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, tokens_without_sw))
print("\nStemming:\n", stemmed_tokens)


Stemming:
 ['token', 'first', 'step', 'text', 'analyt', '.', 'process', 'break', 'text', 'paragraph', 'smaller', 'chunk', 'word', 'sentenc', 'call', 'token', '.']


In [21]:
# Step 5 - Lemmatization with WordNet.
# pos='v' makes the lemmatizer treat every token as a verb, regardless of the
# tag assigned to it in the POS-tagging step.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(
    map(lambda tok: lemmatizer.lemmatize(tok, pos='v'), tokens_without_sw)
)
print("\nLemmatization:\n", lemmatized_tokens)


Lemmatization:
 ['Tokenization', 'first', 'step', 'text', 'analytics', '.', 'process', 'break', 'text', 'paragraph', 'smaller', 'chunk', 'word', 'sentence', 'call', 'Tokenization', '.']


In [None]:
# 6. TF-IDF Representation : Appropriate (scikit-learn)
# The corpus holds a single document, so every term has the same document
# frequency; the scores printed below therefore differ only by how often a
# term occurs in the text.
documents = [text]

# Learn the vocabulary and weight the document in one step.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Vocabulary terms, in the same order as the matrix columns.
terms = vectorizer.get_feature_names_out()

# Dense copy of the matrix so rows can be iterated directly.
tfidf_array = tfidf_matrix.toarray()

print("\nTF-IDF Representation:\n")
# Walk the terms and the first (only) row of scores in lockstep.
for term, score in zip(terms, tfidf_array[0]):
    print("Term:", term, "TF-IDF:", round(score, 4))


TF-IDF Representation:

Term: analytics TF-IDF: 0.1715
Term: as TF-IDF: 0.1715
Term: breaking TF-IDF: 0.1715
Term: called TF-IDF: 0.1715
Term: chunks TF-IDF: 0.1715
Term: down TF-IDF: 0.1715
Term: first TF-IDF: 0.1715
Term: in TF-IDF: 0.1715
Term: into TF-IDF: 0.1715
Term: is TF-IDF: 0.343
Term: of TF-IDF: 0.1715
Term: or TF-IDF: 0.1715
Term: paragraph TF-IDF: 0.1715
Term: process TF-IDF: 0.1715
Term: sentences TF-IDF: 0.1715
Term: smaller TF-IDF: 0.1715
Term: step TF-IDF: 0.1715
Term: such TF-IDF: 0.1715
Term: text TF-IDF: 0.343
Term: the TF-IDF: 0.343
Term: tokenization TF-IDF: 0.343
Term: words TF-IDF: 0.1715


In [None]:
# 6. TF-IDF Representation : Approx
def calculate_tf_idf(documents, tokenizer=None):
    """Compute approximate TF-IDF scores for the first document of a corpus.

    TF is the raw count of each token in ``documents[0]``. IDF is the plain
    ratio ``len(documents) / document_frequency`` (no logarithm, no
    smoothing), so the numbers are not comparable with ``TfidfVectorizer``.

    Document frequency is counted on token sets, not with a substring test on
    the raw text. A substring test would count the token ``'a'`` as present in
    any document containing the letter "a", and would divide by zero whenever
    the tokenizer rewrites a token so that it no longer appears verbatim in
    the text.

    Args:
        documents: Sequence of raw text strings. The first one is scored; all
            of them contribute to the document frequencies.
        tokenizer: Optional callable turning a string into a list of tokens.
            Defaults to NLTK's ``word_tokenize``.

    Returns:
        Dict mapping each distinct token of ``documents[0]`` (in first-seen
        order) to its TF-IDF score. Empty when ``documents`` is empty.
    """
    from collections import Counter

    if not documents:
        return {}
    if tokenizer is None:
        tokenizer = word_tokenize

    # Tokenize every document exactly once; sets give O(1) membership checks.
    tokenized_docs = [tokenizer(doc) for doc in documents]
    vocab_per_doc = [set(doc_tokens) for doc_tokens in tokenized_docs]

    # Term Frequency (TF): raw counts within the first document.
    tf = Counter(tokenized_docs[0])

    # Inverse Document Frequency (IDF) and the final product, per distinct term.
    n_documents = len(documents)
    tf_idf = {}
    for word, count in tf.items():
        # Always >= 1 because the word comes from the first document's tokens.
        doc_freq = sum(1 for vocab in vocab_per_doc if word in vocab)
        tf_idf[word] = count * (n_documents / doc_freq)

    return tf_idf

# Score the sample text with the hand-written function and list every term.
documents = [text]
tf_idf = calculate_tf_idf(documents)
print("\nCustom TF-IDF Representation:\n")
for term, value in tf_idf.items():
    print(f"Term: {term}, TF-IDF: {value:.4f}")




Custom TF-IDF Representation:

Term: Tokenization, TF-IDF: 2.0000
Term: is, TF-IDF: 2.0000
Term: the, TF-IDF: 1.0000
Term: first, TF-IDF: 1.0000
Term: step, TF-IDF: 1.0000
Term: in, TF-IDF: 1.0000
Term: text, TF-IDF: 2.0000
Term: analytics, TF-IDF: 1.0000
Term: ., TF-IDF: 2.0000
Term: The, TF-IDF: 1.0000
Term: process, TF-IDF: 1.0000
Term: of, TF-IDF: 1.0000
Term: breaking, TF-IDF: 1.0000
Term: down, TF-IDF: 1.0000
Term: a, TF-IDF: 1.0000
Term: paragraph, TF-IDF: 1.0000
Term: into, TF-IDF: 1.0000
Term: smaller, TF-IDF: 1.0000
Term: chunks, TF-IDF: 1.0000
Term: such, TF-IDF: 1.0000
Term: as, TF-IDF: 1.0000
Term: words, TF-IDF: 1.0000
Term: or, TF-IDF: 1.0000
Term: sentences, TF-IDF: 1.0000
Term: called, TF-IDF: 1.0000
