In [21]:
import random
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import numpy as np
from collections import Counter

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# ***Generate a document***

In [22]:
def generate_document(phrase, num_sentences=5):
    """Build a synthetic document by repeating one phrase as sentences.

    Args:
        phrase: Sentence text without the terminating period.
        num_sentences: How many times the sentence is repeated.

    Returns:
        A string made of ``num_sentences`` copies of ``phrase + ". "``
        (so a non-empty result ends with a trailing space).
    """
    return "".join(phrase + ". " for _ in range(num_sentences))

# Seed phrase for each field/topic; every generated document of a field
# is built from that field's phrase.
phrases = {
    "technology": "Artificial intelligence is revolutionizing the tech industry",
    "finance": "The stock market is influenced by various factors including economic indicators",
    "healthcare": "Advancements in medical technology have improved patient care",
}

# How many documents to create per field
num_documents_per_field = 3

# Map each field name to its list of generated documents
documents = {
    field: [generate_document(phrase) for _ in range(num_documents_per_field)]
    for field, phrase in phrases.items()
}

# ***Data processing steps***

In [23]:
def process_data(document):
    """Clean, tokenize, stop-word-filter and stem a raw text document.

    Steps, in order:
      1. Remove every character that is neither a word character nor
         whitespace (punctuation and symbols).
      2. Lowercase the text.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Drop English stop words.
      5. Stem the remaining tokens with the Porter stemmer.

    Args:
        document: Raw text of one document.

    Returns:
        List of stemmed, lowercase tokens with stop words removed.
    """
    # Clean data by removing punctuation and symbols
    document = re.sub(r'[^\w\s]', '', document)
    # Normalize data by converting to lowercase
    document = document.lower()
    # Tokenization: split the data into words
    tokens = word_tokenize(document)

    # Remove stop words BEFORE stemming: the NLTK stop-word list contains
    # unstemmed forms, so a stemmed stop word (e.g. "this" -> "thi") would no
    # longer match the list and would leak into the result.
    stop_words = set(stopwords.words('english'))
    content_words = [word for word in tokens if word not in stop_words]

    # Reduce the remaining words to their Porter stems
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in content_words]

# ***Get unique words***



In [24]:
# Concatenate the documents of every field into one flat corpus
all_docs = [doc for docs in documents.values() for doc in docs]

# Process documents (clean, tokenize, remove stop words, stem)
processed_docs = [process_data(doc) for doc in all_docs]

# Join processed docs into strings for TfidfVectorizer
doc_strings = [' '.join(doc) for doc in processed_docs]

# Fit a vectorizer in this cell so the vocabulary is available here; the
# original relied on a `tfidf_vectorizer` created in a later cell, which
# raises NameError when the notebook is run top to bottom on a fresh kernel.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(doc_strings)

# Get feature names (the unique words of the corpus, sorted alphabetically)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Print the unique words
print("Unique Words:", feature_names)

Unique Words: ['advanc' 'artifici' 'care' 'econom' 'factor' 'improv' 'includ' 'indic'
 'industri' 'influenc' 'intellig' 'market' 'medic' 'patient' 'revolution'
 'stock' 'tech' 'technolog' 'variou']


# ***Calculate TF-IDF using Scikit-learn***

In [28]:

# Fit scikit-learn's TfidfVectorizer on the pre-processed documents and
# convert the resulting matrix to a dense array in the same step, so
# `tfidf_sklearn` is only ever bound to the array.
tfidf_vectorizer = TfidfVectorizer()
tfidf_sklearn = tfidf_vectorizer.fit_transform(doc_strings).toarray()

# Show the scikit-learn result rounded to two decimals
print("\nTF-IDF from scikit-learn:")
print(np.round(tfidf_sklearn, 2))


TF-IDF from scikit-learn:
[[0.   0.45 0.   0.   0.   0.   0.   0.   0.45 0.   0.45 0.   0.   0.
  0.45 0.   0.45 0.   0.  ]
 [0.   0.45 0.   0.   0.   0.   0.   0.   0.45 0.   0.45 0.   0.   0.
  0.45 0.   0.45 0.   0.  ]
 [0.   0.45 0.   0.   0.   0.   0.   0.   0.45 0.   0.45 0.   0.   0.
  0.45 0.   0.45 0.   0.  ]
 [0.   0.   0.   0.35 0.35 0.   0.35 0.35 0.   0.35 0.   0.35 0.   0.
  0.   0.35 0.   0.   0.35]
 [0.   0.   0.   0.35 0.35 0.   0.35 0.35 0.   0.35 0.   0.35 0.   0.
  0.   0.35 0.   0.   0.35]
 [0.   0.   0.   0.35 0.35 0.   0.35 0.35 0.   0.35 0.   0.35 0.   0.
  0.   0.35 0.   0.   0.35]
 [0.41 0.   0.41 0.   0.   0.41 0.   0.   0.   0.   0.   0.   0.41 0.41
  0.   0.   0.   0.41 0.  ]
 [0.41 0.   0.41 0.   0.   0.41 0.   0.   0.   0.   0.   0.   0.41 0.41
  0.   0.   0.   0.41 0.  ]
 [0.41 0.   0.41 0.   0.   0.41 0.   0.   0.   0.   0.   0.   0.41 0.41
  0.   0.   0.   0.41 0.  ]]


# **Calculate TF-IDF from Scratch**


## *1. Calculate term frequency (TF)*




In [None]:
def calculate_tf(document):
    """Compute the relative term frequency of every token in a document.

    Args:
        document: Sequence of tokens belonging to one document.

    Returns:
        Dict mapping each distinct token to ``count / len(document)``.
        An empty document yields an empty dict.
    """
    total_words = len(document)
    tf = {}
    for word, count in Counter(document).items():
        tf[word] = count / total_words
    return tf

## *2. Calculate inverse document frequency (IDF)*

In [None]:
def calculate_idf(documents):
    """Compute smoothed inverse document frequency for every word.

    Uses the formula ``ln((1 + N) / (1 + df)) + 1`` where ``N`` is the number
    of documents and ``df`` is the number of documents containing the word.

    Args:
        documents: List of tokenized documents (each a sequence of words).

    Returns:
        Dict mapping each distinct word in the corpus to its IDF value.
        An empty corpus yields an empty dict.
    """
    num_documents = len(documents)

    # Count, in a single pass, how many documents contain each word.
    # Deduplicating per document with set() makes repeated words count once
    # and avoids rescanning every token list for every vocabulary word.
    document_frequency = Counter()
    for document in documents:
        document_frequency.update(set(document))

    return {
        word: np.log((1 + num_documents) / (1 + containing)) + 1
        for word, containing in document_frequency.items()
    }

## *3. Calculate TF-IDF*


In [None]:
def calculate_tfidf(documents):
    """Compute unnormalized TF-IDF weights for each document.

    Args:
        documents: List of tokenized documents (each a sequence of words).

    Returns:
        List with one dict per document, mapping each word of that document
        to ``tf * idf``; IDF is computed once over the whole corpus.
    """
    corpus_idf = calculate_idf(documents)

    weights_per_doc = []
    for tokens in documents:
        term_freq = calculate_tf(tokens)
        weights = {}
        for term in tokens:
            weights[term] = term_freq[term] * corpus_idf[term]
        weights_per_doc.append(weights)

    return weights_per_doc


## *4. Normalize TF-IDF vectors*

In [None]:
def normalize_tfidf(tfidf):
    """L2-normalize each document's TF-IDF weights.

    Args:
        tfidf: List of dicts, one per document, mapping word -> weight.

    Returns:
        List of dicts with the same length and order as ``tfidf``. Each
        document with a positive Euclidean norm is scaled to unit length.
        A document whose norm is zero (empty, or all-zero weights) is
        returned as an unchanged copy rather than being dropped, so
        ``result[i]`` always corresponds to ``tfidf[i]``.
    """
    normalized_tfidf = []
    for doc in tfidf:
        norm = np.linalg.norm(list(doc.values()))
        if norm > 0:
            normalized_tfidf.append({word: value / norm for word, value in doc.items()})
        else:
            # Nothing to scale; keep the entry to preserve row alignment.
            normalized_tfidf.append(dict(doc))
    return normalized_tfidf


## *5. Fit and transform documents using TF-IDF from scratch*

In [30]:
def fit_custom_tfidf(documents):
    """Compute an L2-normalized TF-IDF matrix for tokenized documents.

    Columns follow the alphabetically sorted vocabulary. Term frequency is
    ``count / len(document)`` and IDF is ``ln((1 + N) / (1 + df)) + 1``.

    Args:
        documents: List of tokenized documents (each a sequence of words).

    Returns:
        NumPy array with one row per document and one column per vocabulary
        word. Rows with a positive norm are scaled to unit Euclidean length;
        a document without tokens produces an all-zero row.
    """
    n_docs = len(documents)

    # Pass 1: per-document term frequencies and corpus-wide document frequencies
    term_freqs = []
    doc_freq = {}
    for tokens in documents:
        n_tokens = len(tokens)
        counts = Counter(tokens)
        term_freqs.append({term: count / n_tokens for term, count in counts.items()})
        for term in counts:
            doc_freq[term] = doc_freq.get(term, 0) + 1

    # Column index of every term, in alphabetical order
    column_of = {term: col for col, term in enumerate(sorted(doc_freq))}

    # Smoothed inverse document frequency per term
    idf = {
        term: np.log((1 + n_docs) / (1 + containing)) + 1
        for term, containing in doc_freq.items()
    }

    # Pass 2: fill one dense row per document, then L2-normalize it
    rows = []
    for doc_tf in term_freqs:
        row = np.zeros(len(column_of))
        for term, freq in doc_tf.items():
            row[column_of[term]] = freq * idf[term]
        length = np.linalg.norm(row)
        rows.append(row / length if length > 0 else row)

    return np.array(rows)

# Run every raw document through the preprocessing function
processed_docs = list(map(process_data, all_docs))

# Build the TF-IDF matrix with the from-scratch implementation
tfidf_scratch = fit_custom_tfidf(processed_docs)

# Print the from-scratch TF-IDF matrix
print("TF-IDF from Scratch:", tfidf_scratch, sep="\n")

TF-IDF from Scratch:
[[0.         0.4472136  0.         0.         0.         0.
  0.         0.         0.4472136  0.         0.4472136  0.
  0.         0.         0.4472136  0.         0.4472136  0.
  0.        ]
 [0.         0.4472136  0.         0.         0.         0.
  0.         0.         0.4472136  0.         0.4472136  0.
  0.         0.         0.4472136  0.         0.4472136  0.
  0.        ]
 [0.         0.4472136  0.         0.         0.         0.
  0.         0.         0.4472136  0.         0.4472136  0.
  0.         0.         0.4472136  0.         0.4472136  0.
  0.        ]
 [0.         0.         0.         0.35355339 0.35355339 0.
  0.35355339 0.35355339 0.         0.35355339 0.         0.35355339
  0.         0.         0.         0.35355339 0.         0.
  0.35355339]
 [0.         0.         0.         0.35355339 0.35355339 0.
  0.35355339 0.35355339 0.         0.35355339 0.         0.35355339
  0.         0.         0.         0.35355339 0.         0.
  0.353