In [2]:
import re
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Import necessary libraries for text generation
from transformers import pipeline

# Initialize the text generation pipeline
generator = pipeline("text-generation")

def generate_documents(prompt, num_documents=3, max_length=50, model='gpt2'):
    generated_documents = []
    for _ in range(num_documents):
        # Generate text based on the prompt using the text generation model
        generated_text = generator(prompt, max_length=max_length, do_sample=True)[0]['generated_text']
        generated_documents.append(generated_text)

    return generated_documents

# Sample usage
prompt = "The quick brown fox"
generated_docs = generate_documents(prompt)
for i, doc in enumerate(generated_docs):
    print(f"Generated Document {i+1}:")
    print(doc)
    print()

def clean_text(text):
    # Remove non-alphanumeric characters
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text

def normalize_text(text):
    return text.lower()

def tokenize_text(text):
    return word_tokenize(text)

def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

# Generate documents
generated_documents = generate_documents(prompt)

#  1: Cleaning data
cleaned_documents = [clean_text(doc) for doc in generated_documents]

#  2: Normalization
normalized_documents = [normalize_text(doc) for doc in cleaned_documents]

#  3: Tokenization
tokenized_documents = [tokenize_text(doc) for doc in normalized_documents]

#  4: Lemmatization
lemmatized_documents = [lemmatize_text(tokens) for tokens in tokenized_documents]

#  5: Remove stop words
processed_documents = [remove_stopwords(tokens) for tokens in lemmatized_documents]

# Unique words
unique_words = set([word for doc in processed_documents for word in doc])

# TFIDF using scikit-learn
# Convert processed documents back to text
preprocessed_texts = [' '.join(doc) for doc in processed_documents]

# Calculate TFIDF using sklearn TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_texts)

# Convert TFIDF matrix to numpy array
tfidf_array = tfidf_matrix.toarray()

# Print TFIDF feature vectors
for i, doc in enumerate(generated_documents):
    print(f"TFIDF Vector for Document {i+1}:")
    feature_names = vectorizer.get_feature_names_out()
    for word, tfidf_score in zip(feature_names, tfidf_array[i]):
        if tfidf_score > 0:
            print(f"{word}: {tfidf_score}")
    print("\n")

# TFIDF from scratch
# TF calculation
def calculate_tf(document, unique_words):
    word_counts = Counter(document)
    tf = {word: word_counts[word] / len(document) for word in unique_words}
    return tf

# IDF calculation
def calculate_idf(documents, unique_words):
    idf = {}
    num_documents = len(documents)
    for word in unique_words:
        num_documents_containing_word = sum(1 for doc in documents if word in doc)
        idf[word] = np.log(num_documents / (1 + num_documents_containing_word))
    return idf

# Calculate TF for each document
tf_documents = [calculate_tf(doc, unique_words) for doc in processed_documents]

# Calculate IDF for each word
idf_values = calculate_idf(processed_documents, unique_words)

# Multiply TF * IDF
tfidf_from_scratch = np.zeros((len(processed_documents), len(unique_words)))
for i, tf_doc in enumerate(tf_documents):
    for j, word in enumerate(unique_words):
        tfidf_from_scratch[i, j] = tf_doc[word] * idf_values[word]

# # Normalize TFIDF
# tfidf_from_scratch_normalized = tfidf_from_scratch / np.linalg.norm(tfidf_from_scratch, axis=1, keepdims=True)

# # Print normalized TFIDF feature vectors from scratch
# for i, doc in enumerate(generated_documents):
#     print(f"Normalized TFIDF Vector for Document {i+1} (From Scratch):")
#     for word, tfidf_score in zip(unique_words, tfidf_from_scratch_normalized[i]):
#         if tfidf_score > 0:
#             print(f"{word}: {tfidf_score}")
#     print("\n")

# TFIDF using built-in
# Convert TFIDF matrix to numpy array
tfidf_built_in_array = tfidf_matrix.toarray()

# Print TFIDF feature vectors using built-in
for i, doc in enumerate(generated_documents):
    print(f"TFIDF Vector for Document {i+1} (Built-in):")
    feature_names = vectorizer.get_feature_names_out()
    for word, tfidf_score in zip(feature_names, tfidf_built_in_array[i]):
        if tfidf_score > 0:
            print(f"{word}: {tfidf_score}")
print("\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Document 1:
The quick brown fox will turn into a wolf and then go through a lot of bad dog fight scenes. It's a funny little fox thing. The fox's just so shy with a lot of it. It's like this "good guy fox."

Generated Document 2:
The quick brown fox-like eyes of the mummifying mummified fox-like eyes of the witch-wanderer could easily be mistaken for the most beautiful little white fox on Earth, but now that they've taken a closer look,

Generated Document 3:
The quick brown foxes, the best known of the six-cornered sea lion species, have already been found, but it's very likely that the big blue fox, which is also larger by half to two, will end up on the



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


TFIDF Vector for Document 1:
able: 0.19092575238770934
brown: 0.11276382501169671
brownie: 0.19092575238770934
enough: 0.19092575238770934
fluffy: 0.19092575238770934
fox: 0.572777257163128
foxhole: 0.19092575238770934
ha: 0.19092575238770934
huge: 0.19092575238770934
light: 0.19092575238770934
like: 0.14520395588865156
look: 0.14520395588865156
need: 0.19092575238770934
nice: 0.19092575238770934
quick: 0.11276382501169671
round: 0.19092575238770934
sit: 0.19092575238770934
something: 0.14520395588865156
two: 0.19092575238770934
walk: 0.19092575238770934
want: 0.19092575238770934
warm: 0.19092575238770934


TFIDF Vector for Document 2:
brown: 0.14569639337871368
foxy: 0.24668543766692175
good: 0.18761063377465192
ill: 0.24668543766692175
im: 0.24668543766692175
leave: 0.24668543766692175
look: 0.18761063377465192
new: 0.24668543766692175
nod: 0.24668543766692175
quick: 0.14569639337871368
said: 0.4933708753338435
see: 0.24668543766692175
something: 0.18761063377465192
thanks: 0.2466854