In [None]:
# !pip install transformers

import os
import string
import unicodedata

import numpy as np
import spacy
from tabulate import tabulate
from transformers import pipeline

In [None]:
# !pip install nltk
import nltk
# Fetch the NLTK data packages this notebook relies on; presumably these back
# word_tokenize ('punkt'), the stopwords corpus ('stopwords') and
# WordNetLemmatizer ('wordnet'), which are imported in a later cell.
nltk.download('punkt')
nltk.download('stopwords')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Load spaCy English model
# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook (all preprocessing goes through NLTK) — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [None]:
# # Load the text generation pipeline with the specified model
# gen = pipeline('text-generation', model='EleutherAI/gpt-neo-2.7B')

# def write_prompt_to_file(prompt, filename):
#     with open(filename, 'w') as file:
#         file.write(prompt)

# def generate_document(prompt_files, num_phrases):
#     for prompt_file in prompt_files:
#         with open(prompt_file, 'r') as file:
#             prompt = file.read().strip()

#         output_file = os.path.splitext(prompt_file)[0] + '_generated.txt'
#         with open(output_file, 'w') as file:
#             generated_text = gen(prompt, max_length=50, num_return_sequences=num_phrases)
#             for i, text in enumerate(generated_text):
#                 # Write only the generated text to the file
#                 file.write(f"{text['generated_text']}\n\n")


# # Directory to store prompt files
# prompt_dir = "prompts"
# # Number of phrases to generate for each prompt
# num_phrases = 10

# # Ensure the prompt directory exists
# if not os.path.exists(prompt_dir):
#   os.makedirs(prompt_dir)

# # List of prompts
# prompts = ["Machine Learning", "Smart Agiricluture", "Smart cities"]

# # Write each prompt to a separate file
# prompt_files = []
# for i, prompt in enumerate(prompts):
#   filename = os.path.join(prompt_dir, f"prompt_{i+1}.txt")
#   write_prompt_to_file(prompt, filename)
#   prompt_files.append(filename)

# # Generate documents based on the prompts
# generate_document(prompt_files, num_phrases)
# print("Documents generated successfully.")

In [None]:
# Function to clean data from symbols and characters
def clean_data(text):
    """Strip punctuation from ``text`` and collapse runs of whitespace.

    Removes every character in ``string.punctuation`` and, in addition, every
    character whose Unicode general category is a punctuation class (``P*``).
    ``string.punctuation`` only covers ASCII, so without the second pass
    typographic quotes and dashes (``“ ” ‘ ’ —``) survive cleaning and later
    show up as standalone tokens.

    Args:
        text: Raw document text.

    Returns:
        The text with punctuation deleted (not replaced by a space), leading
        and trailing whitespace removed, and internal whitespace runs reduced
        to a single space.
    """
    # ASCII punctuation and symbols ($, +, <, ... are not Unicode category P,
    # so this table is still needed alongside the category check below).
    without_ascii = text.translate(str.maketrans('', '', string.punctuation))
    # Non-ASCII punctuation: curly quotes, en/em dashes, ellipsis, etc.
    cleaned_text = ''.join(
        char for char in without_ascii
        if not unicodedata.category(char).startswith('P')
    )
    # Remove extra spaces
    return ' '.join(cleaned_text.split())


In [None]:
# Function for tokenization using NLTK
def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize(text)`` produces for the input.
    """
    return word_tokenize(text)

In [None]:
# Function for lemmatization using NLTK
def lemmatize(tokens):
    """Lemmatize every token with a fresh ``WordNetLemmatizer``.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list with one lemmatized entry per input token, in input order.
        Each token is lemmatized with the lemmatizer's default arguments.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return list(map(wordnet_lemmatizer.lemmatize, tokens))

In [None]:
# Function to remove stop words using NLTK
def remove_stopwords(tokens):
    """Drop tokens that appear in NLTK's English stop-word list.

    The comparison is done on the lowercased token, but surviving tokens are
    returned with their original casing.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list of the tokens that are not stop words, in input order.
    """
    english_stop_words = set(stopwords.words('english'))
    kept = []
    for candidate in tokens:
        if candidate.lower() in english_stop_words:
            continue
        kept.append(candidate)
    return kept

In [None]:
# Function to extract unique words
def get_unique_words(tokens):
    """Return the distinct tokens of ``tokens`` as a set.

    Args:
        tokens: Iterable of hashable tokens.

    Returns:
        A ``set`` containing each token once (unordered).
    """
    return set(tokens)

In [None]:
# Function to compute TF
def compute_tf(tokens):
    """Compute relative term frequencies for one document.

    Args:
        tokens: Sequence of tokens making up the document.

    Returns:
        A dict mapping each distinct token to ``occurrences / len(tokens)``.
        An empty token sequence yields an empty dict.
    """
    total = len(tokens)
    occurrences = {}
    for token in tokens:
        if token in occurrences:
            occurrences[token] += 1
        else:
            occurrences[token] = 1
    # Convert raw counts to fractions of the document length.
    return {token: hits / total for token, hits in occurrences.items()}

In [None]:
# Function to compute IDF
def compute_idf(docs):
    """Compute smoothed inverse document frequencies over a corpus.

    For every word the value is ``log((N + 1) / (df + 1)) + 1`` where ``N`` is
    the number of documents and ``df`` the number of documents containing the
    word.

    Args:
        docs: Sequence of documents, each an iterable of tokens.

    Returns:
        A dict mapping every word seen in any document to its IDF value.
    """
    total_docs = len(docs)
    doc_freq = {}
    for doc in docs:
        # A word counts once per document regardless of repetitions.
        for term in set(doc):
            doc_freq[term] = doc_freq.get(term, 0) + 1
    return {
        term: np.log((total_docs + 1) / (freq + 1)) + 1
        for term, freq in doc_freq.items()
    }

In [None]:
# Function to compute TF-IDF
def compute_tfidf(tf_dicts, idf_dict):
    """Combine per-document term frequencies with corpus IDF values.

    Args:
        tf_dicts: List of dicts, one per document, mapping word -> TF.
        idf_dict: Dict mapping word -> IDF. Words absent from it are treated
            as having an IDF of 0.

    Returns:
        A list of dicts (same order as ``tf_dicts``) mapping word -> TF * IDF.
    """
    return [
        {term: freq * idf_dict.get(term, 0) for term, freq in doc_tf.items()}
        for doc_tf in tf_dicts
    ]

In [None]:
def normalize_tfidf(tfidf_matrix):
    """L2-normalize each TF-IDF vector.

    Args:
        tfidf_matrix: List of dicts, one per document, mapping word -> TF-IDF.

    Returns:
        A new list of dicts in the same order where each document's values
        have been divided by that document's Euclidean norm. A vector whose
        norm is zero (empty, or all-zero weights) is copied unchanged, since
        dividing by the zero norm would turn every value into NaN.
    """
    normalized_tfidf_matrix = []
    for tfidf_vector in tfidf_matrix:
        # Compute L2 norm of the TF-IDF vector
        norm = np.linalg.norm(list(tfidf_vector.values()), ord=2)
        if norm == 0:
            # All-zero vectors occur when none of the document's words have an
            # IDF entry (compute_tfidf substitutes 0 for unknown words).
            normalized_tfidf_matrix.append(dict(tfidf_vector))
            continue
        # Normalize each TF-IDF value by dividing by the norm
        normalized_tfidf_matrix.append(
            {word: tfidf_value / norm for word, tfidf_value in tfidf_vector.items()}
        )
    return normalized_tfidf_matrix

In [None]:
# Full pipeline: preprocess each prompt file and print a normalized TF-IDF table
def process_prompts(prompt_files):
    """Preprocess each prompt file and print a normalized TF-IDF table.

    Every file is read, cleaned, lowercased, tokenized, stripped of stop
    words and lemmatized. Term frequencies are computed per document, IDF over
    all documents, and the resulting TF-IDF vectors are L2-normalized. The
    unique words of each document and a Document x Word grid are printed.

    The table columns cover the vocabulary of *all* documents. (Previously the
    columns were taken from a loop variable and therefore only contained the
    words of the last file processed.)

    Args:
        prompt_files: Iterable of paths to text files, read as UTF-8.

    Returns:
        The normalized TF-IDF matrix: a list with one ``{word: weight}`` dict
        per input file, in input order. An empty list when no files are given.
    """
    if not prompt_files:
        # Without this guard the table code below had no vocabulary to use.
        print("No prompt files to process.")
        return []

    docs = []
    tf_dicts = []
    for prompt_file in prompt_files:
        # Explicit encoding so non-ASCII text decodes the same on every platform.
        with open(prompt_file, 'r', encoding='utf-8') as file:
            prompt = file.read().strip()

        # Step 1: Cleaning data
        cleaned_prompt = clean_data(prompt)

        # Step 2: Normalization (converting to lowercase)
        normalized_prompt = cleaned_prompt.lower()

        # Step 3: Tokenization
        tokens = tokenize(normalized_prompt)

        # Step 4: Remove stop words *before* lemmatizing. Lemmatizing first can
        # mangle a stop word into a form the stop list no longer matches (the
        # notebook's recorded output contains 'ha', presumably from 'has').
        content_tokens = remove_stopwords(tokens)

        # Step 5: Lemmatization
        lemmatized_tokens = lemmatize(content_tokens)

        # Step 6: Second stop-word pass, so tokens whose lemma is a stop word
        # are still dropped exactly as they were before.
        filtered_tokens = remove_stopwords(lemmatized_tokens)

        # Get unique words (per document, for the progress print below)
        unique_words = get_unique_words(filtered_tokens)

        # Compute TF
        tf_dicts.append(compute_tf(filtered_tokens))

        # Add document to docs
        docs.append(filtered_tokens)

        # Print or use the processed prompt and its unique words as needed
        print(f"Unique words: {unique_words}")
        print()

    # Compute IDF
    idf_dict = compute_idf(docs)

    # Compute TF-IDF
    tfidf_matrix = compute_tfidf(tf_dicts, idf_dict)

    # Normalize TF-IDF
    normalized_tfidf_matrix = normalize_tfidf(tfidf_matrix)

    # Column set = every word that occurs in any document, sorted once.
    vocabulary = sorted(set().union(*docs))

    header = ["Document"] + vocabulary
    table_data = []
    for i, normalized_tfidf_vector in enumerate(normalized_tfidf_matrix):
        row = [f"Document {i+1}"]
        # Words absent from a document get weight 0.
        row.extend(f"{normalized_tfidf_vector.get(word, 0):.3f}" for word in vocabulary)
        table_data.append(row)

    print("Normalized TF-IDF Matrix (Document * Word):")
    print(tabulate(table_data, headers=header, tablefmt="grid"))

    return normalized_tfidf_matrix

In [None]:
# Directory containing prompt files
prompt_dir = "/content/data"

# Get list of prompt files. os.listdir returns entries in arbitrary order, so
# sort them to keep the "Document N" numbering stable between runs.
prompt_files = sorted(
    os.path.join(prompt_dir, file)
    for file in os.listdir(prompt_dir)
    if file.endswith('.txt')
)

# Process each prompt
processed_prompts = process_prompts(prompt_files)

Unique words: {'population', 'focus', '”', 'reduce', 'one', 'every', 'case', 'easy', 'problem', 'extent', 'leveraging', 'career', 'challenge', 'created', 'big', 'ha', 'detroit', 'growth', 'topic', 'around', 'continue', 'often', 'may', 'got', 'rising', 'conversation', 'let', 'experienced', 'smart', 'also', 'global', 'attribute', 'understand', 'burdened', 'weve', 'show', 'ottawa', 'interesting', 'lot', 'care', 'actively', 'environment', '—', 'developed', 'future', 'facing', 'key', 'exemplar', 'meet', 'hand', 'shift', 'healthier', 'sustainable', 'approach', 'etc', 'idea', 'energy', 'used', 'impact', '5', 'sharing', 'many', 'pressure', 'define', 'make', '“', 'group', 'enlivened', 'definition', 'extensively', 'site', 'place', '1980s', 'destiny', 'time', 'tough', 'research', 'concept', 'embrace', 'designer', 'barrier', '‘', 'always', 'mixed', 'international', 'order', 'people', 'find', 'term', 'dilemma', 'city', 'value', 'river', 'economy', 'known', 'despite', 'buzz', 'resident', 'past', 'ra

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def process2_prompts(prompt_files):
    """Preprocess prompt files and vectorize them with ``TfidfVectorizer``.

    Each file is read, cleaned, lowercased, tokenized, stripped of stop words
    and lemmatized; the surviving tokens are re-joined with single spaces and
    the resulting strings are passed to a default-configured
    ``TfidfVectorizer``.

    Args:
        prompt_files: Iterable of paths to text files, read as UTF-8.

    Returns:
        A tuple ``(tfidf_matrix, feature_names)``: the result of
        ``fit_transform`` on the processed documents (one row per file, in
        input order) and the vectorizer's ``get_feature_names_out()``.
    """
    docs = []
    for prompt_file in prompt_files:
        # Explicit encoding so non-ASCII text decodes the same on every platform.
        with open(prompt_file, 'r', encoding='utf-8') as file:
            prompt = file.read().strip()

        # Step 1: Cleaning data
        cleaned_prompt = clean_data(prompt)

        # Step 2: Normalization (converting to lowercase)
        normalized_prompt = cleaned_prompt.lower()

        # Step 3: Tokenization
        tokens = tokenize(normalized_prompt)

        # Step 4: Remove stop words *before* lemmatizing. Lemmatizing first can
        # mangle a stop word into a form the stop list no longer matches (the
        # notebook's recorded output contains 'ha', presumably from 'has').
        content_tokens = remove_stopwords(tokens)

        # Step 5: Lemmatization
        lemmatized_tokens = lemmatize(content_tokens)

        # Step 6: Second stop-word pass, so tokens whose lemma is a stop word
        # are still dropped exactly as they were before.
        filtered_tokens = remove_stopwords(lemmatized_tokens)

        # Join tokens back into a sentence
        processed_prompt = ' '.join(filtered_tokens)

        # Add document to docs
        docs.append(processed_prompt)

    # Compute TF-IDF
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

    return tfidf_matrix, tfidf_vectorizer.get_feature_names_out()

In [None]:
# Process prompts and get TF-IDF matrix
# `tfidf_matrix` is the vectorizer's fit_transform result and `feature_names`
# its get_feature_names_out() array, as returned by process2_prompts.
tfidf_matrix, feature_names = process2_prompts(prompt_files)

In [None]:
# Print Normalized TF-IDF Matrix (Document * Word)
print("Normalized TF-IDF Matrix (Document * Word):")

# Densify once; every row is printed in full below.
dense_rows = tfidf_matrix.toarray()

# Size each column to fit its header. A fixed width of 10 let longer feature
# names push the header cells out of line with the values underneath.
column_widths = [max(10, len(word)) for word in feature_names]
# The label column must also fit the longest "| Document N " label.
label_width = max(12, len(f"| Document {len(dense_rows)} "))

# Print headers
header = "| Document ".ljust(label_width) + "|"
for word, width in zip(feature_names, column_widths):
    header += word.ljust(width) + "|"
print(header)
print("-" * len(header))

# Print matrix
for i, row in enumerate(dense_rows):
    doc = f"| Document {i + 1} ".ljust(label_width) + "|"
    values = [f"{value:.3f}".ljust(width) for value, width in zip(row, column_widths)]
    print(doc + "|".join(values) + "|")

Normalized TF-IDF Matrix (Document * Word):
| Document  |10        |1980s     |2006      |2013      |2015      |3m        |ability   |academia  |accurate  |acted     |action    |actively  |agiricluture|agiricluturel|agiricluturets|agitrack  |agoristatic|agrawalamit|ai        |algorithm |already   |also      |always    |analyzed  |ancient   |andor     |answer    |approach  |archive   |around    |art       |article   |artificial|attribute |august    |automation|avoid     |aware     |barrier   |based     |behind    |beneficial|benefit   |big       |birth     |bit       |blog      |boy       |branch    |break     |broader   |building  |burdened  |business  |buzz      |called    |care      |career    |case      |category  |centre    |challenge |christian |city      |citysolutions|class     |classification|close     |closure   |college   |comfort   |commentary|community |computational|computer  |computergenerated|concept   |concerned |configure |construction|continue  |conversation|cost     