In [1]:
!pip install transformers sentence-transformers gensim nltk scikit-learn




In [2]:
import numpy as np

import nltk
# Download the 'punkt_tab' tokenizer data package. It backs nltk's word_tokenize,
# which the Word2Vec cell uses to split the generated paragraphs into tokens.
nltk.download('punkt_tab')

from transformers import AutoModelForCausalLM, AutoTokenizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [9]:
# Labels to describe and embed.
# Conventions (the Word2Vec cell lower-cases the generated text and looks each label
# up as a single token, so both rules matter):
#   * every label is written in lower case
#   * a label made of several words joins them with an underscore
# Alternative label sets kept for quick experiments:
#   labels = ["Hypolimnas", "Misippus", "Danaus", "Chrysippus", "Amauris", "Ochlea", "Acraea", "Egina"]
#   labels = ["cat", "dog"]
labels = [
    "hypolimnas_misippus",
    "danaus_chrysippus",
    "amauris_ochlea",
    "acraea_egina",
]

# Generate Contextual Paragraphs Using an LLM

In [10]:
# Load the Qwen2.5-1.5B-Instruct model and tokenizer from the Hugging Face Hub.
# The Qwen2 architecture is implemented inside the transformers library itself, so
# `trust_remote_code=True` is not required; leaving it off means no Python code
# fetched from the model repository is ever executed.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Cast the weights to float16 and move them to the GPU (a CUDA device is required).
model = AutoModelForCausalLM.from_pretrained(model_name).half().cuda()

# Function to generate text for a label
def generate_paragraph(label, max_new_tokens=250, temperature=0.7):
    """Generate a descriptive paragraph about ``label`` with the loaded causal LM.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        label: Text inserted verbatim into the prompt
            ``"Write a descriptive paragraph about {label}."``.
        max_new_tokens: Upper bound on the number of tokens generated after the
            prompt. Defaults to 250, the value that used to be hard-coded.
        temperature: Sampling temperature. Defaults to 0.7, as before.

    Returns:
        The decoded sequence with special tokens removed. The generated sequence
        starts with the prompt tokens, so the returned string begins with the
        prompt itself (visible in the recorded cell output) and therefore
        contains ``label``; the Word2Vec cell relies on that when it looks the
        label up in its vocabulary.
    """
    prompt = f"Write a descriptive paragraph about {label}."
    # Put the inputs on the device the model actually lives on rather than assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        # Temperature only influences the result when sampling is enabled, so request
        # sampling explicitly instead of depending on the checkpoint's generation config.
        do_sample=True,
        temperature=temperature,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Map every label to the paragraph generated for it (one generation call per label, in list order)
contextual_paragraphs_Qwen1_5B = dict(
    (label, generate_paragraph(label)) for label in labels
)

# Show each label followed by its generated text
for label in contextual_paragraphs_Qwen1_5B:
    paragraph = contextual_paragraphs_Qwen1_5B[label]
    print(f"{label}: {paragraph}\n")


hypolimnas_misippus: Write a descriptive paragraph about hypolimnas_misippus. Hypolimnas misippus, commonly known as the blue morpho butterfly, is a species of large, colorful butterflies native to South America. With its iridescent wings that can change color in response to light and temperature, this butterfly is often mistaken for being electrically charged due to its striking appearance. The blue morpho's wings are typically emerald green on top with white spots, but when viewed from below, they appear black. This unique trait has earned it the nickname "blue diamond." In addition to its beautiful colors, the blue morpho is also known for its graceful flight and impressive size - adults can grow up to 12 inches across their wingspan! These butterflies play an important role in their ecosystems by pollinating plants and serving as food sources for other animals like hummingbirds. Despite their remarkable beauty, blue morphos face threats such as habitat loss and pollution, making co

In [5]:
# Disabled experiment: the whole cell is wrapped in a triple-quoted string, so none of it runs.
# The string is the cell's only expression, which is why the notebook echoes it back as the cell output.
# The wrapped snippet generates one paragraph per label with EleutherAI/gpt-neo-1.3B via `pipeline`.
# NOTE(review): if re-enabled, it rebinds `labels` to ["cat", "dog", "bird"], replacing the labels defined earlier.
'''
from transformers import pipeline

# Load a GPT-Neo model for text generation
generator = pipeline("text-generation", model="EleutherAI/gpt-neo-1.3B", device=0)

# Define your labels
labels = ["cat", "dog", "bird"]

# Generate contextual paragraphs for each label
contextual_paragraphs_gpt_neo = {}
for label in labels:
    prompt = f"Write a descriptive paragraph about {label}."
    response = generator(prompt, max_length=100, num_return_sequences=1,truncation=True)
    contextual_paragraphs_gpt_neo[label] = response[0]['generated_text']

# Print results
for label, text in contextual_paragraphs_gpt_neo.items():
    print(f"{label}: {text}\n")
'''

'\nfrom transformers import pipeline\n\n# Load a GPT-Neo model for text generation\ngenerator = pipeline("text-generation", model="EleutherAI/gpt-neo-1.3B", device=0)\n\n# Define your labels\nlabels = ["cat", "dog", "bird"]\n\n# Generate contextual paragraphs for each label\ncontextual_paragraphs_gpt_neo = {}\nfor label in labels:\n    prompt = f"Write a descriptive paragraph about {label}."\n    response = generator(prompt, max_length=100, num_return_sequences=1,truncation=True)\n    contextual_paragraphs_gpt_neo[label] = response[0][\'generated_text\']\n\n# Print results\nfor label, text in contextual_paragraphs_gpt_neo.items():\n    print(f"{label}: {text}\n")\n'

In [6]:
# Disabled experiment: the whole cell is wrapped in a triple-quoted string, so none of it runs.
# The string is the cell's only expression, which is why the notebook echoes it back as the cell output.
# The wrapped snippet repeats the paragraph generation with the numind/NuExtract-1.5-smol checkpoint.
# NOTE(review): if re-enabled, it rebinds `model_name`, `tokenizer`, `model`, `generate_paragraph`
# and `labels`, replacing the Qwen objects and the labels defined earlier.
'''
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the NuExtract-1.5-smol model and tokenizer
model_name = "numind/NuExtract-1.5-smol"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).cuda()

# Function to generate text for a label
def generate_paragraph(label):
    prompt = f"Write a detailed and descriptive paragraph about {label}."
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=100, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# List of labels
labels = ["cat", "dog", "bird"]

# Generate paragraphs for each label
contextual_paragraphs_numind = {label: generate_paragraph(label) for label in labels}

# Print the generated paragraphs
for label, paragraph in contextual_paragraphs_numind.items():
    print(f"{label}: {paragraph}\n")
'''

'\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\n\n# Load the NuExtract-1.5-smol model and tokenizer\nmodel_name = "numind/NuExtract-1.5-smol"\ntokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\nmodel = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).cuda()\n\n# Function to generate text for a label\ndef generate_paragraph(label):\n    prompt = f"Write a detailed and descriptive paragraph about {label}."\n    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")\n    outputs = model.generate(**inputs, max_new_tokens=100, temperature=0.7)\n    return tokenizer.decode(outputs[0], skip_special_tokens=True)\n\n# List of labels\nlabels = ["cat", "dog", "bird"]\n\n# Generate paragraphs for each label\ncontextual_paragraphs_numind = {label: generate_paragraph(label) for label in labels}\n\n# Print the generated paragraphs\nfor label, paragraph in contextual_paragraphs_numind.items():\n    print(f"{label}: {paragraph}\n"

#  Prepare Corpus for Word Embedding

Combine the generated paragraphs into a single newline-separated corpus string.

In [11]:
# Stitch the generated paragraphs together, one paragraph per line, and show the result
corpus = "\n".join(text for text in contextual_paragraphs_Qwen1_5B.values())
print(corpus)


Write a descriptive paragraph about hypolimnas_misippus. Hypolimnas misippus, commonly known as the blue morpho butterfly, is a species of large, colorful butterflies native to South America. With its iridescent wings that can change color in response to light and temperature, this butterfly is often mistaken for being electrically charged due to its striking appearance. The blue morpho's wings are typically emerald green on top with white spots, but when viewed from below, they appear black. This unique trait has earned it the nickname "blue diamond." In addition to its beautiful colors, the blue morpho is also known for its graceful flight and impressive size - adults can grow up to 12 inches across their wingspan! These butterflies play an important role in their ecosystems by pollinating plants and serving as food sources for other animals like hummingbirds. Despite their remarkable beauty, blue morphos face threats such as habitat loss and pollution, making conservation efforts cr

# Generate Word Embeddings

Word2Vec Embeddings

In [12]:
# Tokenize each generated paragraph (lower-cased) into one list of tokens.
# Word2Vec treats every list as a single training "sentence".
sentences = [word_tokenize(paragraph.lower()) for paragraph in contextual_paragraphs_Qwen1_5B.values()]

# Train Word2Vec model.
# A fixed seed together with a single worker thread makes training repeatable; with
# several workers the thread scheduling reorders updates and the vectors differ per run.
# NOTE(review): repeatability across interpreter launches may additionally require
# setting PYTHONHASHSEED - see the gensim documentation.
word2vec_model = Word2Vec(sentences, vector_size=120, window=5, min_count=1, workers=1, seed=42)

# Each label has to exist in the vocabulary as one token (it does when the generated
# text contains the label verbatim). Report all missing labels in one explicit error
# instead of failing with a bare KeyError on the first lookup.
missing_labels = [label for label in labels if label not in word2vec_model.wv.key_to_index]
if missing_labels:
    raise KeyError(f"Labels missing from the Word2Vec vocabulary: {missing_labels}")

# Retrieve embeddings for each label
label_embeddings = {label: word2vec_model.wv[label] for label in labels}
print(label_embeddings)

{'hypolimnas_misippus': array([-7.6333149e-03, -7.7996706e-03, -2.0560329e-03,  1.0215141e-03,
        1.3201023e-03,  3.0016256e-03,  3.7081386e-03,  7.1118888e-03,
       -4.8662005e-03, -4.2487942e-03,  4.2960742e-03, -7.4735694e-03,
        2.8681210e-03,  8.3933660e-04, -3.2064824e-03,  6.8684947e-04,
        3.2937936e-03,  2.1991190e-03,  4.7882157e-03, -6.0123801e-03,
        5.7936842e-03, -3.4141773e-03,  7.3228292e-03,  7.2121709e-03,
       -7.7867541e-03,  6.1797309e-03,  2.8368186e-03,  3.1188987e-03,
        5.9997444e-03,  4.9810493e-03,  1.3962602e-03, -2.1451523e-03,
        1.4740645e-03,  6.9638025e-03, -1.6796138e-03, -7.3717716e-03,
       -9.4865519e-04, -5.9169913e-03, -6.8704663e-03, -7.2172307e-03,
       -2.8552175e-03, -5.4761269e-03, -3.8869588e-03,  5.3134379e-03,
        5.1253149e-03,  6.0747047e-03,  6.2013888e-03,  3.1099261e-03,
        1.0900128e-03,  1.8230838e-03, -1.0410015e-03,  4.3868972e-03,
       -4.9301274e-03,  3.3812586e-03, -2.6620743e-03

In [13]:
# Disabled helper: the whole cell is wrapped in a triple-quoted string, so none of it runs.
# The string is the cell's only expression, which is why the notebook echoes it back as the cell output.
# The wrapped snippet lists the vocabulary of `word2vec_model`.
# NOTE(review): the snippet body appears twice inside the string (the second copy starts on the
# same line as the first `print`); one copy is enough if this is ever re-enabled.
'''

# Get the vocabulary list
vocabulary_list = list(word2vec_model.wv.key_to_index.keys())

# Print the vocabulary list
print(vocabulary_list)# Get the vocabulary list
vocabulary_list = list(word2vec_model.wv.key_to_index.keys())

# Print the vocabulary list
print(vocabulary_list)

'''

'\n\n# Get the vocabulary list\nvocabulary_list = list(word2vec_model.wv.key_to_index.keys())\n\n# Print the vocabulary list\nprint(vocabulary_list)# Get the vocabulary list\nvocabulary_list = list(word2vec_model.wv.key_to_index.keys())\n\n# Print the vocabulary list\nprint(vocabulary_list)\n\n'

Sentence-BERT Embeddings (all-MiniLM-L6-v2, one vector per generated paragraph)

In [14]:
# Load the pre-trained 'all-MiniLM-L6-v2' sentence-transformers encoder
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the paragraph generated for each label into one vector, keyed by label
label_embeddings_b = dict(
    (label, bert_model.encode(paragraph))
    for label, paragraph in contextual_paragraphs_Qwen1_5B.items()
)
print(label_embeddings_b)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

{'hypolimnas_misippus': array([ 4.14610915e-02,  5.21146208e-02,  3.58817540e-02,  6.40311092e-02,
        1.40659418e-02, -1.63089205e-02,  3.59961763e-02, -3.97546068e-02,
       -4.54492122e-02,  1.13856895e-02,  1.34780239e-02, -8.98574814e-02,
       -2.87053958e-02, -2.40782443e-02,  1.95219542e-03,  1.64830387e-01,
        4.93609011e-02, -5.15986979e-02, -7.88304955e-02,  4.75408398e-02,
        4.41292450e-02, -2.12167576e-02, -2.23761369e-02,  1.12880580e-01,
       -9.25367028e-02,  1.38687203e-02, -6.05774261e-02,  6.33324832e-02,
        4.18119272e-03, -7.48881847e-02, -7.35384151e-02,  2.37782653e-02,
        1.65671986e-02, -3.42027587e-03, -3.19723086e-03, -2.51365043e-02,
        2.34484673e-02, -3.51668000e-02, -3.78954806e-03, -1.77503098e-02,
        3.14864144e-02,  5.66404015e-02, -2.69242693e-02,  2.51816139e-02,
       -6.30908310e-02,  3.85745130e-02,  1.06307082e-02,  5.19099832e-02,
       -8.30636360e-03, -5.40148839e-02, -5.36411963e-02, -7.74938464e-02,
 

In [15]:
# Aggregate the Word2Vec label embeddings by averaging them element-wise.
# The result is stored under its own name because the following aggregation cell
# rebinds `final_vector`, which would otherwise discard this Word2Vec aggregate.
final_vector_word2vec = np.mean(list(label_embeddings.values()), axis=0)
# `final_vector` is still bound here, exactly as before, for any code that reads it.
final_vector = final_vector_word2vec
print("Final Word2Vec Vector Shape:", final_vector_word2vec.shape)

Final Word2Vec Vector Shape: (120,)


In [16]:
# Aggregate the paragraph embeddings in `label_embeddings_b` by averaging them element-wise.
# The result is stored under a dedicated name as well, because `final_vector` is a
# shared name that this assignment overwrites (it held the Word2Vec aggregate before).
final_vector_bert = np.mean(list(label_embeddings_b.values()), axis=0)
# Keep `final_vector` bound to this aggregate, exactly as before.
final_vector = final_vector_bert
print("Final BERT Vector Shape:", final_vector_bert.shape)

Final BERT Vector Shape: (384,)


In [17]:
# Disabled step: the whole cell is wrapped in a triple-quoted string, so nothing is written to disk.
# The string is the cell's only expression, which is why the notebook echoes it back as the cell output.
# NOTE(review): `final_vector` is assigned by both aggregation cells, so if this is re-enabled
# it saves whichever aggregate was computed last - confirm that is the intended one.
'''
# Save final vector
np.save("final_vector.npy", final_vector)
'''

'\n# Save final vector\nnp.save("final_vector.npy", final_vector)\n'

In [18]:
# Disabled visualization: the whole cell is wrapped in a triple-quoted string, so none of it runs.
# The string is the cell's only expression, which is why the notebook echoes it back as the cell output.
# The wrapped snippet projects the vectors in `label_embeddings` (the Word2Vec label vectors)
# to two dimensions with PCA and draws an annotated scatter plot.
'''
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Extract embeddings
embeddings = list(label_embeddings.values())
label_names = list(label_embeddings.keys())

# Reduce dimensionality
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# Plot the embeddings
plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], marker='o')

for i, label in enumerate(label_names):
    plt.annotate(label, (reduced_embeddings[i, 0], reduced_embeddings[i, 1]))

plt.title("Label Embeddings Visualization")
plt.show()
'''

'\nfrom sklearn.decomposition import PCA\nimport matplotlib.pyplot as plt\n\n# Extract embeddings\nembeddings = list(label_embeddings.values())\nlabel_names = list(label_embeddings.keys())\n\n# Reduce dimensionality\npca = PCA(n_components=2)\nreduced_embeddings = pca.fit_transform(embeddings)\n\n# Plot the embeddings\nplt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], marker=\'o\')\n\nfor i, label in enumerate(label_names):\n    plt.annotate(label, (reduced_embeddings[i, 0], reduced_embeddings[i, 1]))\n\nplt.title("Label Embeddings Visualization")\nplt.show()\n'