<a href="https://colab.research.google.com/github/SlayerDraco/personal-assistant-LLM/blob/main/personal_assistant_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Personal Assistant LLM: Environment Setup

In [9]:
!pip install -q transformers datasets tokenizers accelerate deepspeed

from pathlib import Path

# Create your structure
folders = [
    "config", "tokenizer", "data/raw", "data/processed",
    "training", "model", "inference"
]
for folder in folders:
    Path(folder).mkdir(parents=True, exist_ok=True)

print("✔️ Environment setup complete.")


✔️ Environment setup complete.


## Preparing the tokeniser: sample data and custom GPT-2 tokens


In [10]:
# Make sure the raw-data directory exists before anything is written into it.
raw_dir = Path("data/raw")
raw_dir.mkdir(parents=True, exist_ok=True)

# A tiny seed corpus of user/assistant exchanges, one blank line between turns.
sample_text = """
User: Set a reminder every hour to drink water.
AI: Got it. Hourly hydration threats activated.

User: Take a note that I need to submit the assignment tomorrow.
AI: Noted. Should I also remind you five times before the deadline or let you panic naturally?

User: Schedule a call with Mom at 8 PM.
AI: Scheduled. Tell her I said hi and that you're still alive.

User: What's on my agenda today?
AI: Cry a little, do some work, pretend you're fine. Oh, and a meeting at 3 PM.

User: Draft a message for my professor about the late submission.
AI: "Dear Sir, I deeply regret being a walking deadline disaster..." — want me to send that?

User: Tell me a joke.
AI: You. Trying to wake up before 10 AM.

User: Delete my last note.
AI: Gone. Like your motivation.

User: How's the weather?
AI: Perfect for staying in and questioning your life choices.

User: Play some music.
AI: Playing your favorite: Lo-fi beats to cry and code to.

User: Set an alarm for 7 AM.
AI: Set. But we both know you're hitting snooze.
"""

# The triple-quoted literal starts and ends with a newline; strip those so the
# file begins with the first "User:" line and ends with the last "AI:" line.
Path("data/raw/sample_conversations.txt").write_text(sample_text.strip())

print("✅ Sample dataset created.")


✅ Sample dataset created.


In [11]:
from transformers import GPT2Tokenizer
from pathlib import Path

# Custom domain-specific tokens to append to the stock GPT-2 vocabulary.
new_tokens = [
    "<remind>",
    "<snooze>",
    "<hydrate>",
    "<assistant>",
    "<user>",
    "<sarcasm>",
    "<panic_mode>",
    "<cry>",
    "<procrastinate>",
]

# Start from the pre-trained GPT-2 tokenizer and register the extra tokens.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_tokens(new_tokens)

print(f"✅ Added {len(new_tokens)} new tokens. New vocab size: {len(tokenizer)}")

# Persist the extended tokenizer. The save call is left as the final
# expression so the notebook displays the tuple of written file paths.
Path("tokenizer").mkdir(parents=True, exist_ok=True)
tokenizer.save_pretrained("tokenizer/")


✅ Added 9 new tokens. New vocab size: 50266


('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.json',
 'tokenizer/merges.txt',
 'tokenizer/added_tokens.json')

In [12]:
!pip install nltk wordfreq lemminflect

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

from nltk.corpus import wordnet as wn
import lemminflect
from lemminflect import getAllInflections
from wordfreq import top_n_list
import random

all_words = set()

# 1. WordNet base words and inflections
for synset in wn.all_synsets():
    for lemma in synset.lemmas():
        word = lemma.name().lower().replace('_', ' ')
        all_words.add(word)

        # Derivational forms
        if lemma.derivationally_related_forms():
            for related in lemma.derivationally_related_forms():
                all_words.add(related.name().lower().replace('_', ' '))

# 2. Lemminflect inflections
print("Expanding via lemminflect...")
temp_words = list(all_words)
for word in temp_words:
    try:
        inflections = getAllInflections(word)
        for form_list in inflections.values():
            for form in form_list:
                all_words.add(form.lower())
    except:
        continue

# 3. Add high-frequency words from wordfreq (English)
print("Adding top words from wordfreq...")
top_words = top_n_list('en', 10000)
all_words.update(top_words)

print(f"✅ Total unique words generated: {len(all_words)}")

# Optional: Save to a .txt file for tokenizer training
with open("mew.txt", "w") as f:
    f.write("\n".join(sorted(all_words)))




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Expanding via lemminflect...
Adding top words from wordfreq...
✅ Total unique words generated: 176210


In [None]:
from transformers import GPT2Tokenizer

# Reload the extended tokenizer that was saved under tokenizer/.
tokenizer = GPT2Tokenizer.from_pretrained("tokenizer/")

# Read the whole word list into memory as a single string.
with open("mew.txt", "r") as word_file:
    text_data = word_file.read()

# Tokenize the full text in one call.
encoded = tokenizer.encode(text_data, add_special_tokens=True)

# Persist the token IDs as space-separated integers on a single line.
serialized_ids = " ".join(str(token_id) for token_id in encoded)
with open("data/processed/encoded_data.txt", "w") as out_file:
    out_file.write(serialized_ids)

print(f"✅ Encoded {len(encoded)} tokens and saved to `data/processed/encoded_data.txt`")


In [None]:
import numpy as np
from pathlib import Path
from datasets import Dataset

# Paths
encoded_path = "data/processed/encoded_data.txt"
npy_path = "data/processed/encoded_data.npy"
hf_dataset_path = "data/processed/hf_tokenized_dataset"
shard_prefix = "data/processed/shard_"
num_shards = 4  # You can increase this based on your compute

# Step 1: Load token IDs
print("🔄 Loading token IDs from text...")
with open(encoded_path, "r") as f:
    # The encoded file stores the IDs separated by single spaces on one line,
    # so it must be split on whitespace. Parsing it line by line and testing
    # each whole line with isdigit() rejects that line (it contains spaces) and
    # yields zero tokens. split() handles spaces and newlines alike; the
    # isdigit() filter still drops any stray non-numeric fragment.
    token_ids = [int(tok) for tok in f.read().split() if tok.isdigit()]

if not token_ids:
    # Fail loudly instead of silently writing empty arrays and an empty dataset.
    raise ValueError(f"No token IDs could be parsed from {encoded_path}")

print(f"✅ Loaded {len(token_ids):,} tokens.")

# Step 2: Save as .npy (binary)
print("💾 Saving token IDs as .npy...")
# uint16 halves the file size but only holds IDs up to 65535; fall back to
# uint32 automatically if the vocabulary ever exceeds that.
id_dtype = np.uint16 if max(token_ids) <= np.iinfo(np.uint16).max else np.uint32
np_array = np.array(token_ids, dtype=id_dtype)
np.save(npy_path, np_array)
print(f"✅ Saved to {npy_path} with shape {np_array.shape}.")

# Step 3: Shard into smaller .npy files
print(f"🔪 Sharding into {num_shards} pieces...")
shard_size = len(np_array) // num_shards
for i in range(num_shards):
    start = i * shard_size
    # The final shard absorbs the remainder left by the integer division.
    end = (i + 1) * shard_size if i < num_shards - 1 else len(np_array)
    shard = np_array[start:end]
    np.save(f"{shard_prefix}{i}.npy", shard)
    print(f"   🧩 Shard {i} saved: {len(shard)} tokens.")

# Step 4: Convert to HuggingFace dataset
print("🧠 Converting to HuggingFace Dataset format...")
# NOTE: passing a flat list means every dataset row holds a single token ID.
dataset = Dataset.from_dict({"input_ids": token_ids})
dataset.save_to_disk(hf_dataset_path)
print(f"✅ Saved HuggingFace dataset to {hf_dataset_path}")
