In [1]:
import json 
import os 
import gzip
from torch.utils.data import DataLoader
from torch.utils.data import IterableDataset
from datasets import load_dataset
from transformers import GPT2Tokenizer
from customTransformers import DecodeTransformer 
from utils.common import save_file_text, read_file_text

  from .autonotebook import tqdm as notebook_tqdm
Skipping import of cpp extensions due to incompatible torch version 2.10.0+cu128 for torchao version 0.15.0             Please see https://github.com/pytorch/ao/issues/2919 for more info


In [7]:
tokenizer = GPT2Tokenizer.from_pretrained(
    "gpt2",
    local_files_only=True
)
tokenizer.pad_token = tokenizer.eos_token

In [2]:
ds = load_dataset("roneneldan/TinyStories", split="train")

Mid Training QA + generative

In [3]:
REAL_NAMES = [
  "Tim",
  "Tom",
  "Sam",
  "Bob",
  "Ben",
  "Max",
  "Jack",
  "Leo",
  "Alex",
  "Anna",
  "Amy",
  "Emma",
  "Lily",
  "Lucy",
  "Mia",
  "Ella",
  "Sarah",
  "John",
  "Mary"
]
DATASET_PATH = "../CustomDatasets/story.json"

In [4]:
def generate_qa_from_story(story: str):
    qas = []

    sentences = story.split(".")
    sentences = [s.strip() for s in sentences if len(s.strip()) > 0]
    list = {}  

    for s in sentences:
        tokens = s.split()
        if len(tokens) < 3:
            continue 

        name = tokens[0]

        if name not in REAL_NAMES:
            continue

        if name.istitle():
            qas.append({
                "q": f"who is {name.lower()} ?",
                "a": s.strip() + "."
            })

        if "is" in tokens or "was" in tokens:
            qas.append({
                "q": f"what is {name.lower()} doing ?",
                "a": s.strip() + "."
            })

    return qas


def convert_tinystories(dataset, max_samples=50_000):
    output = []

    for ex in dataset:
        story = ex["text"].strip()
        qa = generate_qa_from_story(story)

        if len(qa) == 0:
            continue

        output.append({
            "story": story,
            "qa": qa
        })

        if len(output) >= max_samples:
            break

    return output

In [5]:
storyqa = convert_tinystories(ds, max_samples=100000)

In [7]:
# assert not os.path.exists(DATASET_PATH) , "Not"
if not os.path.exists(DATASET_PATH): 
    print("Packing")
    save_file_text(storyqa, DATASET_PATH)

Packing


In [8]:
storyqa_data = read_file_text(DATASET_PATH)

In [None]:
len(storyqa_data) , storyqa_data[:10]

In [12]:
import json
import random

random.seed(42)

def normalize(text):
    return " ".join(text.strip().lower().split())

def build_sft(): 
    data = read_file_text(DATASET_PATH)

    sft = []
    seen_pairs = set()

    for ex in data:
        for qa in ex["qa"]:
            pair_key = (
                normalize(qa["q"]),
                normalize(qa["a"])
            )

            if pair_key in seen_pairs:
                continue

            seen_pairs.add(pair_key)
            sft.append({
                "prompt": qa["q"].strip(),
                "response": qa["a"].strip()
            })

    UNKNOWN_NAMES = [
        "billy", "alex", "john", "mark", "peter",
        "sarah", "lucas", "james", "emma2", "tom2"
    ]

    for name in UNKNOWN_NAMES:
        for template in [
            f"who is {name} ?",
            f"what is {name} doing ?",
            f"tell me about {name}"
        ]:
            sft.append({
                "prompt": template,
                "response": f"I don't know who {name.capitalize()} is."
            })

    random.shuffle(sft)
    save_file_text(sft, "sft.json")
    print(f"SFT samples: {len(sft)}")
    return sft


In [13]:
sftdata = build_sft()

SFT samples: 271779


In [14]:
len(storyqa_data) , len(sftdata)

(100000, 271779)

In [None]:
sftdata

In [None]:
from pypdf import PdfReader

reader = PdfReader("../../datasets/prasanna.pdf")

text = ""
for page in reader.pages:
    text += page.extract_text()

print(text)

Prasanna Jagadesh
♂phone91+ 6383022025 /envel⌢peprasannnajaga9@gmail.com /linkedinlinkedin /githubgithub /gl⌢bewebsite /gl⌢beLeetcode /gl⌢behackerRank
T echnical Skills
Languages: Java, JavaScript, TypeScript, Rust, Python, Go
Frameworks: Numpy, Pandas, Pytorch, Angular, React.js, Next.js, Tailwind CSS, Bootstrap, Tauri, Spring Boot, Express.js,
FastAPI, Sequel.js, PostgreSQL, MongoDB, DynamoDB, Elasticsearch.
Cloud & DevOps: AWS (API Gateway, Lambda, SES), GCP (Cloud Run, CloudSql, App Engine, Tasks), Git, Docker
T ools & Concepts:Micro Services, REST APIs, Machine learning, Networking, Deep learning.
Experience
Highperformr AI June 2025 – Oct 2025
Software Engineer (Full stack) Chennai, IN
• Implemented a website tracking system similar to RB2B, allowing users to register their domains, include or exclude
pages, and analyze de-anonymized visitor data to understand audiences and reﬁne sales strategies.
• Designed and delivered cutting-edge, conversion-focused web experiences using Web

In [None]:
with open("prasanna.txt", "w") as f:
    f.write(text)

In [11]:
import json
import re

# 1. Configuration: System Prompt and Question Mappings
SYSTEM_PROMPT = "You are Prasanna's AI Assistant. You answer questions about his professional background, projects, and skills."

# Map resume sections to potential user questions
SECTION_PROMPTS = {
    "HEADER": [
        "Who is Prasanna?",
        "Tell me about Prasanna.",
        "Give me a summary of this candidate."
    ],
    "SKILLS": [
        "What are Prasanna's technical skills?",
        "What programming languages does he know?",
        "List his technical stack."
    ],
    "EXPERIENCE": [
        "Describe Prasanna's work experience.",
        "What is his employment history?",
        "Where has he worked previously?"
    ],
    "PROJECTS": [
        "What projects has Prasanna worked on?",
        "Tell me about his portfolio.",
        "Describe his key projects."
    ],
    "EDUCATION": [
        "What is Prasanna's educational background?",
        "Where did he go to college?",
        "List his degrees."
    ]
}

def extract_sections(text):
    """
    Parses the raw text into a dictionary of sections based on common Resume Headers.
    Assumes headers are in UPPERCASE or Title Case on their own lines.
    """
    sections = {}
    
    # Common headers to look for (adjust based on your specific txt file layout)
    header_patterns = r"(SKILLS|EXPERIENCE|WORK HISTORY|PROJECTS|EDUCATION|ACHIEVEMENTS|CERTIFICATIONS)"
    
    # Split text by these headers
    # The regex captures the delimiter (header) so we can use it as a key
    parts = re.split(f"^{header_patterns}.*$", text, flags=re.MULTILINE | re.IGNORECASE)
    
    # The first part is usually the Name/Contact/Summary (Header)
    sections["HEADER"] = parts[0].strip()
    
    # Iterate through the rest of the parts
    # re.split returns [text_before, header1, text_after1, header2, text_after2...]
    for i in range(1, len(parts), 2):
        header = parts[i].strip().upper()
        content = parts[i+1].strip()
        if content:
            sections[header] = content
            
    return sections

def create_conversation_entry(user_query, assistant_response):
    """
    Helper to format a single training example in the requested JSON structure.
    """
    return [ 
        {"role" :"system" , "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_query},
        {"role": "assistant", "content": assistant_response}
    ]

def generate_dataset(file_path):
    """
    Main function to read the file and generate the dataset.
    """
    dataset = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    # Step 1: Parse the text
    sections = extract_sections(raw_text)

    # Step 2: Generate entries for each section
    for section_name, content in sections.items():
        # Clean up content (remove excessive newlines)
        clean_content = " ".join(content.split())
        
        # Match the found section to our prompt list
        # We try to match partial keys (e.g., "WORK EXPERIENCE" matches "EXPERIENCE")
        matched_key = next((key for key in SECTION_PROMPTS if key in section_name), None)
        
        if matched_key:
            # Generate multiple variations for better training
            for question in SECTION_PROMPTS[matched_key]:
                entry = create_conversation_entry(question, clean_content)
                dataset.append({ "messages" : entry})
        
        # Special case: If it is the HEADER, ensure we have the specific "Who is?" question
        if section_name == "HEADER":
             entry = create_conversation_entry("Who is Prasanna?", clean_content)
             dataset.append({ "messages" : entry})
 

    return dataset
 

In [12]:
# Replace with your actual file path
input_file = "prasanna.txt" 
output_file = "prasanna_dataset.json"

try:
    data = generate_dataset(input_file)
    
    # Write to JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
        
    print(f"Successfully created {len(data)} training examples.")
    print(f"Preview of first entry:\n{json.dumps(data[0], indent=2)}")
    
except FileNotFoundError:
    print("Error: prasanna.txt not found. Please ensure the file is in the directory.")

Successfully created 10 training examples.
Preview of first entry:
{
  "messages": [
    {
      "role": "system",
      "content": "You are Prasanna's AI Assistant. You answer questions about his professional background, projects, and skills."
    },
    {
      "role": "user",
      "content": "Who is Prasanna?"
    },
    {
      "role": "assistant",
      "content": "Prasanna Jagadesh \u2642phone91+ 6383022025 /envel\u2322peprasannnajaga9@gmail.com /linkedinlinkedin /githubgithub /gl\u2322bewebsite /gl\u2322beLeetcode /gl\u2322behackerRank Technical Skills: Languages: Java, JavaScript, TypeScript, Rust, Python, Go Frameworks: Numpy, Pandas, Pytorch, Angular, React.js, Next.js, Tailwind CSS, Bootstrap, Tauri, Spring Boot, Express.js, FastAPI, Sequel.js, PostgreSQL, MongoDB, DynamoDB, Elasticsearch. Cloud & DevOps: AWS (API Gateway, Lambda, SES), GCP (Cloud Run, CloudSql, App Engine, Tasks), Git, Docker Tools & Concepts:Micro Services, REST APIs, Machine learning, Networking, Deep le

In [15]:
import json
import re
import os

# SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_FILE = "data.txt"
OUTPUT_FILE = "prasanna_data.json"


def parse_data_file(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        raw = f.read()

    system_match = re.match(r"role:\s*system,\s*\ncontent:\s*(.+?)(?=\n\n)", raw, re.DOTALL)
    system_content = system_match.group(1).strip() if system_match else ""

    blocks = re.split(r"\n\d+\n", raw)
    blocks = blocks[1:]

    dataset = []

    for block in blocks:
        block = block.strip()
        if not block:
            continue

        pairs = re.findall(
            r"role:\s*(user|assistant),\s*\ncontent:\s*(.+?)(?=\nrole:|\Z)",
            block,
            re.DOTALL,
        )

        if len(pairs) < 2:
            continue

        user_content = pairs[0][1].strip()
        assistant_content = pairs[1][1].strip()

        entry = {
            "messages": [
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content},
            ]
        }
        dataset.append(entry)

    return dataset


def main():
    dataset = parse_data_file(INPUT_FILE)

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=4, ensure_ascii=False)

    print(f"Converted {len(dataset)} entries → {OUTPUT_FILE}")



main()


Converted 152 entries → prasanna_data.json
