In [None]:
import json
from tqdm import tqdm
import os

from google.colab import files

import firebase_admin
from firebase_admin import credentials, firestore

# Opens the Colab upload widget; upload the service-account JSON key here.
uploaded = files.upload()  # Upload your service account JSON here

# This path must match the name of the service-account file uploaded above.
cred = credentials.Certificate('ai-alumni-chatbot-36a8abc75502.json')

# initialize_app() raises ValueError when the default app already exists,
# which happens every time this cell is re-run in the same kernel. Reuse the
# existing app in that case so the cell stays re-runnable.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)

# Firestore client used by the cells below.
db = firestore.client()



Saving ai-alumni-chatbot-36a8abc75502.json to ai-alumni-chatbot-36a8abc75502.json


In [None]:
import re

# --- Smart Q&A Generator Function ---
def generate_smart_qa_pairs(profile):
    """Build question/answer pairs from one alumni profile.

    Args:
        profile: dict-like profile. The keys read are "Name", "Email",
            "Phone", "Location", "Skills", "WorkExperience" and "Education";
            every key is optional.

    Returns:
        list[dict]: one {"question": ..., "answer": ...} dict per generated
        pair. Always a list (possibly empty), never None.

    Notes:
        "WorkExperience" is read as a flat list in which a line containing
        "(YYYY-MM-DD - YYYY-MM-DD)" is followed by the company line.
        "Education" is read the same way: a school line carrying the date
        range, then the degree line, then one more line that is skipped
        (presumably a GPA line -- TODO confirm against the stored data).
    """
    qa_pairs = []

    # Captures the start and end year of a "(YYYY-MM-DD - YYYY-MM-DD)" range.
    date_range = re.compile(r"\((\d{4})-\d{2}-\d{2} - (\d{4})-\d{2}-\d{2}\)")

    name = profile.get("Name", "The alumni")
    email = profile.get("Email", None)
    phone = profile.get("Phone", None)
    location = profile.get("Location", None)
    skills = profile.get("Skills", [])
    work_experience = profile.get("WorkExperience", [])
    education = profile.get("Education", [])

    # --- From Email, Phone, Location ---
    if email:
        qa_pairs.append({
            "question": f"What is the email address of {name}?",
            "answer": email
        })

    if phone:
        qa_pairs.append({
            "question": f"What is the phone number of {name}?",
            "answer": phone
        })

    if location:
        qa_pairs.append({
            "question": f"Where is {name} located?",
            "answer": location
        })

    # --- From Skills ---
    if skills:
        skills_list = ", ".join(skills)
        qa_pairs.append({
            "question": f"What skills does {name} have?",
            "answer": skills_list
        })
        # One reverse-lookup question per individual skill.
        for skill in skills:
            qa_pairs.append({
                "question": f"Which alumni are skilled in {skill}?",
                "answer": name
            })

    # --- From Work Experience ---
    if work_experience:
        idx = 0
        while idx < len(work_experience):
            title_line = work_experience[idx]

            # A title line looks like "JobTitle (YYYY-MM-DD - YYYY-MM-DD)".
            match = date_range.search(title_line)
            if match:
                start_year = match.group(1)
                end_year = match.group(2)

                # The company is the line right after the title line.
                if idx + 1 < len(work_experience):
                    company = work_experience[idx + 1]

                    qa_pairs.append({
                        "question": f"Where did {name} work?",
                        "answer": f"{company} ({start_year} to {end_year})"
                    })
                    qa_pairs.append({
                        "question": f"Where did {name} work previously?",
                        "answer": f"{company} ({start_year} to {end_year})"
                    })
                    qa_pairs.append({
                        "question": f"In which years did {name} work at {company}?",
                        "answer": f"{start_year} to {end_year}"
                    })
                    qa_pairs.append({
                        "question": f"Who worked at {company} between {start_year} and {end_year}?",
                        "answer": name
                    })

                idx += 2  # Skip the title line and the company line
            else:
                idx += 1  # Not a title line; look at the next one

    # --- From Education (school, degree and years) ---
    if education:
        idx = 0
        while idx < len(education):
            school_line = education[idx]

            # A school line looks like "School (YYYY-MM-DD - YYYY-MM-DD)".
            match = date_range.search(school_line)
            if match:
                start_year = match.group(1)
                end_year = match.group(2)
                school_name = school_line.split('(')[0].strip()  # Text before '('

                # The degree name is the line right after the school line.
                if idx + 1 < len(education):
                    degree_name = education[idx + 1]

                    if school_name:
                        qa_pairs.append({
                            "question": f"Who studied at {school_name}?",
                            "answer": f"{name} ({start_year} - {end_year})"
                        })
                        qa_pairs.append({
                            "question": f"Where did {name} study?",
                            "answer": f"{school_name} ({start_year} - {end_year})"
                        })

                    if degree_name:
                        qa_pairs.append({
                            "question": f"Who completed {degree_name}?",
                            "answer": f"{name} ({start_year} - {end_year})"
                        })
                        qa_pairs.append({
                            "question": f"What degree did {name} complete?",
                            "answer": f"{degree_name}"
                        })

                # Skips school + degree + one trailing line.
                # NOTE(review): assumes every entry has a third (GPA) line;
                # confirm against the stored profile format.
                idx += 3
            else:
                idx += 1

    # Return once, after every section has run, so education pairs are not
    # dropped and a profile without work experience still yields a list.
    return qa_pairs


# --- Function to Create JSONL for Fine-tuning ---
def create_smart_fine_tuning_dataset(output_file):
    """Export Q&A pairs for every alumni profile as a chat-format JSONL file.

    Streams the 'alumni_profiles' collection from the module-level `db`
    client, turns each document into Q&A pairs with
    `generate_smart_qa_pairs`, and writes one JSON object per line shaped as
    {"messages": [{"role": "user", ...}, {"role": "assistant", ...}]}.

    Args:
        output_file: path of the JSONL file to write. An existing file is
            overwritten. The parent directory is created when the path has
            one.

    Returns:
        None. The result is the file on disk plus a confirmation message
        printed to stdout.
    """
    profiles = db.collection('alumni_profiles').stream()

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f_out:
        for profile in tqdm(profiles, desc="Generating Smart Q&A Pairs"):
            data = profile.to_dict()
            # Treat a None result as "no pairs" so one profile cannot abort
            # the whole export.
            qa_pairs = generate_smart_qa_pairs(data) or []

            for qa in qa_pairs:
                record = {
                    "messages": [
                        {"role": "user", "content": qa["question"]},
                        {"role": "assistant", "content": qa["answer"]}
                    ]
                }
                f_out.write(json.dumps(record) + "\n")  # Each Q&A as one line

    print(f"\n🎯 Smart fine-tuning dataset generated at: {output_file}")

# --- Main Execution ---
# Run the export: stream the profiles from Firestore and write the JSONL
# dataset to the path below.
output_path = "/content/alumni_smart_fine_tuning_dataset.jsonl"  # Save path
create_smart_fine_tuning_dataset(output_path)


Generating Smart Q&A Pairs: 3543it [00:03, 1073.80it/s]


🎯 Smart fine-tuning dataset generated at: /content/alumni_smart_fine_tuning_dataset.jsonl



