<a href="https://colab.research.google.com/github/Tar-ive/Dashboard/blob/main/brahman.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Check out the System Architecture docs here: https://www.overleaf.com/read/ffzgqryyrgzm#22fb04

In [None]:
import pandas as pd
import requests
import time
from tqdm import tqdm
from datetime import datetime
from typing import List, Dict, Optional
import json

# Configuration
# OpenAlex author ID that main() always includes, even when it is not among the top-cited results
TAHIR_EKIN_ID = "A5088154684"  # Tahir Ekin's researcher ID
# Sent as "mailto:<EMAIL>" in the User-Agent header of every OpenAlex request
EMAIL = "your_email@example.com"  # Replace with your actual email
# Number of top-cited researchers main() requests for the institution
MAX_RESEARCHERS = 50

def call_openalex_api(endpoint, params=None, timeout=30):
    """Send a GET request to an OpenAlex endpoint and return the decoded JSON.

    Args:
        endpoint: Path under ``https://api.openalex.org/`` (for example
            ``'works'`` or ``'authors/A123'``).
        params: Optional query parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        The parsed JSON body, or ``None`` when the request fails, times out,
        returns an HTTP error status, or the body cannot be decoded.
    """
    base_url = f"https://api.openalex.org/{endpoint}"
    # The contact address travels in the User-Agent header.
    headers = {'User-Agent': f'mailto:{EMAIL}'}

    try:
        response = requests.get(
            base_url, params=params, headers=headers, timeout=timeout
        )
        response.raise_for_status()
        time.sleep(0.2)  # Rate limiting
        return response.json()
    except Exception as e:
        # Best-effort boundary: callers treat None as "no data" and carry on.
        print(f"Error calling {endpoint} API: {str(e)}")
        return None

def get_texas_state_id():
    """Look up Texas State University and return the ID of the first match.

    Returns:
        The ``id`` field of the first institution returned by the search, or
        ``None`` when the call fails or yields no results.
    """
    payload = call_openalex_api(
        'institutions',
        {
            'filter': 'display_name.search:texas state university',
            'per-page': 1
        },
    )

    # Guard clauses: bail out on a failed call or an empty result list.
    if not payload or 'results' not in payload:
        return None
    matches = payload['results']
    if not matches:
        return None

    return matches[0]['id']

def fetch_top_researchers(institution_id, max_researchers=50):
    """Collect up to ``max_researchers`` authors of an institution, most cited first.

    Pages through the ``authors`` endpoint with cursor pagination and stops
    when enough authors were gathered, a page is empty or missing, or no
    further cursor is returned.

    Args:
        institution_id: Institution ID used in the ``last_known_institutions.id`` filter.
        max_researchers: Upper bound on the number of authors returned.

    Returns:
        A list of author records (at most ``max_researchers`` entries).
    """
    collected = []
    next_cursor = '*'

    while next_cursor and len(collected) < max_researchers:
        try:
            # Never request more than is still needed (capped at 100 per page).
            remaining = max_researchers - len(collected)
            page = call_openalex_api('authors', {
                'filter': f'last_known_institutions.id:{institution_id}',
                'per-page': min(100, remaining),
                'sort': 'cited_by_count:desc',
                'cursor': next_cursor
            })

            if not page or 'results' not in page:
                break

            batch = page['results']
            if not batch:
                break

            collected.extend(batch)

            if len(collected) >= max_researchers:
                collected = collected[:max_researchers]
                break

            # A missing cursor ends the loop through the while condition.
            next_cursor = page.get('meta', {}).get('next_cursor')

        except Exception as e:
            print(f"Error fetching researchers: {str(e)}")
            break

    return collected

def get_researcher_by_id(researcher_id):
    """Fetch one author record, accepting either a bare ID or a full ID URL.

    Returns:
        The response of the ``authors/<id>`` call, or ``None`` on any error.
    """
    try:
        # Keep only the last path segment when a URL-style ID is passed.
        if '/' in researcher_id:
            clean_id = researcher_id.split('/')[-1]
        else:
            clean_id = researcher_id
        return call_openalex_api(f'authors/{clean_id}')
    except Exception as e:
        print(f"Error fetching researcher {researcher_id}: {str(e)}")
        return None

def reconstruct_abstract(inverted_index: Dict) -> str:
    """Rebuild abstract text from a ``word -> [positions]`` inverted index.

    Args:
        inverted_index: Mapping of each word to the positions where it occurs.
            An empty or missing index yields an empty string.

    Returns:
        The words joined by single spaces in ascending position order.
    """
    if not inverted_index:
        return ""

    # Flatten to (position, word) pairs, one per occurrence.
    placed = [
        (position, token)
        for token, positions in inverted_index.items()
        for position in positions
    ]
    # Order by position only; the stable sort keeps ties in insertion order.
    placed.sort(key=lambda pair: pair[0])

    return " ".join(token for _, token in placed)

def get_researcher_works(researcher_id):
    """Download every work of one author via cursor pagination.

    Args:
        researcher_id: Bare author ID or full ID URL; only the last path
            segment is used in the ``author.id`` filter.

    Returns:
        A list of work records, most cited first. Paging stops at the first
        failed or empty page, so the list may be partial.
    """
    if '/' in researcher_id:
        clean_id = researcher_id.split('/')[-1]
    else:
        clean_id = researcher_id

    query = {
        'filter': f'author.id:{clean_id}',
        'per-page': 200,
        'sort': 'cited_by_count:desc'
    }

    collected = []
    next_cursor = '*'

    while next_cursor:
        try:
            page = call_openalex_api('works', {**query, 'cursor': next_cursor})

            if not page or 'results' not in page:
                break

            batch = page['results']
            if not batch:
                break

            collected.extend(batch)

            # A missing cursor ends the loop through the while condition.
            next_cursor = page.get('meta', {}).get('next_cursor')

        except Exception as e:
            print(f"Error fetching works for researcher {clean_id}: {str(e)}")
            break

    return collected

def extract_work_data(works: List[Dict], researcher_id: str, researcher_name: str) -> List[Dict]:
    """Flatten raw work records into one row dictionary per work.

    Args:
        works: Work records as returned by the works endpoint.
        researcher_id: ID stored on every produced row.
        researcher_name: Display name stored on every produced row.

    Returns:
        One dictionary per work with identifiers, title, reconstructed
        abstract, up to five topics (with scores) and concepts joined by
        ``'; '``, publication details, citation count and open-access flags.
    """

    def leading_dicts(items, limit=5):
        # First `limit` entries of a non-empty list, keeping only non-empty dicts.
        if not (items and isinstance(items, list)):
            return []
        return [item for item in items[:limit] if item and isinstance(item, dict)]

    rows = []

    for work in works:
        abstract = reconstruct_abstract(work.get('abstract_inverted_index', {}))

        # Topics and their scores stay aligned because both come from one list.
        top_topics = leading_dicts(work.get('topics', []))
        topic_names = [topic.get('display_name', '') for topic in top_topics]
        topic_scores = [topic.get('score', 0) for topic in top_topics]

        # Concepts are a separate classification from topics.
        concept_names = [
            concept.get('display_name', '')
            for concept in leading_dicts(work.get('concepts', []))
        ]

        # Venue name, when a primary location with a source is present.
        source_name = ''
        location = work.get('primary_location')
        if location and isinstance(location, dict):
            source = location.get('source')
            if source and isinstance(source, dict):
                source_name = source.get('display_name', '')

        # Open-access flags fall back to "not open access" when absent.
        open_access = work.get('open_access', {})
        if open_access and isinstance(open_access, dict):
            is_oa = open_access.get('is_oa', False)
            oa_status = open_access.get('oa_status', '')
        else:
            is_oa = False
            oa_status = ''

        rows.append({
            'researcher_id': researcher_id,
            'researcher_name': researcher_name,
            'work_id': work.get('id', ''),
            'title': work.get('display_name', ''),
            'abstract': abstract,
            'topics': '; '.join(topic_names),
            'topic_scores': '; '.join(str(score) for score in topic_scores),
            'concepts': '; '.join(concept_names),
            'publication_year': work.get('publication_year', None),
            'work_type': work.get('type', ''),
            'doi': work.get('doi', ''),
            'citations': work.get('cited_by_count', 0) or 0,
            'source_name': source_name,
            'is_open_access': is_oa,
            'oa_status': oa_status,
            'has_abstract': bool(abstract),
            'num_topics': len(topic_names)
        })

    return rows

def main():
    """Extract works data for the top Texas State University researchers.

    Steps: resolve the institution ID, fetch the ``MAX_RESEARCHERS`` most
    cited authors, make sure Tahir Ekin is included, download and flatten
    every author's works, save them to a CSV file and print summary
    statistics.

    Returns:
        The combined ``pandas.DataFrame`` of works, or ``None`` when the
        institution, the researchers or the works could not be retrieved.
    """

    print("IMPORTANT: Please update the EMAIL variable with your actual email address!")
    print(f"Current email: {EMAIL}")
    print("=" * 70)

    # Step 1: Get Texas State University ID
    print("Getting Texas State University ID...")
    texas_state_id = get_texas_state_id()
    if not texas_state_id:
        print("Could not find Texas State University ID")
        return

    print(f"Found Texas State University ID: {texas_state_id}")

    # Step 2: Fetch top researchers
    print(f"\nFetching top {MAX_RESEARCHERS} cited researchers from Texas State University...")
    researchers = fetch_top_researchers(texas_state_id, MAX_RESEARCHERS)

    if not researchers:
        print("Failed to fetch researchers")
        return

    print(f"Found {len(researchers)} researchers")

    # Step 3: Check if Tahir Ekin is in the list, if not add him
    # Compare the last path segment of each ID exactly; a substring test
    # could also match a different, longer ID that merely contains this one.
    tahir_ekin_included = any(
        str(r.get('id') or '').split('/')[-1] == TAHIR_EKIN_ID
        for r in researchers
    )

    if not tahir_ekin_included:
        print(f"\nTahir Ekin (ID: {TAHIR_EKIN_ID}) not in top {MAX_RESEARCHERS}, adding him...")
        tahir_ekin_data = get_researcher_by_id(TAHIR_EKIN_ID)
        if tahir_ekin_data:
            researchers.append(tahir_ekin_data)
            print("Tahir Ekin added successfully")
        else:
            print("Failed to fetch Tahir Ekin's data")
    else:
        print(f"Tahir Ekin is already in the top {MAX_RESEARCHERS} researchers")

    # Step 4: Process each researcher and get their works
    all_works_data = []

    print(f"\nProcessing {len(researchers)} researchers and their works...")

    for researcher in tqdm(researchers, desc="Processing researchers"):
        try:
            researcher_id = researcher['id']
            researcher_name = researcher['display_name']

            print(f"\nProcessing: {researcher_name} ({researcher_id})")

            # Get all works for this researcher
            works = get_researcher_works(researcher_id)

            if works:
                print(f"Found {len(works)} works for {researcher_name}")

                # Extract work data
                work_data = extract_work_data(works, researcher_id, researcher_name)
                all_works_data.extend(work_data)

                print(f"Extracted data for {len(work_data)} works")
            else:
                print(f"No works found for {researcher_name}")

        except Exception as e:
            # One failing researcher must not abort the whole extraction.
            print(f"Error processing researcher {researcher.get('display_name', 'Unknown')}: {str(e)}")
            continue

    # Step 5: Create DataFrame and save to CSV
    if all_works_data:
        print(f"\nCreating DataFrame with {len(all_works_data)} total works...")
        df = pd.DataFrame(all_works_data)

        # Save to CSV
        filename = f"texas_state_top_{MAX_RESEARCHERS}_researchers_works.csv"
        df.to_csv(filename, index=False)

        print(f"\nData saved to: {filename}")

        # Display summary statistics
        print("\n" + "="*50)
        print("SUMMARY STATISTICS")
        print("="*50)

        print(f"Total researchers processed: {df['researcher_id'].nunique()}")
        print(f"Total works: {len(df)}")
        print(f"Works with abstracts: {df['has_abstract'].sum()}")
        print(f"Works without abstracts: {(~df['has_abstract']).sum()}")
        # A work shared by several listed researchers has one row per
        # researcher, so its citations are counted once per row here.
        print(f"Total citations: {df['citations'].sum():,}")
        print(f"Average citations per work: {df['citations'].mean():.1f}")
        print(f"Open access works: {df['is_open_access'].sum()}")

        # Year distribution
        valid_years = df[df['publication_year'].notna() & (df['publication_year'] > 0)]
        if not valid_years.empty:
            print(f"Publication year range: {valid_years['publication_year'].min():.0f} - {valid_years['publication_year'].max():.0f}")

        # Top researchers by total works
        print("\nTop 10 researchers by number of works:")
        researcher_counts = df['researcher_name'].value_counts().head(10)
        for name, count in researcher_counts.items():
            print(f"  {name}: {count} works")

        # Check if Tahir Ekin is included
        # Select his rows by ID; a name match on "Tahir" would also count
        # any other researcher with that name.
        author_keys = df['researcher_id'].astype(str).str.split('/').str[-1]
        tahir_works = df[author_keys == TAHIR_EKIN_ID]
        if not tahir_works.empty:
            print(f"\nTahir Ekin's works: {len(tahir_works)}")

        print(f"\nDataFrame shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")

        return df

    else:
        print("No works data extracted")
        return None

# Run the extraction
if __name__ == "__main__":
    print("Texas State University Researchers Works Extractor")
    print("=" * 50)
    print("This script will:")
    print(f"1. Get top {MAX_RESEARCHERS} cited researchers from Texas State University")
    # Use the configured limit so the banner stays correct when it changes.
    print(f"2. Include Tahir Ekin even if not in top {MAX_RESEARCHERS}")
    print("3. Extract all works data for each researcher")
    print("4. Save combined data to CSV file")
    print("=" * 50)

    # main() returns the works DataFrame, or None when extraction failed.
    df = main()

    if df is not None:
        print("\n✅ Success! Data extracted and saved.")
        print(f"📊 {len(df)} total works from {df['researcher_id'].nunique()} researchers")
    else:
        print("\n❌ Failed to extract data")

In [None]:
# Show the column names of the DataFrame assigned to `df` by the extraction cell above
df.columns

This CSV file contains comprehensive data about every published work (research papers, articles, etc.) by the top 50 most-cited researchers at Texas State University, plus Tahir Ekin's works. Each row represents a single publication and includes the work's title, reconstructed abstract, research topics with scores, publication year, citation count, source journal/venue, and open access status, along with the name and ID of the researcher who authored it.

Shape: (9086, 17)

Index(['researcher_id', 'researcher_name', 'work_id', 'title', 'abstract',
       'topics', 'topic_scores', 'concepts', 'publication_year', 'work_type',
       'doi', 'citations', 'source_name', 'is_open_access', 'oa_status',
       'has_abstract', 'num_topics'],
      dtype='object') -> columns

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import joblib
import json
from tqdm import tqdm
from datetime import datetime
from google.colab import drive
import os

# Mount Google Drive
# The datastore's default drive_path below lives under this mount point (/content/drive)
drive.mount('/content/drive')

class ResearcherProfileDatastore:
    """
    A class to create and manage researcher profile embeddings and metadata.

    Storage Structure:
    ├── researcher_profiles_metadata.parquet     # Main metadata with recency weights
    ├── researcher_embeddings.npy               # All embeddings as numpy array (N x 384)
    ├── embedding_index.json                    # Maps work_id -> array position
    ├── researcher_index.json                   # Maps researcher_id -> list of work_ids
    └── datastore_info.json                     # Metadata about the datastore
    """

    # Model loaded by load_model() when no other name is given.
    DEFAULT_MODEL_NAME = 'all-MiniLM-L6-v2'

    def __init__(self, drive_path="/content/drive/My Drive/datastore"):
        self.drive_path = drive_path
        self.model = None
        # Name of the model passed to load_model(); recorded in datastore_info.
        self.model_name = None
        self.current_year = datetime.now().year

    def load_model(self, model_name='all-MiniLM-L6-v2'):
        """
        Load the sentence transformer model.
        Using 'all-MiniLM-L6-v2': 384 dimensions, good balance of speed and quality.

        Args:
            model_name (str): Name passed to SentenceTransformer

        Returns:
            The loaded model (also stored on ``self.model``)
        """
        print(f"Loading sentence transformer model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.model_name = model_name
        print(f"Model loaded. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        return self.model

    def calculate_recency_weight(self, publication_year):
        """
        Calculate recency weight:
        Wt = min(1, max(0, 1 - (CurrentYear - PublicationYear) / 10))

        Args:
            publication_year (int): Year of publication

        Returns:
            float: Recency weight between 0 and 1 (0.0 for missing or zero years)
        """
        if pd.isna(publication_year) or publication_year == 0:
            return 0.0

        weight = 1 - (self.current_year - publication_year) / 10
        # Clamp on both sides: a publication year later than the current
        # year would otherwise produce a weight above 1.
        weight = min(1.0, max(0.0, weight))
        return round(weight, 4)

    def create_text_for_embedding(self, title, abstract):
        """
        Combine title and abstract for embedding.

        Args:
            title (str): Paper title
            abstract (str): Paper abstract

        Returns:
            str: Combined text for embedding
        """
        title = str(title) if pd.notna(title) else ""
        abstract = str(abstract) if pd.notna(abstract) else ""

        # Combine title and abstract with separator
        if abstract:
            return f"{title}. {abstract}"
        else:
            return title

    def process_papers(self, csv_file_path):
        """
        Process all papers from CSV file to create embeddings and metadata.

        Args:
            csv_file_path (str): Path to the CSV file with researcher works

        Returns:
            tuple: (metadata_df, embeddings_array, embedding_index, researcher_index)

        Raises:
            ValueError: If no paper has any text to embed
        """
        print("Loading data from CSV...")
        df = pd.read_csv(csv_file_path)
        print(f"Loaded {len(df)} papers from {df['researcher_id'].nunique()} researchers")

        # Calculate recency weights
        print("Calculating recency weights...")
        df['recency_weight'] = df['publication_year'].apply(self.calculate_recency_weight)

        # Prepare text for embeddings
        print("Preparing text for embeddings...")
        df['embedding_text'] = df.apply(
            lambda row: self.create_text_for_embedding(row['title'], row['abstract']),
            axis=1
        )

        # Filter out papers with no text
        valid_papers = df[df['embedding_text'].str.len() > 0].copy()
        print(f"Processing {len(valid_papers)} papers with valid text")

        if len(valid_papers) == 0:
            raise ValueError("No papers with valid text found!")

        # Load model if not already loaded
        if self.model is None:
            self.load_model()

        # Generate embeddings
        print("Generating embeddings...")
        texts = valid_papers['embedding_text'].tolist()

        # Process in batches to avoid memory issues
        batch_size = 100
        all_embeddings = []

        for i in tqdm(range(0, len(texts), batch_size), desc="Creating embeddings"):
            batch_texts = texts[i:i + batch_size]
            batch_embeddings = self.model.encode(batch_texts, show_progress_bar=False)
            all_embeddings.append(batch_embeddings)

        # Combine all embeddings
        embeddings_array = np.vstack(all_embeddings)
        print(f"Generated embeddings shape: {embeddings_array.shape}")

        # Create embedding index (work_id -> array position)
        embedding_index = {}
        researcher_index = {}

        # Row order of valid_papers is the row order of embeddings_array.
        # When a work_id occurs in several rows, the index keeps the last one.
        for position, (work_id, researcher_id) in enumerate(
            zip(valid_papers['work_id'], valid_papers['researcher_id'])
        ):
            # Map work_id to embedding position
            embedding_index[work_id] = position

            # Group by researcher_id
            researcher_index.setdefault(researcher_id, []).append(work_id)

        print(f"Created embedding index for {len(embedding_index)} papers")
        print(f"Created researcher index for {len(researcher_index)} researchers")

        return valid_papers, embeddings_array, embedding_index, researcher_index

    def save_datastore(self, metadata_df, embeddings_array, embedding_index, researcher_index):
        """
        Save all components of the researcher profile datastore.

        Args:
            metadata_df (pd.DataFrame): Paper metadata with recency weights
            embeddings_array (np.ndarray): All embeddings
            embedding_index (dict): work_id -> array position mapping
            researcher_index (dict): researcher_id -> list of work_ids mapping

        Returns:
            dict: The datastore info that was written to datastore_info.json
        """
        print("Saving researcher profile datastore...")

        # Ensure the datastore directory exists
        os.makedirs(self.drive_path, exist_ok=True)

        # Save metadata as Parquet (much faster than CSV)
        metadata_path = os.path.join(self.drive_path, "researcher_profiles_metadata.parquet")
        metadata_df.to_parquet(metadata_path, index=False)
        print(f"✅ Saved metadata: {metadata_path}")

        # Save embeddings as numpy array
        embeddings_path = os.path.join(self.drive_path, "researcher_embeddings.npy")
        np.save(embeddings_path, embeddings_array)
        print(f"✅ Saved embeddings: {embeddings_path}")

        # Save embedding index
        embedding_index_path = os.path.join(self.drive_path, "embedding_index.json")
        with open(embedding_index_path, 'w') as f:
            json.dump(embedding_index, f, indent=2)
        print(f"✅ Saved embedding index: {embedding_index_path}")

        # Save researcher index
        researcher_index_path = os.path.join(self.drive_path, "researcher_index.json")
        with open(researcher_index_path, 'w') as f:
            json.dump(researcher_index, f, indent=2)
        print(f"✅ Saved researcher index: {researcher_index_path}")

        # Record the model that load_model() was actually given; fall back to
        # the default name when the model was attached some other way.
        model_used = getattr(self, "model_name", None) or self.DEFAULT_MODEL_NAME

        # Save datastore info
        datastore_info = {
            "created_at": datetime.now().isoformat(),
            "total_papers": len(metadata_df),
            "total_researchers": metadata_df['researcher_id'].nunique(),
            "embedding_dimensions": embeddings_array.shape[1],
            "model_used": model_used,
            "current_year_for_recency": self.current_year,
            "files": {
                "metadata": "researcher_profiles_metadata.parquet",
                "embeddings": "researcher_embeddings.npy",
                "embedding_index": "embedding_index.json",
                "researcher_index": "researcher_index.json"
            },
            "usage_instructions": {
                "load_metadata": f"pd.read_parquet('{self.drive_path}/researcher_profiles_metadata.parquet')",
                "load_embeddings": f"np.load('{self.drive_path}/researcher_embeddings.npy')",
                "load_indices": f"json.load(open('{self.drive_path}/embedding_index.json'))",
                "get_embedding_by_work_id": "embeddings[embedding_index[work_id]]",
                "get_researcher_papers": "researcher_index[researcher_id]"
            }
        }

        info_path = os.path.join(self.drive_path, "datastore_info.json")
        with open(info_path, 'w') as f:
            json.dump(datastore_info, f, indent=2)
        print(f"✅ Saved datastore info: {info_path}")

        return datastore_info

    def load_datastore(self):
        """
        Load the complete datastore for future use.

        Returns:
            tuple: (metadata_df, embeddings_array, embedding_index, researcher_index, datastore_info)
        """
        print("Loading researcher profile datastore...")

        # Load metadata
        metadata_path = os.path.join(self.drive_path, "researcher_profiles_metadata.parquet")
        metadata_df = pd.read_parquet(metadata_path)

        # Load embeddings
        embeddings_path = os.path.join(self.drive_path, "researcher_embeddings.npy")
        embeddings_array = np.load(embeddings_path)

        # Load indices
        embedding_index_path = os.path.join(self.drive_path, "embedding_index.json")
        with open(embedding_index_path, 'r') as f:
            embedding_index = json.load(f)

        researcher_index_path = os.path.join(self.drive_path, "researcher_index.json")
        with open(researcher_index_path, 'r') as f:
            researcher_index = json.load(f)

        # Load info
        info_path = os.path.join(self.drive_path, "datastore_info.json")
        with open(info_path, 'r') as f:
            datastore_info = json.load(f)

        print(f"✅ Loaded datastore with {len(metadata_df)} papers and {embeddings_array.shape} embeddings")

        return metadata_df, embeddings_array, embedding_index, researcher_index, datastore_info

    def get_researcher_embeddings(self, researcher_id, embedding_index, researcher_index, embeddings_array):
        """
        Get all embeddings for a specific researcher.

        Args:
            researcher_id (str): The researcher ID
            embedding_index (dict): work_id -> position mapping
            researcher_index (dict): researcher_id -> work_ids mapping
            embeddings_array (np.ndarray): All embeddings

        Returns:
            np.ndarray: Embeddings for the researcher's papers (an empty
            array when the researcher is unknown)
        """
        if researcher_id not in researcher_index:
            return np.array([])

        work_ids = researcher_index[researcher_id]
        # Work IDs without an index entry are skipped rather than raising.
        positions = [embedding_index[work_id] for work_id in work_ids if work_id in embedding_index]

        return embeddings_array[positions]

    def display_summary(self, metadata_df, datastore_info):
        """Display summary statistics of the created datastore."""
        print("\n" + "="*60)
        print("RESEARCHER PROFILE DATASTORE SUMMARY")
        print("="*60)

        print(f"📊 Total papers: {len(metadata_df):,}")
        print(f"👥 Total researchers: {metadata_df['researcher_id'].nunique()}")
        print(f"🧠 Embedding dimensions: {datastore_info['embedding_dimensions']}")
        print(f"📅 Current year (for recency): {datastore_info['current_year_for_recency']}")

        print(f"\n📝 Papers with abstracts: {metadata_df['has_abstract'].sum():,}")
        print(f"📄 Papers without abstracts: {(~metadata_df['has_abstract']).sum():,}")

        print(f"\n⚡ Avg recency weight: {metadata_df['recency_weight'].mean():.3f}")
        print(f"📊 Recency weight distribution:")
        print(f"   High (>0.8): {(metadata_df['recency_weight'] > 0.8).sum():,} papers")
        print(f"   Medium (0.4-0.8): {((metadata_df['recency_weight'] > 0.4) & (metadata_df['recency_weight'] <= 0.8)).sum():,} papers")
        print(f"   Low (0-0.4): {(metadata_df['recency_weight'] <= 0.4).sum():,} papers")

        # Top researchers by number of papers
        top_researchers = metadata_df['researcher_name'].value_counts().head(5)
        print(f"\n🔬 Top 5 researchers by paper count:")
        for name, count in top_researchers.items():
            print(f"   {name}: {count} papers")

def main(csv_file_path="/content/texas_state_top_50_researchers_works.csv"):
    """Create the researcher profile datastore from a works CSV file.

    Processes the CSV into metadata and embeddings, saves every datastore
    file and prints a summary.

    NOTE: the extraction cell earlier in this notebook also defines a
    function named ``main``; running this cell rebinds that name.

    Args:
        csv_file_path (str): Path to the CSV of researcher works. The default
            is the previously hard-coded Colab path.

    Returns:
        dict | None: The datastore info written by ``save_datastore``, or
        ``None`` when any step raised an exception.
    """

    # Initialize the datastore
    datastore = ResearcherProfileDatastore()

    print("🚀 Creating Researcher Profile Datastore")
    print("="*50)

    try:
        # Process papers and create embeddings
        metadata_df, embeddings_array, embedding_index, researcher_index = datastore.process_papers(csv_file_path)

        # Save everything
        datastore_info = datastore.save_datastore(metadata_df, embeddings_array, embedding_index, researcher_index)

        # Display summary
        datastore.display_summary(metadata_df, datastore_info)

        print(f"\n✅ SUCCESS! Researcher Profile Datastore created.")
        print(f"📁 Files saved to: {datastore.drive_path}")

        return datastore_info

    except Exception as e:
        # Top-level boundary: report the failure and signal it with None.
        print(f"❌ ERROR: {str(e)}")
        return None

# Example usage functions
def example_usage():
    """Print example code showing how to use the datastore after it is created.

    The examples are only printed, never executed. They select a
    researcher's rows by ``researcher_id`` so the recency weights line up
    one-to-one with the researcher's embeddings, and they fall back to a
    plain mean when every weight is zero, because ``np.average`` raises
    when the weights sum to zero.
    """

    print("\n" + "="*60)
    print("EXAMPLE: HOW TO USE THE DATASTORE")
    print("="*60)

    code_examples = """
# 1. Load the complete datastore
datastore = ResearcherProfileDatastore()
metadata_df, embeddings, embedding_index, researcher_index, info = datastore.load_datastore()

# 2. Get embedding for a specific paper
work_id = "https://openalex.org/W1234567890"
if work_id in embedding_index:
    paper_embedding = embeddings[embedding_index[work_id]]
    print(f"Embedding shape: {paper_embedding.shape}")

# 3. Get all papers for a researcher
researcher_id = "https://openalex.org/A5088154684"  # Tahir Ekin
if researcher_id in researcher_index:
    # Filter by researcher_id: a work_id can also appear in a co-author's rows
    researcher_papers = metadata_df[metadata_df['researcher_id'] == researcher_id]
    print(f"Researcher has {len(researcher_papers)} papers")

# 4. Get embeddings for a researcher's papers
researcher_embeddings = datastore.get_researcher_embeddings(
    researcher_id, embedding_index, researcher_index, embeddings
)
print(f"Researcher embeddings shape: {researcher_embeddings.shape}")

# 5. Find papers with high recency weights
recent_papers = metadata_df[metadata_df['recency_weight'] > 0.8]
print(f"Found {len(recent_papers)} recent papers")

# 6. Calculate weighted average embedding for a researcher
if len(researcher_embeddings) > 0:
    weights = researcher_papers['recency_weight'].values
    if weights.sum() > 0:
        weighted_avg = np.average(researcher_embeddings, axis=0, weights=weights)
    else:
        # Every paper has zero recency weight: use an unweighted mean instead
        weighted_avg = researcher_embeddings.mean(axis=0)
    print(f"Weighted average embedding shape: {weighted_avg.shape}")
"""

    print(code_examples)

if __name__ == "__main__":
    # Create the datastore
    # main() returns the datastore info dict on success and None on failure
    result = main()

    if result:
        # Show usage examples
        # Only printed after a successful build, since they rely on the saved files
        example_usage()

Implementation Summary:
Why: We chose a file-based Google Drive approach for zero setup complexity, allowing immediate MVP development without cloud infrastructure, billing, or authentication overhead.

How: Stores paper metadata in Parquet format, 384-dimensional embeddings as NumPy arrays, and creates JSON indices mapping work_id→array_position and researcher_id→work_ids for fast lookups.

Cons: Hits memory/performance walls around 10k-50k papers due to loading entire embedding arrays into RAM, lacks similarity search optimization, and has no concurrent access or query filtering capabilities.

Other Issues: Google Drive has 15GB storage limits and slow transfer speeds, JSON parsing becomes expensive with large indices, no automatic backup/versioning, and the monolithic file structure makes partial updates impossible.

Migration Path: We will need to move to a proper vector database (Pinecone/Weaviate) or PostgreSQL with the pgvector extension when scaling beyond the prototype phase.

In [None]:
# prompt: create a new folder in Drive called datastore, store all these files inside it, and print the new paths.
# ✅ Saved metadata: /content/drive/My Drive/researcher_profiles_metadata.parquet
# ✅ Saved embeddings: /content/drive/My Drive/researcher_embeddings.npy
# ✅ Saved embedding index: /content/drive/My Drive/embedding_index.json
# ✅ Saved researcher index: /content/drive/My Drive/researcher_index.json
# ✅ Saved datastore info: /content/drive/My Drive/datastore_info.json

# Make sure the destination folder exists before anything is moved
import os
new_folder_path = "/content/drive/My Drive/datastore"
if not os.path.exists(new_folder_path):
    os.makedirs(new_folder_path)
    print(f"Created new folder: {new_folder_path}")

# Files that were previously written straight into the Drive root
drive_root = "/content/drive/My Drive"
old_paths = [
    os.path.join(drive_root, name)
    for name in (
        "researcher_profiles_metadata.parquet",
        "researcher_embeddings.npy",
        "embedding_index.json",
        "researcher_index.json",
        "datastore_info.json",
    )
]

new_paths = []

# Relocate each file into the datastore folder
for old_path in old_paths:
    filename = os.path.basename(old_path)
    new_path = os.path.join(new_folder_path, filename)

    if not os.path.exists(old_path):
        print(f"File not found, cannot move: {old_path}")
    else:
        os.rename(old_path, new_path)
        print(f"Moved '{old_path}' to '{new_path}'")

    # The intended destination is recorded whether or not the file was
    # moved, so the list always mirrors old_paths.
    new_paths.append(new_path)


# Report the destination paths
print("\nNew paths of the moved files:")
for path in new_paths:
    print(path)




In [None]:
!pip install fitz

In [None]:
# Uninstall fitz to ensure a clean install of PyMuPDF
# !pip uninstall -y fitz

# Install PyMuPDF, which is the recommended package
!pip install PyMuPDF

In [None]:
!pip install anthropic

# V1

In [None]:
# ==============================================================================
# 1. IMPORT LIBRARIES AND DEFINE DATA STRUCTURE
# ==============================================================================
import fitz  # PyMuPDF
import pandas as pd
import json
import re
from datetime import datetime
from typing import List, Dict, Tuple, Optional
from google.colab import drive, userdata
from transformers import pipeline
import anthropic
from dataclasses import dataclass, asdict
import warnings
import os

warnings.filterwarnings('ignore')

@dataclass
class StructuredSolicitationObject:
    """
    Result record for one processed solicitation.

    Holds the document metadata, the skills produced by each extraction
    path, and the merged checklist built from them.
    """
    # --- Document metadata ---
    solicitation_id: str
    title: str
    abstract: str
    processed_at: str
    pdf_filename: str

    # --- Per-path extraction output ---
    narrative_skills: List[str]  # Path A: phrases parsed from the Claude reply
    formal_topics: List[Dict]    # Path B: topic/score dicts from the OpenAlex classifier

    # --- Merged output of both paths ---
    required_skills_checklist: List[str]

    # --- Bookkeeping ---
    text_length: int
    processing_method: str = "hybrid_deconstruction"

    def to_dict(self):
        """Return the record as a plain dict (recursively, via ``asdict``)."""
        return asdict(self)

    def to_json(self, filepath: str):
        """Write the record to ``filepath`` as indented UTF-8 JSON."""
        with open(filepath, 'w', encoding='utf-8') as out_file:
            json.dump(self.to_dict(), out_file, indent=2, ensure_ascii=False)

# ==============================================================================
# 2. PDF SOLICITATION PROCESSOR CLASS
# ==============================================================================
class PDFSolicitationProcessor:
    """
    Processes PDF solicitations from a file path to extract required skills.
    """

    def __init__(self):
        self.claude_client = None
        self.topic_classifier = None
        self.setup_models()

    def setup_models(self):
        """Initialize Claude API client and OpenAlex topic classifier."""
        print("Setting up models...")
        try:
            api_key = userdata.get('ANTHROPIC_API_KEY')
            self.claude_client = anthropic.Anthropic(api_key=api_key)
            print("✅ Claude API client initialized")
        except Exception as e:
            print(f"⚠️ Claude API setup failed: {e}")

        try:
            print("Loading OpenAlex topic classifier...")
            self.topic_classifier = pipeline(
                "text-classification",
                model="OpenAlex/bert-base-multilingual-cased-finetuned-openalex-topic-classification-title-abstract"
            )
            print("✅ OpenAlex topic classifier loaded")
        except Exception as e:
            print(f"⚠️ Topic classifier setup failed: {e}")

    def _extract_text_from_pdf(self, filepath: str) -> Tuple[str, str, str]:
        """Extracts text content from a PDF given a file path."""
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"The file was not found at: {filepath}")

        filename = os.path.basename(filepath)
        print(f"📄 Processing: {filename}")

        try:
            doc = fitz.open(filepath)
            full_text = "".join([page.get_text() for page in doc])
            doc.close()

            if not full_text.strip():
                 raise ValueError("Extracted text is empty. The PDF might be an image.")

            title, abstract = self._extract_title_and_abstract(full_text, filename)
            print(f"✅ Extracted {len(full_text)} characters from PDF.")
            return filename, title, abstract
        except Exception as e:
            print(f"❌ Error extracting text from PDF '{filename}': {e}")
            raise

    def _extract_title_and_abstract(self, full_text: str, filename: str) -> Tuple[str, str]:
        """Extract title and abstract from full text using heuristics."""
        lines = [line.strip() for line in full_text.split('\n') if line.strip()]
        title = filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
        for line in lines[:15]: # Check more lines for title
            if 20 < len(line) < 250 and not line.isupper(): # Avoid all-caps headers
                title = line
                break

        abstract = ""
        abstract_started = False
        for line in lines:
            line_lower = line.lower()
            if not abstract_started and any(marker in line_lower for marker in ['abstract', 'summary', 'overview']):
                abstract_started = True
                if len(line) > len('abstract') + 10: abstract += line.split(maxsplit=1)[1]
                continue
            if abstract_started:
                abstract += " " + line
                if len(abstract) > 1500 or any(marker in line_lower for marker in ['introduction', 'background']):
                    break
        if not abstract: abstract = ' '.join(lines[:10]) # Fallback
        return title.strip(), abstract.strip()[:2000] # Increased limit

    def extract_narrative_skills_claude(self, text: str) -> List[str]:
        """Path A: Extract narrative skills using Claude API."""
        if not self.claude_client:
            print("⚠️ Claude API not available, skipping narrative skills.")
            return []

        prompt = f"""As an expert research program analyst, identify the 5-7 most critical and distinct areas of expertise required by this research solicitation. Focus on specific technical skills, domain knowledge, and methodological expertise.

Solicitation text:
---
{text}
---

Provide your response as a numbered list of distinct expertise areas. Each item should be a concise phrase.
"""
        try:
            print("🤖 Calling Claude API for narrative skills...")
            response = self.claude_client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=1000,
                temperature=0.2,
                messages=[{"role": "user", "content": prompt}]
            )
            skills = self._parse_claude_response(response.content[0].text)
            print(f"✅ Extracted {len(skills)} narrative skills from Claude.")
            return skills
        except Exception as e:
            print(f"⚠️ Claude API call failed: {e}")
            return []

    def _parse_claude_response(self, response_text: str) -> List[str]:
        """Parse Claude's response to extract a list of skills."""
        skills = []
        for line in response_text.split('\n'):
            line = line.strip()
            if re.match(r'^\d+\.\s*', line):
                skill = re.sub(r'^\d+\.\s*', '', line)
                skills.append(skill.strip())
        return skills[:7]

    def extract_formal_topics_openalex(self, title: str, abstract: str) -> List[Dict]:
      """
      Path B: Extract formal topics using OpenAlex classifier with corrected data structure handling.
      """
      if not self.topic_classifier:
          print("⚠️ Topic classifier not available, skipping formal topics.")
          return []

      formatted_text = f"<TITLE> {title}\n<ABSTRACT> {abstract}"
      print("🔬 Running OpenAlex topic classification...")

      try:
          # Get predictions from the model. The output is a simple list of dicts.
          predictions = self.topic_classifier(formatted_text, top_k=10, truncation=True)
          # print(f"   [DEBUG] Raw output from OpenAlex model: {predictions}") # You can remove this now

          if not predictions:
              print("   OpenAlex model returned no valid predictions.")
              return []

          # --- CORRECTED LOOP ---
          # We iterate directly over 'predictions', which is the list of dictionaries.
          formal_topics = []
          for topic in predictions:
              # Check if the item is a dictionary with the keys we need
              if isinstance(topic, dict) and 'label' in topic and 'score' in topic:
                  # We can now lower the threshold since we see the scores are generally low
                  if topic['score'] > 0.01: # Lowered threshold to include the results
                      formal_topics.append({
                          'topic': topic['label'],
                          'score': round(topic['score'], 4)
                      })
              else:
                  print(f"   ⚠️ Skipping unexpected item in model predictions: {topic}")

          print(f"✅ Extracted {len(formal_topics)} formal topics from OpenAlex.")
          return formal_topics

      except Exception as e:
          print(f"⚠️ An exception occurred during topic classification: {e}")
          return []

    def fusion_logic(self, narrative_skills: List[str], formal_topics: List[Dict]) -> List[str]:
        """Combine narrative skills and formal topics, removing duplicates."""
        print("🔄 Applying fusion logic...")
        combined_skills = list(narrative_skills)
        narrative_lower = ' '.join(narrative_skills).lower()

        for topic in formal_topics:
            topic_name = topic['topic'].split(': ', 1)[-1] # Remove ID like "123: "
            is_duplicate = topic_name.lower() in narrative_lower
            if not is_duplicate:
                combined_skills.append(f"Expertise in {topic_name}")

        print(f"✅ Created final checklist with {len(combined_skills)} skills.")
        return combined_skills

    def process_solicitation(self, pdf_filepath: str) -> Optional[StructuredSolicitationObject]:
        """Main processing pipeline for a PDF solicitation from a given path."""
        print("🚀 Starting PDF Solicitation Processing Pipeline")
        print("=" * 60)
        try:
            filename, title, abstract = self._extract_text_from_pdf(pdf_filepath)

            # Input for Claude can be a simple combination
            claude_input_text = f"Title: {title}. Abstract: {abstract}"
            print(f"\n📊 Text stats for analysis: {len(claude_input_text)} characters.")

            # Path A: Claude
            narrative_skills = self.extract_narrative_skills_claude(claude_input_text)

            # Path B: OpenAlex (uses corrected function call)
            formal_topics = self.extract_formal_topics_openalex(title, abstract)

            # Path C: Fusion
            required_skills_checklist = self.fusion_logic(narrative_skills, formal_topics)

            solicitation_obj = StructuredSolicitationObject(
                solicitation_id=f"SOL_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
                title=title, abstract=abstract,
                processed_at=datetime.now().isoformat(),
                pdf_filename=filename,
                narrative_skills=narrative_skills,
                formal_topics=formal_topics,
                required_skills_checklist=required_skills_checklist,
                text_length=len(claude_input_text))

            output_filename = f"{filename.replace('.pdf', '')}_analysis.json"
            solicitation_obj.to_json(output_filename)
            print(f"\n✅ Processing complete! Saved to: {output_filename}")
            return solicitation_obj
        except Exception as e:
            print(f"❌ A fatal error occurred during processing: {e}")
            return None

    def display_results(self, solicitation_obj: Optional[StructuredSolicitationObject]):
        """Display processing results in a readable format."""
        if not solicitation_obj:
            print("\nNo results to display due to a processing error.")
            return
        print("\n" + "="*60)
        print("📋 SOLICITATION PROCESSING RESULTS")
        print("="*60)
        print(f"🆔 ID: {solicitation_obj.solicitation_id}")
        print(f"📄 File: {solicitation_obj.pdf_filename}")
        print(f"📝 Title: {solicitation_obj.title}")
        print(f"\n🤖 Path A - Narrative Skills (Claude):")
        for i, skill in enumerate(solicitation_obj.narrative_skills, 1): print(f"   {i}. {skill}")
        print(f"\n🔬 Path B - Formal Topics (OpenAlex):")
        for i, topic in enumerate(solicitation_obj.formal_topics, 1): print(f"   {i}. {topic['topic']} (Score: {topic['score']:.3f})")
        print(f"\n✅ Final Hybrid Skills Checklist:")
        for i, skill in enumerate(solicitation_obj.required_skills_checklist, 1): print(f"   {i}. {skill}")
        print("\n" + "="*60)


# ==============================================================================
# 3. MAIN EXECUTION
# ==============================================================================
def main():
    """Process the configured solicitation PDF and print the results.

    Any exception raised while building the processor or running it is
    reported on stdout instead of being propagated.
    """
    print("📄 PDF Solicitation Processor - Hybrid Deconstruction Engine")
    print("=" * 60)

    # --- ⚠️ IMPORTANT ⚠️ ---
    # Point this at the solicitation PDF inside your own Google Drive.
    # Alternative input used previously:
    # "/content/drive/My Drive/datastore/NSF 24-569_ Mathematical Foundations of Artificial Intelligence (MFAI) _ NSF - National Science Foundation.pdf"
    solicitation_pdf = "/content/drive/MyDrive/datastore/NSF 25-530: Collaborations in Artificial Intelligence and Geosciences (CAIG) | NSF - National Science Foundation.pdf"
    # ---

    try:
        runner = PDFSolicitationProcessor()
        outcome = runner.process_solicitation(solicitation_pdf)
        runner.display_results(outcome)
    except Exception as err:
        print(f"❌ An error occurred in the main function: {err}")

# Entry point: mount Google Drive first, then run the solicitation pipeline.
# Any failure in either step is reported on stdout rather than raised.
if __name__ == "__main__":
    try:
        print("Mounting Google Drive...")
        drive.mount('/content/drive', force_remount=False) # force_remount=False: a fresh remount is NOT forced here
        print("✅ Google Drive mounted successfully.")
        main()
    except Exception as e:
        print(f"A critical error occurred: {e}")

In [None]:
# Re-run the pipeline at notebook scope so that `result_obj` is available to
# the cells below.
# PDF_FILE_PATH = "/content/drive/My Drive/datastore/NSF 24-569_ Mathematical Foundations of Artificial Intelligence (MFAI) _ NSF - National Science Foundation.pdf"
PDF_FILE_PATH = "/content/drive/MyDrive/datastore/NSF 25-530: Collaborations in Artificial Intelligence and Geosciences (CAIG) | NSF - National Science Foundation.pdf"
processor = PDFSolicitationProcessor()
result_obj = processor.process_solicitation(PDF_FILE_PATH)
processor.display_results(result_obj)


import pandas as pd

if result_obj is None:
    # process_solicitation() returns None when any step fails; calling
    # .to_dict() on it would raise AttributeError, so fall back to empty
    # placeholders that the display cell below can still render.
    print("⚠️ Processing failed; no DataFrame could be built from result_obj.")
    result_dict = {}
    df_result = pd.DataFrame()
else:
    # Convert the result_obj to a dictionary
    result_dict = result_obj.to_dict()

    # One-row DataFrame: each column holds the value of the matching key.
    # List/dict values (narrative_skills, formal_topics, ...) are stored as
    # Python objects inside the cell rather than being flattened.
    df_result = pd.DataFrame([result_dict])

    # To flatten a nested structure instead, build a separate frame from it,
    # e.g. pd.DataFrame(result_obj.formal_topics), and merge as needed.

    print("\nDataFrame created from result_obj:")
    print(df_result.head())
    print(f"\nDataFrame shape: {df_result.shape}")
    print(f"DataFrame columns: {list(df_result.columns)}")


In [None]:
# Notebook display: render the DataFrame built from result_obj.
df_result

In [None]:
# NOTE(review): `skills` is not assigned at notebook scope in any of the cells
# shown here (it only appears as a local inside methods) - presumably a
# leftover from an earlier session; confirm or remove this cell.
skills

In [None]:
 # Ensure the previous cell ran successfully and result_obj exists
# Hand the checklist over to the affinity phase, or fall back to safe
# placeholders when the solicitation was never processed successfully.
if 'result_obj' not in locals() or result_obj is None:
    print("⚠️ 'result_obj' not found or is None. Please run the first cell successfully.")
    # Placeholders keep the names defined so the next cell can test them.
    skills_for_analysis = []
    solicitation_id_for_analysis = "SOL_ERROR"
else:
    # One-row DataFrame of the result, for inspection only.
    result_dict = result_obj.to_dict()
    df_result = pd.DataFrame([result_dict])

    print("\nDataFrame created from result_obj:")
    print(df_result.head())
    print(f"\nDataFrame shape: {df_result.shape}")
    print(f"DataFrame columns: {df_result.columns.tolist()}")

    # --- CRITICAL STEP ---
    # These two names are the inputs of the affinity analysis cell.
    skills_for_analysis = result_obj.required_skills_checklist
    solicitation_id_for_analysis = result_obj.solicitation_id

    print(f"\n✅ Extracted {len(skills_for_analysis)} skills for affinity analysis.")
    print("Skills are now ready for the Skill Affinity Engine.")

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

class SkillAffinityEngine:
    """
    Phase 1: Core Analysis - Calculates affinity between researchers and required skills.
    """
    def __init__(self, datastore_path="/content/drive/My Drive/datastore/"):
        self.datastore_path = datastore_path
        self.model = None
        self.metadata_df = None
        self.embeddings_array = None
        self.embedding_index = None
        self.researcher_index = None
        self.datastore_info = None

    def load_datastore(self):
        """Load the complete researcher profile datastore."""
        print("📂 Loading Researcher Profile Datastore...")
        try:
            self.metadata_df = pd.read_parquet(f"{self.datastore_path}researcher_profiles_metadata.parquet")
            self.embeddings_array = np.load(f"{self.datastore_path}researcher_embeddings.npy")
            with open(f"{self.datastore_path}embedding_index.json", 'r') as f: self.embedding_index = json.load(f)
            with open(f"{self.datastore_path}researcher_index.json", 'r') as f: self.researcher_index = json.load(f)
            with open(f"{self.datastore_path}datastore_info.json", 'r') as f: self.datastore_info = json.load(f)
            print(f"🎯 Datastore ready: {self.metadata_df['researcher_id'].nunique()} researchers, {len(self.metadata_df)} papers")
        except Exception as e:
            raise Exception(f"Failed to load datastore: {e}")

    def load_model(self, model_name='all-MiniLM-L6-v2'):
        """Instantiate the sentence transformer and keep it on ``self.model``.

        Args:
            model_name: Name handed to ``SentenceTransformer``.
        """
        print(f"🤖 Loading sentence transformer model: {model_name}")
        encoder = SentenceTransformer(model_name)
        self.model = encoder
        print("✅ Model loaded.")

    def embed_skills(self, skills_checklist):
        """Embed each skill phrase."""
        if self.model is None: self.load_model()
        print(f"🧠 Embedding {len(skills_checklist)} skills...")
        skill_embeddings = self.model.encode(skills_checklist, show_progress_bar=True)
        print(f"✅ Created skill embeddings: {skill_embeddings.shape}")
        return skill_embeddings

    def get_researcher_data(self, researcher_id):
        """Get data for a specific researcher."""
        work_ids = self.researcher_index.get(researcher_id, [])
        positions = [self.embedding_index[wid] for wid in work_ids if wid in self.embedding_index]
        if not positions: return np.array([]), [], []
        paper_embeddings = self.embeddings_array[positions]
        paper_metadata = self.metadata_df[self.metadata_df['work_id'].isin(work_ids)]
        weight_mapping = dict(zip(paper_metadata['work_id'], paper_metadata['recency_weight']))
        recency_weights = np.array([weight_mapping.get(wid, 0.0) for wid in work_ids if wid in self.embedding_index])
        return paper_embeddings, work_ids, recency_weights

    def calculate_skill_affinity_score(self, paper_embeddings, skill_embedding, recency_weights):
        """Calculate affinity score for one researcher against one skill."""
        if len(paper_embeddings) == 0: return 0.0
        cosine_sims = cosine_similarity(paper_embeddings, skill_embedding.reshape(1, -1)).flatten()
        max_weighted_sim = np.max(cosine_sims * recency_weights)
        return np.clip(max_weighted_sim * 100, 0, 100)

    def create_affinity_matrix(self, skills_checklist, solicitation_id=None):
        """Create the complete affinity matrix."""
        print("🎯 Creating Affinity Matrix...")
        print("=" * 50)
        if self.metadata_df is None: self.load_datastore()

        # --- FIX: Check for empty skills_checklist ---
        if not skills_checklist:
            raise ValueError("Skills checklist is empty. Cannot create affinity matrix.")

        skill_embeddings = self.embed_skills(skills_checklist)
        unique_researchers = list(self.researcher_index.keys())
        print(f"📊 Processing {len(unique_researchers)} researchers × {len(skills_checklist)} skills")
        affinity_matrix = np.zeros((len(unique_researchers), len(skills_checklist)))

        for i, researcher_id in enumerate(tqdm(unique_researchers, desc="Processing researchers")):
            paper_embeddings, _, recency_weights = self.get_researcher_data(researcher_id)
            if len(paper_embeddings) == 0: continue
            for j, skill_embedding in enumerate(skill_embeddings):
                affinity_matrix[i, j] = self.calculate_skill_affinity_score(paper_embeddings, skill_embedding, recency_weights)

        researcher_names = [self.metadata_df[self.metadata_df['researcher_id'] == rid].iloc[0]['researcher_name'] for rid in unique_researchers]
        skill_columns = [f"Skill_{i+1:02d}: {skill[:50]}" for i, skill in enumerate(skills_checklist)]
        affinity_df = pd.DataFrame(affinity_matrix, index=researcher_names, columns=skill_columns)
        print(f"✅ Affinity Matrix created: {affinity_df.shape}")
        return affinity_df, unique_researchers, skills_checklist

    def analyze_affinity_matrix(self, affinity_df, skills_checklist):
        """Provide analysis on the affinity matrix."""
        print("\n" + "="*60 + "\n📊 AFFINITY MATRIX ANALYSIS\n" + "="*60)
        print(f"📏 Matrix dimensions: {affinity_df.shape[0]} researchers × {affinity_df.shape[1]} skills")
        print(f"📈 Score range: {affinity_df.values.min():.2f} - {affinity_df.values.max():.2f}")
        print(f"📊 Mean affinity score: {affinity_df.values.mean():.2f}")
        researcher_avg_scores = affinity_df.mean(axis=1).sort_values(ascending=False)
        print(f"\n🏆 Top 5 Researchers (by average affinity):\n{researcher_avg_scores.head().to_string(float_format='%.2f')}")
        skill_avg_scores = affinity_df.mean(axis=0).sort_values()
        print(f"\n🎯 Most Challenging Skills (lowest average affinity):")
        for skill_col, score in skill_avg_scores.head().items():
            original_skill_index = int(skill_col.split('_')[1].split(':')[0]) - 1
            print(f"   - {skills_checklist[original_skill_index][:60]}...: {score:.2f}")


def main_affinity_analysis(skills_checklist, solicitation_id):
    """Build the affinity matrix for a checklist, print its analysis, return it.

    Args:
        skills_checklist: Skill phrases to score researchers against.
        solicitation_id: Passed through to ``create_affinity_matrix``.

    Returns:
        The affinity DataFrame produced by a fresh ``SkillAffinityEngine``.
    """
    print("\n🎯 SKILL AFFINITY ENGINE - PHASE 1 CORE ANALYSIS")
    print("="*60)

    affinity_engine = SkillAffinityEngine()
    matrix_result = affinity_engine.create_affinity_matrix(skills_checklist, solicitation_id)

    # The researcher ID list (middle element) is not needed here.
    affinity_df, _, skills_list = matrix_result
    affinity_engine.analyze_affinity_matrix(affinity_df, skills_list)
    return affinity_df

# ==============================================================================
# 4. MAIN EXECUTION FOR AFFINITY ANALYSIS
# ==============================================================================
if __name__ == "__main__":
    # The checklist comes from the earlier cells; without one there is nothing to score.
    if 'skills_for_analysis' not in locals() or not skills_for_analysis:
        print("⚠️ Cannot run affinity analysis because the skills checklist is empty or not defined.")
        print("Please run the first two cells successfully to extract skills from the PDF.")
    else:
        print("Running Skill Affinity Engine with extracted skills...")
        try:
            # Score every researcher against the extracted skills.
            affinity_matrix = main_affinity_analysis(skills_for_analysis, solicitation_id_for_analysis)
            print("\n📊 Sample of Affinity Matrix:")
            print(affinity_matrix.iloc[:5, :5])

        except Exception as err:
            print(f"❌ An error occurred during affinity analysis: {err}")

In [None]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
import anthropic
from google.colab import userdata
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

class DreamTeamAssembler:
    """
    Phase 2: Dream Team Assembly & Strategic Output

    Implements the greedy algorithm to select optimal research teams and generates
    comprehensive strategic reports using AI analysis.
    """

    def __init__(self):
        self.claude_client = None
        self.setup_claude_api()

    def setup_claude_api(self):
        """Initialize Claude API client for gap analysis."""
        try:
            api_key = userdata.get('ANTHROPIC_API_KEY')
            self.claude_client = anthropic.Anthropic(api_key=api_key)
            print("✅ Claude API client initialized for gap analysis")
        except Exception as e:
            print(f"⚠️ Claude API setup failed: {e}")
            print("Gap analysis will be limited without API access")

    def load_affinity_matrix(self, csv_path, metadata_path=None):
        """
        Load the affinity matrix and associated metadata.

        Args:
            csv_path (str): Path to affinity matrix CSV
            metadata_path (str): Optional path to metadata JSON

        Returns:
            tuple: (affinity_df, metadata)
        """
        print(f"📊 Loading affinity matrix from: {csv_path}")

        # Load the matrix
        affinity_df = pd.read_csv(csv_path, index_col=0)
        print(f"✅ Loaded matrix: {affinity_df.shape[0]} researchers × {affinity_df.shape[1]} skills")

        # Load metadata if available
        metadata = None
        if metadata_path:
            try:
                with open(metadata_path, 'r') as f:
                    metadata = json.load(f)
                print(f"✅ Loaded metadata")
            except Exception as e:
                print(f"⚠️ Could not load metadata: {e}")

        return affinity_df, metadata

    def calculate_team_coverage(self, affinity_df, team_indices):
        """
        Calculate team coverage scores for all skills.

        Args:
            affinity_df (pd.DataFrame): Affinity matrix
            team_indices (list): Indices of selected team members

        Returns:
            tuple: (skill_coverages, overall_coverage_score)
        """
        if not team_indices:
            return [], 0.0

        # Get affinity scores for team members
        team_affinities = affinity_df.iloc[team_indices]

        # For each skill, take the maximum affinity among team members
        skill_coverages = team_affinities.max(axis=0).values

        # Overall team coverage score is the average
        overall_coverage_score = np.mean(skill_coverages)

        return skill_coverages, overall_coverage_score

    def calculate_marginal_gain(self, affinity_df, current_team_indices, candidate_index):
        """
        Calculate marginal gain of adding a candidate to the current team.

        Args:
            affinity_df (pd.DataFrame): Affinity matrix
            current_team_indices (list): Current team member indices
            candidate_index (int): Index of candidate researcher

        Returns:
            float: Marginal gain in coverage score
        """
        # Current team coverage
        _, current_coverage = self.calculate_team_coverage(affinity_df, current_team_indices)

        # New team coverage with candidate added
        new_team_indices = current_team_indices + [candidate_index]
        _, new_coverage = self.calculate_team_coverage(affinity_df, new_team_indices)

        # Marginal gain
        marginal_gain = new_coverage - current_coverage

        return marginal_gain

    def dream_team_greedy_algorithm(self, affinity_df, min_team_size=2, max_team_size=4):
        """
        Implement the Dream Team Greedy Algorithm.

        Args:
            affinity_df (pd.DataFrame): Affinity matrix
            min_team_size (int): Minimum team size
            max_team_size (int): Maximum team size

        Returns:
            tuple: (selected_team_indices, selection_history)
        """
        print("🎯 Running Dream Team Greedy Algorithm...")
        print("=" * 50)

        n_researchers = len(affinity_df)
        selected_indices = []
        selection_history = []

        # Step 1: Select the best overall researcher (likely PI)
        researcher_avg_scores = affinity_df.mean(axis=1)
        best_researcher_idx = researcher_avg_scores.idxmax()
        best_researcher_pos = affinity_df.index.get_loc(best_researcher_idx)

        selected_indices.append(best_researcher_pos)
        _, initial_coverage = self.calculate_team_coverage(affinity_df, selected_indices)

        selection_history.append({
            'step': 1,
            'action': 'initial_selection',
            'researcher_idx': best_researcher_pos,
            'researcher_name': affinity_df.index[best_researcher_pos],
            'reason': 'Highest average affinity score (likely PI)',
            'team_coverage': initial_coverage,
            'marginal_gain': initial_coverage
        })

        print(f"🏆 Step 1 - PI Selection: {affinity_df.index[best_researcher_pos]}")
        print(f"    Initial coverage: {initial_coverage:.2f}")

        # Steps 2-4: Iteratively add researchers with maximum marginal gain
        for step in range(2, max_team_size + 1):
            best_candidate_idx = None
            best_marginal_gain = -1
            candidate_gains = []

            # Evaluate all remaining researchers
            for candidate_idx in range(n_researchers):
                if candidate_idx in selected_indices:
                    continue  # Skip already selected researchers

                marginal_gain = self.calculate_marginal_gain(
                    affinity_df, selected_indices, candidate_idx
                )
                candidate_gains.append((candidate_idx, marginal_gain))

                if marginal_gain > best_marginal_gain:
                    best_marginal_gain = marginal_gain
                    best_candidate_idx = candidate_idx

            # Add the best candidate if marginal gain is positive and we haven't reached min size
            # Or if marginal gain is significant enough and we're expanding beyond min size
            should_add = False
            if len(selected_indices) < min_team_size:
                should_add = True  # Must reach minimum team size
            elif best_marginal_gain > 0.5:  # Only add if significant improvement
                should_add = True

            if should_add and best_candidate_idx is not None:
                selected_indices.append(best_candidate_idx)
                _, new_coverage = self.calculate_team_coverage(affinity_df, selected_indices)

                selection_history.append({
                    'step': step,
                    'action': 'add_member',
                    'researcher_idx': best_candidate_idx,
                    'researcher_name': affinity_df.index[best_candidate_idx],
                    'reason': f'Maximum marginal gain (+{best_marginal_gain:.2f})',
                    'team_coverage': new_coverage,
                    'marginal_gain': best_marginal_gain
                })

                print(f"✅ Step {step} - Added: {affinity_df.index[best_candidate_idx]}")
                print(f"    Marginal gain: +{best_marginal_gain:.2f}, New coverage: {new_coverage:.2f}")
            else:
                print(f"🛑 Step {step} - Stopping: No significant marginal gain (best: +{best_marginal_gain:.2f})")
                break

        print(f"\n🎯 Final Dream Team: {len(selected_indices)} researchers")
        print(f"📊 Final team coverage: {self.calculate_team_coverage(affinity_df, selected_indices)[1]:.2f}")

        return selected_indices, selection_history

    def generate_coverage_report(self, affinity_df, team_indices, skills_list):
        """
        Generate detailed coverage report for the selected team.

        Args:
            affinity_df (pd.DataFrame): Affinity matrix. Rows are addressed
                positionally through ``team_indices``; the row index supplies
                the researcher names.
            team_indices (list): Selected team member indices (row positions)
            skills_list (list): List of skill descriptions, in column order

        Returns:
            dict: Coverage report with the keys ``team_size``,
                ``overall_coverage_score``, ``team_members``,
                ``skill_analysis`` and ``coverage_statistics``. Numeric values
                are converted to built-in ``int``/``float`` so the report can
                be passed to ``json.dump`` regardless of the matrix dtype.
        """
        print("📋 Generating Coverage Report...")

        # Calculate coverage
        skill_coverages, overall_coverage = self.calculate_team_coverage(affinity_df, team_indices)
        overall_coverage = float(overall_coverage)

        # Team member details
        team_members = []
        for idx in team_indices:
            researcher_scores = affinity_df.iloc[idx].values

            # Top 3 skills for this researcher, best first
            top_skill_indices = np.argsort(researcher_scores)[-3:][::-1]
            top_skills = [(skills_list[i], float(researcher_scores[i])) for i in top_skill_indices]

            team_members.append({
                'name': affinity_df.index[idx],
                'index': int(idx),
                # float()/int() strip NumPy scalar types (np.int64 is not
                # JSON serializable).
                'avg_affinity': float(np.mean(researcher_scores)),
                'max_affinity': float(np.max(researcher_scores)),
                'top_skills': top_skills,
                'all_scores': researcher_scores.tolist()
            })

        # Skill coverage analysis
        skill_analysis = []
        for i, (skill, coverage) in enumerate(zip(skills_list, skill_coverages)):
            coverage = float(coverage)

            # Find which team member provides this coverage
            team_scores_for_skill = [member['all_scores'][i] for member in team_members]
            if team_scores_for_skill:
                best_member_idx = int(np.argmax(team_scores_for_skill))
                primary_expert = team_members[best_member_idx]['name']
                expert_score = team_scores_for_skill[best_member_idx]
            else:
                # Empty team: np.argmax([]) would raise, so report "no expert".
                primary_expert = None
                expert_score = 0.0

            coverage_level = 'High' if coverage >= 70 else 'Medium' if coverage >= 40 else 'Low'

            skill_analysis.append({
                'skill': skill,
                'coverage_score': coverage,
                'coverage_level': coverage_level,
                'primary_expert': primary_expert,
                'expert_score': expert_score
            })

        # Coverage statistics
        level_counts = {'High': 0, 'Medium': 0, 'Low': 0}
        for entry in skill_analysis:
            level_counts[entry['coverage_level']] += 1
        high_coverage_count = level_counts['High']
        medium_coverage_count = level_counts['Medium']
        low_coverage_count = level_counts['Low']

        total_skills = len(skills_list)

        def _pct(count):
            # Guard against an empty skills list (previously ZeroDivisionError).
            return 100 * count / total_skills if total_skills else 0.0

        coverage_report = {
            'team_size': len(team_members),
            'overall_coverage_score': overall_coverage,
            'team_members': team_members,
            'skill_analysis': skill_analysis,
            'coverage_statistics': {
                'high_coverage_skills': high_coverage_count,
                'medium_coverage_skills': medium_coverage_count,
                'low_coverage_skills': low_coverage_count,
                'coverage_distribution': {
                    'high_pct': _pct(high_coverage_count),
                    'medium_pct': _pct(medium_coverage_count),
                    'low_pct': _pct(low_coverage_count)
                }
            }
        }

        print(f"✅ Coverage report generated")
        print(f"📊 Overall coverage: {overall_coverage:.2f}")
        print(f"🔥 High coverage skills: {high_coverage_count}/{len(skills_list)}")

        return coverage_report

    def format_gap_analysis_prompt(self, coverage_report, skills_list, solicitation_data=None):
        """
        Format the prompt for Claude API gap analysis.

        Args:
            coverage_report (dict): Team coverage analysis
            skills_list (list): Required skills
            solicitation_data (dict): Original solicitation information

        Returns:
            str: Formatted prompt for Claude API
        """

        # Team summary
        team_summary = f"PROPOSED RESEARCH TEAM ({coverage_report['team_size']} members):\n"
        for i, member in enumerate(coverage_report['team_members']):
            role = "Principal Investigator (PI)" if i == 0 else f"Co-Investigator {i}"
            team_summary += f"\n{i+1}. {member['name']} - {role}\n"
            team_summary += f"   Average Affinity: {member['avg_affinity']:.2f}\n"
            team_summary += f"   Top Expertise Areas:\n"
            for skill, score in member['top_skills']:
                team_summary += f"     • {skill}: {score:.1f}\n"

        # Coverage analysis
        coverage_summary = f"\nTEAM COVERAGE ANALYSIS:\n"
        coverage_summary += f"Overall Team Coverage Score: {coverage_report['overall_coverage_score']:.2f}/100\n\n"

        coverage_summary += "HIGH COVERAGE SKILLS (≥70):\n"
        for skill in coverage_report['skill_analysis']:
            if skill['coverage_level'] == 'High':
                coverage_summary += f"• {skill['skill']}: {skill['coverage_score']:.1f} (Expert: {skill['primary_expert']})\n"

        coverage_summary += "\nMEDIUM COVERAGE SKILLS (40-69):\n"
        for skill in coverage_report['skill_analysis']:
            if skill['coverage_level'] == 'Medium':
                coverage_summary += f"• {skill['skill']}: {skill['coverage_score']:.1f} (Expert: {skill['primary_expert']})\n"

        coverage_summary += "\nLOW COVERAGE SKILLS (<40) - POTENTIAL GAPS:\n"
        for skill in coverage_report['skill_analysis']:
            if skill['coverage_level'] == 'Low':
                coverage_summary += f"• {skill['skill']}: {skill['coverage_score']:.1f} (Expert: {skill['primary_expert']})\n"

        # Solicitation context
        solicitation_context = ""
        if solicitation_data:
            solicitation_context = f"\nORIGINAL SOLICITATION CONTEXT:\n"
            solicitation_context += f"Title: {solicitation_data.get('title', 'N/A')}\n"
            solicitation_context += f"Abstract: {solicitation_data.get('abstract', 'N/A')[:500]}...\n"

        # Main prompt
        prompt = f"""As an expert NSF Program Manager and research strategy consultant, analyze this proposed research team for a competitive grant application.

{team_summary}

{coverage_summary}

{solicitation_context}

Please provide a comprehensive strategic analysis covering:

1. **TEAM STRENGTHS**: What are the key strengths of this team composition? How do their expertise areas complement each other?

2. **COVERAGE GAPS & RISKS**: Analyze the low-coverage skills. Are these critical gaps that could harm competitiveness? Which gaps are most concerning?

3. **STRATEGIC RECOMMENDATIONS**:
   - Should additional collaborators be recruited for specific gaps?
   - How can the team leverage their strengths to compensate for weaknesses?
   - What sections of the proposal should each member lead?

4. **COMPETITIVE POSITIONING**: How competitive is this team compared to typical NSF applications? What makes them stand out?

5. **PROPOSAL STRATEGY**: Provide 3-5 specific, actionable recommendations for structuring their proposal to maximize success.

Format your response as a professional strategic report suitable for team planning meetings."""

        return prompt

    def generate_gap_analysis(self, coverage_report, skills_list, solicitation_data=None):
        """
        Produce the strategic gap analysis, preferring the Claude API.

        When no Claude client is configured, or when the API call raises for
        any reason, the locally generated fallback analysis is returned
        instead, so this method always yields a string.

        Args:
            coverage_report (dict): Team coverage analysis
            skills_list (list): Required skills list
            solicitation_data (dict): Optional solicitation data

        Returns:
            str: Strategic analysis from Claude API (or the fallback report)
        """
        client = self.claude_client
        if not client:
            return self._generate_fallback_analysis(coverage_report)

        print("🤖 Generating strategic analysis with Claude API...")

        try:
            # NOTE(review): dated model ID — confirm it is still served by the
            # API; a rejected model would silently route to the fallback below.
            request = {
                'model': "claude-3-sonnet-20240229",
                'max_tokens': 2000,
                'temperature': 0.7,
                'messages': [
                    {
                        "role": "user",
                        "content": self.format_gap_analysis_prompt(
                            coverage_report, skills_list, solicitation_data
                        ),
                    }
                ],
            }
            reply = client.messages.create(**request)

            # The analysis text is read from the first content block.
            analysis_text = reply.content[0].text
            print("✅ Strategic analysis generated")
            return analysis_text

        except Exception as e:
            print(f"⚠️ Claude API analysis failed: {e}")
            return self._generate_fallback_analysis(coverage_report)

    def _generate_fallback_analysis(self, coverage_report):
        """Generate basic analysis when Claude API is unavailable."""

        analysis = "STRATEGIC ANALYSIS (Basic Report)\n"
        analysis += "=" * 50 + "\n\n"

        analysis += f"TEAM OVERVIEW:\n"
        analysis += f"Team Size: {coverage_report['team_size']} researchers\n"
        analysis += f"Overall Coverage: {coverage_report['overall_coverage_score']:.2f}/100\n\n"

        analysis += "COVERAGE DISTRIBUTION:\n"
        stats = coverage_report['coverage_statistics']
        analysis += f"High Coverage Skills: {stats['high_coverage_skills']} ({stats['coverage_distribution']['high_pct']:.1f}%)\n"
        analysis += f"Medium Coverage Skills: {stats['medium_coverage_skills']} ({stats['coverage_distribution']['medium_pct']:.1f}%)\n"
        analysis += f"Low Coverage Skills: {stats['low_coverage_skills']} ({stats['coverage_distribution']['low_pct']:.1f}%)\n\n"

        if stats['low_coverage_skills'] > 0:
            analysis += "ATTENTION NEEDED:\n"
            analysis += f"The team has {stats['low_coverage_skills']} skills with low coverage. Consider recruiting additional expertise or developing strategic partnerships.\n\n"

        analysis += "RECOMMENDATIONS:\n"
        analysis += "• Review low-coverage skills for recruitment opportunities\n"
        analysis += "• Leverage high-coverage areas as competitive advantages\n"
        analysis += "• Consider collaborative arrangements for gap areas\n"

        return analysis

    def create_strategic_report(self, affinity_df, metadata=None, skills_list=None, solicitation_data=None):
        """
        Main function to create comprehensive strategic report.

        Runs team selection, coverage reporting and gap analysis in sequence
        and bundles the results into one dictionary.

        The skill labels are resolved in this order: the explicit
        ``skills_list`` argument, then ``metadata['skills_checklist']``, then
        the affinity matrix column names (with any ``"<prefix>: "`` removed).
        A missing or empty value at one level falls through to the next, so
        the downstream steps never receive an empty skills list while the
        matrix has columns.

        Args:
            affinity_df (pd.DataFrame): Affinity matrix
            metadata (dict): Optional metadata
            skills_list (list): Required skills
            solicitation_data (dict): Original solicitation data

        Returns:
            dict: Complete strategic report
        """
        print("🚀 CREATING STRATEGIC REPORT")
        print("=" * 60)

        # Extract skills list if not provided. Use truthiness rather than an
        # ``is None`` check: metadata without a 'skills_checklist' key used to
        # yield [] here, which silently disabled the column-name fallback.
        if not skills_list and metadata:
            skills_list = metadata.get('skills_checklist')

        if not skills_list:
            # Extract from column names
            skills_list = [col.split(': ', 1)[1] if ': ' in col else col for col in affinity_df.columns]

        # Step 1: Run Dream Team Algorithm
        team_indices, selection_history = self.dream_team_greedy_algorithm(affinity_df)

        # Step 2: Generate Coverage Report
        coverage_report = self.generate_coverage_report(affinity_df, team_indices, skills_list)

        # Step 3: Generate Gap Analysis
        strategic_analysis = self.generate_gap_analysis(coverage_report, skills_list, solicitation_data)

        # Step 4: Compile Strategic Report
        strategic_report = {
            'report_metadata': {
                'generated_at': datetime.now().isoformat(),
                'solicitation_id': solicitation_data.get('solicitation_id') if solicitation_data else None,
                'analysis_type': 'dream_team_strategic_report'
            },
            'dream_team': {
                'team_indices': team_indices,
                'selection_algorithm': 'greedy_marginal_gain',
                'selection_history': selection_history
            },
            'coverage_analysis': coverage_report,
            'strategic_analysis': strategic_analysis,
            'skills_checklist': skills_list,
            'solicitation_context': solicitation_data
        }

        print("✅ Strategic Report Generated!")
        return strategic_report

    def save_strategic_report(self, strategic_report, output_path=None):
        """
        Save the strategic report as ``<output_path>.json`` and ``<output_path>.txt``.

        Both documents are rendered in memory before any file is opened, so a
        serialization error cannot leave a truncated file behind. NumPy
        scalars and arrays inside the report are converted to their built-in
        equivalents; any other non-serializable object still raises
        ``TypeError``.

        Args:
            strategic_report (dict): Report produced by ``create_strategic_report``
            output_path (str): Path prefix without extension; defaults to
                ``strategic_report_<timestamp>`` in the working directory

        Returns:
            tuple: ``(json_path, text_path)``
        """

        if output_path is None:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            output_path = f"strategic_report_{timestamp}"

        def _to_builtin(value):
            # json only calls this for objects it cannot encode itself.
            if isinstance(value, np.generic):
                return value.item()
            if isinstance(value, np.ndarray):
                return value.tolist()
            raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

        # Render first, write second.
        json_payload = json.dumps(strategic_report, indent=2, ensure_ascii=False, default=_to_builtin)
        text_payload = self.format_text_report(strategic_report)

        # Save complete JSON report
        json_path = f"{output_path}.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            f.write(json_payload)

        # Save human-readable text report
        text_path = f"{output_path}.txt"
        with open(text_path, 'w', encoding='utf-8') as f:
            f.write(text_payload)

        print(f"💾 Strategic report saved:")
        print(f"   📄 JSON: {json_path}")
        print(f"   📝 Text: {text_path}")

        return json_path, text_path

    def format_text_report(self, strategic_report):
        """
        Render the strategic report dictionary as plain text.

        The document has four parts: a header (generation time and, when
        present, the solicitation ID), the team roster with each member's two
        leading skills, the coverage summary, and the strategic analysis text
        appended verbatim.
        """
        out = [
            "NSF DREAM TEAM STRATEGIC REPORT\n",
            "=" * 60 + "\n\n",
        ]

        # Header
        header = strategic_report['report_metadata']
        out.append(f"Generated: {header['generated_at']}\n")
        if header.get('solicitation_id'):
            out.append(f"Solicitation ID: {header['solicitation_id']}\n")
        out.append("\n")

        # Dream Team
        out.append("RECOMMENDED DREAM TEAM\n")
        out.append("-" * 25 + "\n")

        roster = strategic_report['coverage_analysis']['team_members']
        for rank, member in enumerate(roster, start=1):
            # The first member is presented as PI, the others as Co-Is.
            if rank == 1:
                role = "Principal Investigator (PI)"
            else:
                role = f"Co-Investigator {rank - 1}"
            out.append(f"\n{rank}. {member['name']} - {role}\n")
            out.append(f"   Average Affinity Score: {member['avg_affinity']:.2f}\n")
            out.append(f"   Primary Expertise:\n")
            leading_skills = member['top_skills'][:2]  # Top 2 skills
            out.extend(f"     • {skill}: {score:.1f}\n" for skill, score in leading_skills)

        # Coverage Summary
        coverage = strategic_report['coverage_analysis']
        out.append(f"\nTEAM COVERAGE SUMMARY\n")
        out.append("-" * 22 + "\n")
        out.append(f"Overall Coverage Score: {coverage['overall_coverage_score']:.2f}/100\n")

        stats = coverage['coverage_statistics']
        shares = stats['coverage_distribution']
        out.append(f"High Coverage Skills: {stats['high_coverage_skills']} ({shares['high_pct']:.1f}%)\n")
        out.append(f"Medium Coverage Skills: {stats['medium_coverage_skills']} ({shares['medium_pct']:.1f}%)\n")
        out.append(f"Low Coverage Skills: {stats['low_coverage_skills']} ({shares['low_pct']:.1f}%)\n")

        # Strategic Analysis
        out.append(f"\nSTRATEGIC ANALYSIS\n")
        out.append("-" * 18 + "\n")
        out.append(strategic_report['strategic_analysis'])

        return "".join(out)

    def display_summary(self, strategic_report):
        """
        Print a console summary of the strategic report.

        Shows the team roster with average scores, the High/Medium/Low
        coverage distribution, the overall coverage score, and a one-line
        verdict (>= 70 excellent, >= 50 good, otherwise needs work).
        """
        analysis = strategic_report['coverage_analysis']
        banner = "="*60

        print("\n" + banner)
        print("📋 DREAM TEAM STRATEGIC REPORT SUMMARY")
        print(banner)

        # Team overview
        roster = analysis['team_members']
        print(f"🏆 Recommended Team Size: {len(roster)}")

        for rank, member in enumerate(roster, start=1):
            role = "PI" if rank == 1 else f"Co-I {rank - 1}"
            print(f"   {rank}. {member['name']} ({role}) - Avg Score: {member['avg_affinity']:.2f}")

        # Coverage stats
        stats = analysis['coverage_statistics']
        shares = stats['coverage_distribution']
        print(f"\n📊 Coverage Distribution:")
        print(f"   🔥 High Coverage: {stats['high_coverage_skills']} skills ({shares['high_pct']:.1f}%)")
        print(f"   📊 Medium Coverage: {stats['medium_coverage_skills']} skills ({shares['medium_pct']:.1f}%)")
        print(f"   ⚠️  Low Coverage: {stats['low_coverage_skills']} skills ({shares['low_pct']:.1f}%)")

        overall_score = analysis['overall_coverage_score']
        print(f"\n🎯 Overall Team Coverage Score: {overall_score:.2f}/100")

        # Verdict thresholds are checked from the highest band down.
        if overall_score >= 70:
            verdict = "✅ EXCELLENT: Strong team with high coverage"
        elif overall_score >= 50:
            verdict = "✅ GOOD: Solid team with room for strategic partnerships"
        else:
            verdict = "⚠️ NEEDS WORK: Consider additional recruitment or collaborations"
        print(verdict)

def main_dream_team_analysis(affinity_csv_path, metadata_json_path=None, solicitation_json_path=None):
    """
    Main function to run complete Dream Team analysis.

    Loads the affinity matrix, optionally loads the solicitation JSON,
    builds the strategic report, saves it to disk and prints a summary.
    A solicitation file that cannot be read or parsed is reported and then
    ignored; the analysis continues without solicitation context.

    Args:
        affinity_csv_path (str): Path to affinity matrix CSV
        metadata_json_path (str): Path to affinity metadata JSON
        solicitation_json_path (str): Path to original solicitation JSON

    Returns:
        dict: The strategic report produced by ``create_strategic_report``
    """

    print("🎯 DREAM TEAM ASSEMBLER - PHASE 2 STRATEGIC OUTPUT")
    print("="*60)

    # Initialize assembler
    assembler = DreamTeamAssembler()

    # Load affinity matrix
    affinity_df, affinity_metadata = assembler.load_affinity_matrix(affinity_csv_path, metadata_json_path)

    # Load solicitation data if available
    solicitation_data = None
    if solicitation_json_path:
        try:
            # The solicitation JSON is written as UTF-8 with ensure_ascii=False,
            # so it must be read as UTF-8 rather than the platform default.
            with open(solicitation_json_path, 'r', encoding='utf-8') as f:
                solicitation_data = json.load(f)
            print(f"✅ Loaded solicitation data")
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError.
            print(f"⚠️ Could not load solicitation data: {e}")

    # Extract skills list: matrix metadata wins over the solicitation file
    skills_list = None
    if affinity_metadata and 'skills_checklist' in affinity_metadata:
        skills_list = affinity_metadata['skills_checklist']
    elif solicitation_data and 'required_skills_checklist' in solicitation_data:
        skills_list = solicitation_data['required_skills_checklist']

    # Create strategic report
    strategic_report = assembler.create_strategic_report(
        affinity_df=affinity_df,
        metadata=affinity_metadata,
        skills_list=skills_list,
        solicitation_data=solicitation_data
    )

    # Save report (only the text path is shown to the user below)
    _, text_path = assembler.save_strategic_report(strategic_report)

    # Display summary
    assembler.display_summary(strategic_report)

    print(f"\n✅ Dream Team Analysis Complete!")
    print(f"📄 Full report available at: {text_path}")

    return strategic_report

# Script entry point for the Phase 2 (Dream Team) analysis: defines the three
# input paths and runs the full pipeline, printing any failure instead of
# raising it.
if __name__ == "__main__":
    # Example usage - update paths as needed
    AFFINITY_CSV = "/content/drive/MyDrive/datastore/affinity_matrix_SOL_EXAMPLE1.csv"
    AFFINITY_METADATA = "/content/drive/MyDrive/datastore/affinity_matrix_SOL_EXAMPLE1_metadata1.json"
    # Relative path, so it is resolved against the current working directory.
    SOLICITATION_JSON = "NSF 24-569_ Mathematical Foundations of Artificial Intelligence (MFAI) _ NSF - National Science Foundation_analysis.json"  # Optional

    try:
        report = main_dream_team_analysis(
            affinity_csv_path=AFFINITY_CSV,
            metadata_json_path=AFFINITY_METADATA,
            solicitation_json_path=SOLICITATION_JSON
        )
    except Exception as e:
        # Top-level boundary: any error is reported to the user; `report`
        # stays undefined in that case.
        print(f"❌ Error: {e}")
        print("Please verify file paths and try again")

Researcher Profile Datastore: Built a system that extracts the top 50 Texas State researchers plus Tahir Ekin and generates embeddings for all of their papers using the all-MiniLM-L6-v2 sentence-transformer model, with recency weights $W_t = \max\left(0,\ 1 - \frac{\text{CurrentYear} - \text{PublicationYear}}{10}\right)$.

Storage Structure: Created an indexed datastore in Google Drive consisting of researcher_profiles_metadata.parquet (paper metadata), researcher_embeddings.npy (384-dim vectors), embedding_index.json (work_id→position), researcher_index.json (researcher_id→work_ids), and datastore_info.json.

PDF Solicitation Processor: Hybrid system using PyMuPDF extraction + Claude API (narrative skills) + OpenAlex BERT classifier (formal topics), with fusion logic to create final Required Skills Checklist.

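Skill Affinity Engine: Implements the formula $\text{SkillAffinityScore}(R, S_k) = 100 \times \max_{p \in \text{papers}(R)} \big(\text{cosine\_similarity}(p, S_k) \times W_t(p)\big)$, i.e. the best recency-weighted paper-to-skill similarity, for every researcher × skill combination.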

Affinity Matrix: 51 researchers × 17 skills matrix with percentage scores (0-100), saved as CSV with metadata JSON for tracking and analysis.

Dream Team Greedy Algorithm: Selects optimal 2-4 researcher teams by iteratively choosing candidates with maximum marginal gain in team coverage scores.

Coverage Analysis: Calculates Team Coverage Score (average of max affinity per skill), categorizes skills as High/Medium/Low coverage, identifies primary experts per skill area.

Strategic Report Generation: Uses Claude API to analyze team composition, coverage gaps, competitive positioning, and proposal strategy recommendations.

End-to-End Pipeline: PDF→Skills Extraction→Affinity Calculation→Team Selection→Strategic Analysis, all integrated with proper error handling and fallback methods.

Key Files: Researcher datastore (5 files), solicitation JSON objects, affinity matrices with metadata, and comprehensive strategic reports (JSON + human-readable text formats).

# V2 with better file management


In [None]:
# ==============================================================================
# Cell 1: PDF Processing & Run Initialization
# ==============================================================================
import fitz  # PyMuPDF
import pandas as pd
import json
import re
from datetime import datetime
from typing import List, Dict, Tuple, Optional
from google.colab import drive, userdata
from transformers import pipeline
import anthropic
from dataclasses import dataclass, asdict
import warnings
import os

# --- Only change this section for a new analysis ---
# Mount point passed to drive.mount() by the main execution block of this cell.
DRIVE_MOUNT_PATH = '/content/drive'
# Directory that receives the "<run_id>_solicitation_analysis.json" output.
DATASTORE_PATH = "/content/drive/MyDrive/datastore/"
# This is the ONLY line you need to change to process a new file.
# The run ID is derived from this file name (non-alphanumeric characters are
# replaced with "_").
PDF_FILE_PATH = "/content/drive/MyDrive/datastore/NSF 25-530: Collaborations in Artificial Intelligence and Geosciences (CAIG) | NSF - National Science Foundation.pdf"


# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

@dataclass
class StructuredSolicitationObject:
    """
    Result record of one processed solicitation: identifying metadata, the
    skills found by each extraction path, and the merged checklist.
    """
    # --- Metadata ---
    solicitation_id: str
    title: str
    abstract: str
    processed_at: str
    pdf_filename: str

    # --- Skills from both paths ---
    # Path A: narrative skills returned by the Claude API
    narrative_skills: List[str]
    # Path B: topic dicts produced by the OpenAlex classifier
    formal_topics: List[Dict]

    # --- Final combined checklist ---
    required_skills_checklist: List[str]

    # --- Processing details ---
    text_length: int
    processing_method: str = "hybrid_deconstruction"

    def to_dict(self):
        """Return the record as a plain dictionary (suitable for JSON)."""
        return asdict(self)

    def to_json(self, filepath: str):
        """Write the record to ``filepath`` as indented UTF-8 JSON."""
        serialized = json.dumps(self.to_dict(), indent=2, ensure_ascii=False)
        with open(filepath, 'w', encoding='utf-8') as handle:
            handle.write(serialized)

# ==============================================================================
# 2. PDF SOLICITATION PROCESSOR CLASS
# ==============================================================================


class PDFSolicitationProcessor:
    """
    Processes PDF solicitations from a file path to extract required skills.

    Pipeline (see ``process_solicitation``): extract text with PyMuPDF,
    derive a title and abstract heuristically, ask the Claude API for
    narrative skills (Path A), run the OpenAlex topic classifier (Path B),
    then merge both lists into one checklist. Either model may be missing;
    the corresponding path then contributes an empty list.
    """

    # Model used for narrative-skill extraction. Kept as a class attribute so
    # it can be overridden without editing the method body.
    # NOTE(review): this dated model ID may no longer be served by the API —
    # confirm; a rejected model makes Path A silently return no skills.
    CLAUDE_MODEL = "claude-3-sonnet-20240229"

    # Matches the "1. " prefix of a numbered-list line.
    _NUMBERED_ITEM = re.compile(r'^\d+\.\s*')

    def __init__(self):
        self.claude_client = None
        self.topic_classifier = None
        self.setup_models()

    def setup_models(self):
        """Initialize Claude API client and OpenAlex topic classifier.

        Each model is set up independently; a failure is printed and leaves
        the corresponding attribute as ``None``.
        """
        print("Setting up models...")
        try:
            api_key = userdata.get('ANTHROPIC_API_KEY')
            self.claude_client = anthropic.Anthropic(api_key=api_key)
            print("✅ Claude API client initialized")
        except Exception as e:
            print(f"⚠️ Claude API setup failed: {e}")

        try:
            print("Loading OpenAlex topic classifier...")
            self.topic_classifier = pipeline(
                "text-classification",
                model="OpenAlex/bert-base-multilingual-cased-finetuned-openalex-topic-classification-title-abstract"
            )
            print("✅ OpenAlex topic classifier loaded")
        except Exception as e:
            print(f"⚠️ Topic classifier setup failed: {e}")

    def _extract_text_from_pdf(self, filepath: str) -> Tuple[str, str, str]:
        """Extract text from the PDF at ``filepath``.

        Returns:
            Tuple of ``(filename, title, abstract)``.

        Raises:
            FileNotFoundError: if ``filepath`` does not exist.
            ValueError: if no text could be extracted.
            Exception: any other extraction error is printed and re-raised.
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"The file was not found at: {filepath}")

        filename = os.path.basename(filepath)
        print(f"📄 Processing: {filename}")

        try:
            doc = fitz.open(filepath)
            try:
                full_text = "".join(page.get_text() for page in doc)
            finally:
                # Close the document even when a page fails to extract.
                doc.close()

            if not full_text.strip():
                raise ValueError("Extracted text is empty. The PDF might be an image.")

            title, abstract = self._extract_title_and_abstract(full_text, filename)
            print(f"✅ Extracted {len(full_text)} characters from PDF.")
            return filename, title, abstract
        except Exception as e:
            print(f"❌ Error extracting text from PDF '{filename}': {e}")
            raise

    def _extract_title_and_abstract(self, full_text: str, filename: str) -> Tuple[str, str]:
        """Extract title and abstract from full text using heuristics.

        Title: the first of the leading 15 non-empty lines that is 21-249
        characters long and not all upper-case; otherwise a title derived
        from the file name.

        Abstract: text following the first line that contains 'abstract',
        'summary' or 'overview', collected until it exceeds 1500 characters
        or a line mentions 'introduction'/'background'. Falls back to the
        first 10 lines. The result is capped at 2000 characters.
        """
        lines = [line.strip() for line in full_text.split('\n') if line.strip()]
        title = filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
        for line in lines[:15]:  # Check more lines for title
            if 20 < len(line) < 250 and not line.isupper():  # Avoid all-caps headers
                title = line
                break

        abstract = ""
        abstract_started = False
        for line in lines:
            line_lower = line.lower()
            if not abstract_started and any(marker in line_lower for marker in ['abstract', 'summary', 'overview']):
                abstract_started = True
                if len(line) > len('abstract') + 10:
                    # Keep the text after the first word of the marker line.
                    # A long line without whitespace (e.g. a URL) has no
                    # remainder; indexing [1] unconditionally raised IndexError.
                    remainder = line.split(maxsplit=1)
                    if len(remainder) > 1:
                        abstract += remainder[1]
                continue
            if abstract_started:
                abstract += " " + line
                if len(abstract) > 1500 or any(marker in line_lower for marker in ['introduction', 'background']):
                    break
        if not abstract:
            abstract = ' '.join(lines[:10])  # Fallback
        return title.strip(), abstract.strip()[:2000]  # Increased limit

    def extract_narrative_skills_claude(self, text: str) -> List[str]:
        """Path A: Extract narrative skills using Claude API.

        Returns an empty list when no client is configured or the call fails.
        """
        if not self.claude_client:
            print("⚠️ Claude API not available, skipping narrative skills.")
            return []

        prompt = f"""As an expert research program analyst, identify the 5-7 most critical and distinct areas of expertise required by this research solicitation. Focus on specific technical skills, domain knowledge, and methodological expertise.

Solicitation text:
---
{text}
---

Provide your response as a numbered list of distinct expertise areas. Each item should be a concise phrase.
"""
        try:
            print("🤖 Calling Claude API for narrative skills...")
            response = self.claude_client.messages.create(
                model=self.CLAUDE_MODEL,
                max_tokens=1000,
                temperature=0.2,
                messages=[{"role": "user", "content": prompt}]
            )
            skills = self._parse_claude_response(response.content[0].text)
            print(f"✅ Extracted {len(skills)} narrative skills from Claude.")
            return skills
        except Exception as e:
            print(f"⚠️ Claude API call failed: {e}")
            return []

    def _parse_claude_response(self, response_text: str) -> List[str]:
        """Parse Claude's response to extract a list of skills.

        Only lines that start with ``<digits>.`` are considered; items with
        no text after the number are skipped. At most 7 skills are returned.
        """
        skills = []
        for line in response_text.split('\n'):
            line = line.strip()
            if self._NUMBERED_ITEM.match(line):
                skill = self._NUMBERED_ITEM.sub('', line).strip()
                if skill:  # ignore a bare "3." with nothing after it
                    skills.append(skill)
        return skills[:7]

    def extract_formal_topics_openalex(self, title: str, abstract: str) -> List[Dict]:
        """
        Path B: Extract formal topics using OpenAlex classifier.

        Returns a list of ``{'topic': label, 'score': rounded score}`` dicts
        for predictions scoring above 0.01, or an empty list when the
        classifier is unavailable, returns nothing, or raises.
        """
        if not self.topic_classifier:
            print("⚠️ Topic classifier not available, skipping formal topics.")
            return []

        formatted_text = f"<TITLE> {title}\n<ABSTRACT> {abstract}"
        print("🔬 Running OpenAlex topic classification...")

        try:
            # Get predictions from the model. The output is a simple list of dicts.
            predictions = self.topic_classifier(formatted_text, top_k=10, truncation=True)

            if not predictions:
                print("   OpenAlex model returned no valid predictions.")
                return []

            formal_topics = []
            for topic in predictions:
                # Check if the item is a dictionary with the keys we need
                if isinstance(topic, dict) and 'label' in topic and 'score' in topic:
                    # Low threshold: the observed scores are generally small
                    if topic['score'] > 0.01:
                        formal_topics.append({
                            'topic': topic['label'],
                            'score': round(topic['score'], 4)
                        })
                else:
                    print(f"   ⚠️ Skipping unexpected item in model predictions: {topic}")

            print(f"✅ Extracted {len(formal_topics)} formal topics from OpenAlex.")
            return formal_topics

        except Exception as e:
            print(f"⚠️ An exception occurred during topic classification: {e}")
            return []

    def fusion_logic(self, narrative_skills: List[str], formal_topics: List[Dict]) -> List[str]:
        """Combine narrative skills and formal topics, removing duplicates.

        A formal topic counts as a duplicate when its name (without any
        leading ``"<id>: "``) occurs, case-insensitively, as a substring of
        the joined narrative skills. Remaining topics are appended as
        ``"Expertise in <topic>"``.
        """
        print("🔄 Applying fusion logic...")
        combined_skills = list(narrative_skills)
        narrative_lower = ' '.join(narrative_skills).lower()

        for topic in formal_topics:
            topic_name = topic['topic'].split(': ', 1)[-1]  # Remove ID like "123: "
            is_duplicate = topic_name.lower() in narrative_lower
            if not is_duplicate:
                combined_skills.append(f"Expertise in {topic_name}")

        print(f"✅ Created final checklist with {len(combined_skills)} skills.")
        return combined_skills

    def process_solicitation(self, pdf_filepath: str) -> Optional["StructuredSolicitationObject"]:
        """Main processing pipeline for a PDF solicitation from a given path.

        Besides returning the result object, this also writes it to
        ``<pdf name>_analysis.json`` in the current working directory.

        Returns:
            The populated ``StructuredSolicitationObject``, or ``None`` when
            any step raises (the error is printed).
        """
        print("🚀 Starting PDF Solicitation Processing Pipeline")
        print("=" * 60)
        try:
            filename, title, abstract = self._extract_text_from_pdf(pdf_filepath)

            # Input for Claude can be a simple combination
            claude_input_text = f"Title: {title}. Abstract: {abstract}"
            print(f"\n📊 Text stats for analysis: {len(claude_input_text)} characters.")

            # Path A: Claude
            narrative_skills = self.extract_narrative_skills_claude(claude_input_text)

            # Path B: OpenAlex
            formal_topics = self.extract_formal_topics_openalex(title, abstract)

            # Path C: Fusion
            required_skills_checklist = self.fusion_logic(narrative_skills, formal_topics)

            solicitation_obj = StructuredSolicitationObject(
                solicitation_id=f"SOL_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
                title=title, abstract=abstract,
                processed_at=datetime.now().isoformat(),
                pdf_filename=filename,
                narrative_skills=narrative_skills,
                formal_topics=formal_topics,
                required_skills_checklist=required_skills_checklist,
                text_length=len(claude_input_text))

            output_filename = f"{filename.replace('.pdf', '')}_analysis.json"
            solicitation_obj.to_json(output_filename)
            print(f"\n✅ Processing complete! Saved to: {output_filename}")
            return solicitation_obj
        except Exception as e:
            print(f"❌ A fatal error occurred during processing: {e}")
            return None

    def display_results(self, solicitation_obj: Optional["StructuredSolicitationObject"]):
        """Display processing results in a readable format."""
        if not solicitation_obj:
            print("\nNo results to display due to a processing error.")
            return
        print("\n" + "="*60)
        print("📋 SOLICITATION PROCESSING RESULTS")
        print("="*60)
        print(f"🆔 ID: {solicitation_obj.solicitation_id}")
        print(f"📄 File: {solicitation_obj.pdf_filename}")
        print(f"📝 Title: {solicitation_obj.title}")
        print(f"\n🤖 Path A - Narrative Skills (Claude):")
        for i, skill in enumerate(solicitation_obj.narrative_skills, 1):
            print(f"   {i}. {skill}")
        print(f"\n🔬 Path B - Formal Topics (OpenAlex):")
        for i, topic in enumerate(solicitation_obj.formal_topics, 1):
            print(f"   {i}. {topic['topic']} (Score: {topic['score']:.3f})")
        print(f"\n✅ Final Hybrid Skills Checklist:")
        for i, skill in enumerate(solicitation_obj.required_skills_checklist, 1):
            print(f"   {i}. {skill}")
        print("\n" + "="*60)


# ==============================================================================
# Main Execution for Cell 1
# ==============================================================================
# Cell 1 driver: mounts Google Drive if needed, derives a run ID from the PDF
# file name, processes the solicitation and stores the result in the
# datastore. Any exception is printed rather than raised.
if __name__ == "__main__":
    warnings.filterwarnings('ignore')
    print("🚀 Starting Pipeline: Cell 1 - PDF Processing")
    print("=" * 60)

    try:
        # Mount Google Drive (skipped when the mount path already exists)
        if not os.path.exists(DRIVE_MOUNT_PATH):
            drive.mount(DRIVE_MOUNT_PATH, force_remount=True)
            print("✅ Google Drive mounted successfully.")
        else:
            print("✅ Google Drive already mounted.")

        # --- 1. Generate a Unique Run ID from the PDF Filename ---
        base_filename = os.path.basename(PDF_FILE_PATH)
        sanitized_base = os.path.splitext(base_filename)[0]
        # Make the filename safe for all systems: every character other than
        # letters, digits, "_" and "-" becomes "_"
        run_id = re.sub(r'[^a-zA-Z0-9_-]', '_', sanitized_base)
        print(f"🆔 Generated unique Run ID: {run_id}")

        # --- 2. Define Output Path ---
        solicitation_output_path = os.path.join(DATASTORE_PATH, f"{run_id}_solicitation_analysis.json")
        print(f"💾 Defined solicitation output path:\n   {solicitation_output_path}")


        # --- 3. Process the Solicitation ---
        processor = PDFSolicitationProcessor()
        # NOTE: process_solicitation returns the result object (None on
        # failure) and also writes its own "<pdf name>_analysis.json" into the
        # current working directory.
        result_obj = processor.process_solicitation(PDF_FILE_PATH)

        if result_obj:
            # --- 4. Save the Output Manually ---
            # Second copy, stored in the datastore under the run ID.
            result_obj.to_json(solicitation_output_path)
            print(f"\n✅ Analysis complete! Saved to: {solicitation_output_path}")
            processor.display_results(result_obj)
            print("\n✅ Cell 1 finished. Proceed to Cell 2.")
        else:
            print("❌ Processing failed, no result object created. Cannot proceed.")

    except Exception as e:
        # Top-level boundary for the cell: report and stop.
        print(f"❌ A critical error occurred in Cell 1: {e}")



In [None]:
# ==============================================================================
# Cell 2: Skill Affinity Engine
# ==============================================================================
import pandas as pd
import numpy as np
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
import os

class SkillAffinityEngine:
    """
    Phase 1: Core Analysis - Calculates affinity between researchers and required skills.

    Typical use: ``create_affinity_matrix(skills)`` loads the datastore and
    the sentence-transformer model on demand, embeds the skill phrases and
    scores every researcher listed in ``researcher_index`` against every skill.
    """
    def __init__(self, datastore_path="/content/drive/My Drive/datastore/"):
        self.datastore_path = datastore_path
        self.model = None             # SentenceTransformer, loaded lazily by embed_skills()
        self.metadata_df = None       # paper metadata; columns read here: researcher_id, researcher_name, work_id, recency_weight
        self.embeddings_array = None  # paper embeddings, addressed through embedding_index
        self.embedding_index = None   # work_id -> row position in embeddings_array
        self.researcher_index = None  # researcher_id -> list of work_ids
        self.datastore_info = None    # contents of datastore_info.json (not read by this class)

    @staticmethod
    def _read_json(path):
        """Return the parsed contents of the JSON file at ``path``."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def load_datastore(self):
        """Load the complete researcher profile datastore.

        All files are read into locals first and only assigned to the
        instance once every read succeeded. A partial failure therefore never
        leaves ``metadata_df`` set while the indexes are still ``None``, which
        would defeat the lazy-load check in ``create_affinity_matrix``.

        Raises:
            RuntimeError: if any datastore file cannot be read; the original
                error is chained as ``__cause__``.
        """
        print("📂 Loading Researcher Profile Datastore...")
        base = self.datastore_path
        try:
            metadata_df = pd.read_parquet(os.path.join(base, "researcher_profiles_metadata.parquet"))
            embeddings_array = np.load(os.path.join(base, "researcher_embeddings.npy"))
            embedding_index = self._read_json(os.path.join(base, "embedding_index.json"))
            researcher_index = self._read_json(os.path.join(base, "researcher_index.json"))
            datastore_info = self._read_json(os.path.join(base, "datastore_info.json"))
            print(f"🎯 Datastore ready: {metadata_df['researcher_id'].nunique()} researchers, {len(metadata_df)} papers")
        except Exception as e:
            raise RuntimeError(f"Failed to load datastore: {e}") from e

        self.metadata_df = metadata_df
        self.embeddings_array = embeddings_array
        self.embedding_index = embedding_index
        self.researcher_index = researcher_index
        self.datastore_info = datastore_info

    def load_model(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence transformer model named ``model_name``."""
        print(f"🤖 Loading sentence transformer model: {model_name}")
        self.model = SentenceTransformer(model_name)
        print("✅ Model loaded.")

    def embed_skills(self, skills_checklist):
        """Embed each skill phrase, loading the default model first if needed.

        Returns:
            The value of ``self.model.encode(skills_checklist)``; the code
            below relies on it having a ``.shape`` and being iterable row by row.
        """
        if self.model is None:
            self.load_model()
        print(f"🧠 Embedding {len(skills_checklist)} skills...")
        skill_embeddings = self.model.encode(skills_checklist, show_progress_bar=True)
        print(f"✅ Created skill embeddings: {skill_embeddings.shape}")
        return skill_embeddings

    def get_researcher_data(self, researcher_id):
        """Get embeddings, work_ids, and recency weights for a specific researcher.

        Works listed for the researcher but absent from ``embedding_index``
        are dropped. A work without a metadata row gets weight ``0.0``.

        Returns:
            ``(paper_embeddings, valid_work_ids, recency_weights)``, aligned
            element by element. When the researcher has no embedded work the
            result is ``(np.array([]), [], [])``.
        """
        work_ids = self.researcher_index.get(researcher_id, [])
        valid_work_ids = [wid for wid in work_ids if wid in self.embedding_index]
        positions = [self.embedding_index[wid] for wid in valid_work_ids]
        if not positions:
            return np.array([]), [], []
        paper_embeddings = self.embeddings_array[positions]
        paper_metadata = self.metadata_df[self.metadata_df['work_id'].isin(valid_work_ids)]
        weight_mapping = dict(zip(paper_metadata['work_id'], paper_metadata['recency_weight']))
        recency_weights = np.array([weight_mapping.get(wid, 0.0) for wid in valid_work_ids])
        return paper_embeddings, valid_work_ids, recency_weights

    def calculate_skill_affinity_score(self, paper_embeddings, skill_embedding, recency_weights):
        """Calculate SkillAffinityScore for one researcher against one skill.

        The score is the best recency-weighted cosine similarity between the
        skill and any of the researcher's papers, scaled by 100, clipped to
        ``[0, 100]`` and rounded to two decimals. No papers yields ``0.0``.
        """
        if len(paper_embeddings) == 0:
            return 0.0
        skill_embedding_2d = skill_embedding.reshape(1, -1)
        cosine_sims = cosine_similarity(paper_embeddings, skill_embedding_2d).flatten()
        weighted_sims = cosine_sims * recency_weights
        max_weighted_sim = np.max(weighted_sims)
        affinity_score = np.clip(max_weighted_sim * 100, 0, 100)
        # Plain float so the value serializes and compares like a builtin.
        return float(round(affinity_score, 2))

    def _researcher_display_names(self, researcher_ids):
        """Map researcher ids to display names, falling back to the id itself.

        Uses the first metadata row of each researcher. An id without any
        metadata row keeps its id as the label instead of raising.
        """
        first_rows = self.metadata_df.drop_duplicates(subset='researcher_id', keep='first')
        name_by_id = dict(zip(first_rows['researcher_id'], first_rows['researcher_name']))
        return [name_by_id.get(rid, rid) for rid in researcher_ids]

    def create_affinity_matrix(self, skills_checklist, solicitation_id=None):
        """Create the complete affinity matrix for all researchers and skills.

        Args:
            skills_checklist: non-empty list of skill phrases.
            solicitation_id: accepted for interface compatibility; not used here.

        Returns:
            ``(affinity_df, unique_researchers, skills_checklist)``.
            ``affinity_df`` has one row per researcher, labeled by name, and
            one ``Skill_NN: <first 50 chars>`` column per skill.
            ``unique_researchers`` holds the researcher ids in row order.

        Raises:
            ValueError: if ``skills_checklist`` is empty.
        """
        print("🎯 Creating Affinity Matrix...")
        print("=" * 50)
        if self.metadata_df is None:
            self.load_datastore()
        if not skills_checklist:
            raise ValueError("Skills checklist is empty. Cannot create affinity matrix.")
        skill_embeddings = self.embed_skills(skills_checklist)
        unique_researchers = list(self.researcher_index.keys())
        print(f"📊 Processing {len(unique_researchers)} researchers × {len(skills_checklist)} skills")
        affinity_matrix = np.zeros((len(unique_researchers), len(skills_checklist)))
        for i, researcher_id in enumerate(tqdm(unique_researchers, desc="Processing researchers")):
            paper_embeddings, _, recency_weights = self.get_researcher_data(researcher_id)
            if len(paper_embeddings) == 0:
                # Researcher has no embedded papers: the row stays all zeros.
                continue
            for j, skill_embedding in enumerate(skill_embeddings):
                affinity_matrix[i, j] = self.calculate_skill_affinity_score(
                    paper_embeddings, skill_embedding, recency_weights
                )
        researcher_names = self._researcher_display_names(unique_researchers)
        skill_columns = [f"Skill_{i+1:02d}: {skill[:50]}" for i, skill in enumerate(skills_checklist)]
        affinity_df = pd.DataFrame(affinity_matrix, index=researcher_names, columns=skill_columns)
        print(f"✅ Affinity Matrix created: {affinity_df.shape}")
        return affinity_df, unique_researchers, skills_checklist

    def analyze_affinity_matrix(self, affinity_df, skills_checklist):
        """Print summary statistics and insights for the affinity matrix.

        Reports the matrix size, score range and mean, the five researchers
        with the highest average score and the five skills with the lowest
        average score. Returns ``None``; an empty matrix only prints a warning.
        """
        # Imported locally: this cell's import list does not include `re`.
        import re

        print("\n" + "="*60 + "\n📊 AFFINITY MATRIX ANALYSIS\n" + "="*60)
        if affinity_df.empty or affinity_df.shape[1] == 0:
            print("⚠️ Affinity matrix is empty. Skipping analysis.")
            return
        print(f"📏 Matrix dimensions: {affinity_df.shape[0]} researchers × {affinity_df.shape[1]} skills")
        print(f"📈 Score range: {affinity_df.values.min():.2f} - {affinity_df.values.max():.2f}")
        print(f"📊 Mean affinity score: {affinity_df.values.mean():.2f}")
        researcher_avg_scores = affinity_df.mean(axis=1).sort_values(ascending=False)
        print(f"\n🏆 Top 5 Researchers (by average affinity):\n{researcher_avg_scores.head().to_string(float_format='%.2f')}")
        skill_avg_scores = affinity_df.mean(axis=0).sort_values()
        print(f"\n🎯 Most Challenging Skills (lowest average affinity):")
        for skill_col, score in skill_avg_scores.head().items():
            try:
                # Recover the full skill text from the 1-based number in the
                # column label, since the label only holds a truncated copy.
                original_skill_index = int(re.search(r'Skill_(\d+):', skill_col).group(1)) - 1
                print(f"   - {skills_checklist[original_skill_index][:60]}...: {score:.2f}")
            except (AttributeError, IndexError):
                # Label did not match the pattern or pointed past the list.
                print(f"   - {skill_col}: {score:.2f}")


    def save_affinity_matrix(self, affinity_df, csv_path, metadata_path, metadata):
        """
        Save the affinity matrix as CSV and its metadata as JSON.

        Both destinations are full file paths supplied by the caller; this
        method does not derive or create any path itself.
        """
        # Save main affinity matrix
        affinity_df.to_csv(csv_path)
        print(f"💾 Affinity matrix saved to: {csv_path}")

        # Save metadata
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)
        print(f"📋 Metadata saved to: {metadata_path}")


# ==============================================================================
# Main Execution for Cell 2
# ==============================================================================
if __name__ == "__main__":
    # Cell 2 driver: turn the skills checklist produced by Cell 1 into a
    # researcher x skill affinity matrix and persist it under the run id.
    warnings.filterwarnings('ignore')
    print("\n🚀 Starting Pipeline: Cell 2 - Affinity Matrix Generation")
    print("=" * 60)

    # Step 1: continue only when Cell 1 left both hand-off variables behind.
    if 'result_obj' in locals() and 'run_id' in locals() and result_obj is not None:
        print(f"✅ Received inputs for Run ID: {run_id}")
        try:
            # Step 2: build the engine and pull the inputs off the result object.
            engine = SkillAffinityEngine(datastore_path=DATASTORE_PATH)
            skills_checklist = result_obj.required_skills_checklist
            solicitation_id = result_obj.solicitation_id
            print(f"🎯 Analyzing {len(skills_checklist)} required skills.")

            # Step 3: score every researcher against every skill.
            affinity_df, unique_researchers, skills_list = engine.create_affinity_matrix(
                skills_checklist=skills_checklist,
                solicitation_id=solicitation_id,
            )

            # Step 4: output locations, both keyed by the run id.
            affinity_csv_path = os.path.join(DATASTORE_PATH, f"{run_id}_affinity_matrix.csv")
            affinity_metadata_path = os.path.join(DATASTORE_PATH, f"{run_id}_affinity_metadata.json")
            print(f"\n💾 Defined affinity matrix output path:\n   {affinity_csv_path}")
            print(f"💾 Defined affinity metadata output path:\n   {affinity_metadata_path}")

            # Step 5: describe the run, then write matrix and description.
            metadata_payload = {
                "created_at": pd.Timestamp.now().isoformat(),
                "solicitation_id": solicitation_id,
                "run_id": run_id,
                "matrix_shape": affinity_df.shape,
                "num_researchers": len(unique_researchers),
                "num_skills": len(skills_list),
                "researcher_ids": unique_researchers,
                "skills_checklist": skills_list,
                # Summary statistics over every cell of the matrix.
                "score_statistics": {
                    stat: float(getattr(affinity_df.values, stat)())
                    for stat in ("min", "max", "mean", "std")
                },
            }
            engine.save_affinity_matrix(
                affinity_df,
                csv_path=affinity_csv_path,
                metadata_path=affinity_metadata_path,
                metadata=metadata_payload,
            )

            # Step 6: print the analysis and hand over to the next cell.
            engine.analyze_affinity_matrix(affinity_df, skills_checklist)
            print("\n✅ Cell 2 finished. You may now proceed to Cell 3.")

        except Exception as e:
            print(f"❌ An error occurred during affinity analysis in Cell 2: {e}")
    else:
        print("❌ ERROR: Input variables from Cell 1 are missing.")
        print("   Please run Cell 1 successfully before running this cell.")

In [None]:
# ==============================================================================
# Cell 3: Dream Team Assembler & Strategic Output
# ==============================================================================
import pandas as pd
import numpy as np
import json
from datetime import datetime
import anthropic
from google.colab import userdata
from typing import List, Dict, Tuple
import warnings
import os

class DreamTeamAssembler:
    """
    Phase 2: Dream Team Assembly & Strategic Output.
    Generates optimal teams and creates comprehensive strategic reports.

    Team members are always identified by their row *position* in the
    affinity matrix, never by name, so duplicate researcher names are safe.
    """

    # Model used by generate_strategic_analysis() unless the caller overrides it.
    # NOTE(review): confirm this model id is still served by the Anthropic API.
    DEFAULT_CLAUDE_MODEL = "claude-3-sonnet-20240229"

    def __init__(self):
        self.claude_client = None
        self.setup_claude_api()

    def setup_claude_api(self):
        """Initialize Claude API client.

        Best effort: on any failure a warning is printed and
        ``claude_client`` stays ``None``, which makes
        ``generate_strategic_analysis`` return its basic fallback text.
        """
        try:
            api_key = userdata.get('ANTHROPIC_API_KEY')
            self.claude_client = anthropic.Anthropic(api_key=api_key)
            print("✅ Claude API client initialized for gap analysis")
        except Exception as e:
            print(f"⚠️ Claude API setup failed: {e}. Strategic analysis will be basic.")

    def load_affinity_matrix(self, csv_path, metadata_path=None):
        """Load the affinity matrix and associated metadata.

        Returns:
            ``(affinity_df, metadata)``. ``metadata`` is ``None`` when no
            path was given or the file does not exist.
        """
        print(f"📊 Loading affinity matrix from: {csv_path}")
        affinity_df = pd.read_csv(csv_path, index_col=0)
        print(f"✅ Loaded matrix: {affinity_df.shape[0]} researchers × {affinity_df.shape[1]} skills")
        metadata = None
        if metadata_path and os.path.exists(metadata_path):
            with open(metadata_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
            print("✅ Loaded affinity metadata")
        return affinity_df, metadata

    def calculate_team_coverage(self, affinity_df, team_indices):
        """Calculate team coverage scores for all skills.

        A skill's coverage is the best score any team member has for it.

        Returns:
            ``(skill_coverages, overall)``: the per-skill array and its mean.
            An empty team yields all zeros and ``0.0``.
        """
        if len(team_indices) == 0:
            return np.zeros(affinity_df.shape[1]), 0.0
        team_affinities = affinity_df.iloc[list(team_indices)]
        skill_coverages = team_affinities.max(axis=0).values
        return skill_coverages, float(np.mean(skill_coverages))

    def calculate_marginal_gain(self, affinity_df, current_team_indices, candidate_index):
        """Calculate the marginal gain of adding a candidate to the team.

        Returns the change in overall coverage; the team list is not mutated.
        """
        _, current_coverage = self.calculate_team_coverage(affinity_df, current_team_indices)
        _, new_coverage = self.calculate_team_coverage(
            affinity_df, list(current_team_indices) + [candidate_index]
        )
        return new_coverage - current_coverage

    def dream_team_greedy_algorithm(self, affinity_df, min_team_size=2, max_team_size=4,
                                    min_marginal_gain=0.5):
        """Implement the greedy algorithm to select the best team.

        The researcher with the highest average score becomes PI. Further
        members are added one at a time, always the candidate with the
        largest marginal gain in overall coverage.

        Args:
            affinity_df: researchers (rows) x skills (columns) score matrix.
            min_team_size: members are added regardless of gain until the
                team reaches this size.
            max_team_size: upper bound on the team size.
            min_marginal_gain: once ``min_team_size`` is reached, a candidate
                is only added if its gain is strictly greater than this.

        Returns:
            ``(selected_indices, selection_history)``: row positions in
            selection order (PI first) and one history dict per selection.

        Raises:
            ValueError: if the matrix has no rows or no columns.
        """
        print("🎯 Running Dream Team Greedy Algorithm...")
        print("=" * 50)
        n_researchers = len(affinity_df)
        if n_researchers == 0 or affinity_df.shape[1] == 0:
            raise ValueError("Affinity matrix is empty; cannot assemble a team.")
        selected_indices = []
        selection_history = []

        # Step 1: Select the best overall researcher as PI.
        # Picked by position: a label lookup (idxmax + Index.get_loc) returns
        # a slice or mask instead of an int when names are duplicated.
        average_scores = affinity_df.mean(axis=1).to_numpy(dtype=float)
        best_researcher_loc = int(np.nanargmax(average_scores))
        selected_indices.append(best_researcher_loc)
        _, initial_coverage = self.calculate_team_coverage(affinity_df, selected_indices)

        selection_history.append({
            'step': 1, 'action': 'Select PI',
            'researcher_name': affinity_df.index[best_researcher_loc],
            'reason': 'Highest average affinity score',
            'team_coverage': initial_coverage
        })
        print(f"🏆 Step 1 - PI Selection: {affinity_df.index[best_researcher_loc]} (Coverage: {initial_coverage:.2f})")

        # Step 2-N: Iteratively add members with the highest marginal gain
        for step in range(2, max_team_size + 1):
            already_selected = set(selected_indices)
            gains = [(idx, self.calculate_marginal_gain(affinity_df, selected_indices, idx))
                     for idx in range(n_researchers) if idx not in already_selected]
            if not gains:
                # Every researcher is already on the team.
                break

            best_candidate_idx, best_marginal_gain = max(gains, key=lambda item: item[1])

            if best_marginal_gain > min_marginal_gain or len(selected_indices) < min_team_size:
                selected_indices.append(best_candidate_idx)
                _, new_coverage = self.calculate_team_coverage(affinity_df, selected_indices)
                selection_history.append({
                    'step': step, 'action': 'Add Member',
                    'researcher_name': affinity_df.index[best_candidate_idx],
                    'reason': f'Maximum marginal gain (+{best_marginal_gain:.2f})',
                    'team_coverage': new_coverage
                })
                print(f"✅ Step {step} - Added: {affinity_df.index[best_candidate_idx]} (New Coverage: {new_coverage:.2f})")
            else:
                print(f"🛑 Step {step} - Stopping: No significant marginal gain found (best was +{best_marginal_gain:.2f}).")
                break

        final_coverage = self.calculate_team_coverage(affinity_df, selected_indices)[1]
        print(f"\n🎯 Final Dream Team ({len(selected_indices)} members) with {final_coverage:.2f} coverage.")
        return selected_indices, selection_history

    def generate_coverage_report(self, affinity_df, team_indices, skills_list):
        """Generate a detailed coverage report for the selected team.

        Args:
            affinity_df: researchers x skills score matrix.
            team_indices: row positions of the team members.
            skills_list: skill labels in column order. A column without a
                matching entry is labeled with its column name.

        Returns:
            Dict with ``overall_coverage_score``, ``team_members`` (name,
            average score, up to three best skills each) and
            ``skill_analysis`` (coverage, High/Medium/Low level and best team
            member per skill). Scores are plain floats.
        """
        team_positions = list(team_indices)
        skill_coverages, overall_coverage = self.calculate_team_coverage(affinity_df, team_positions)
        labels = [skills_list[pos] if pos < len(skills_list) else str(col)
                  for pos, col in enumerate(affinity_df.columns)]

        team_members = []
        for idx in team_positions:
            # Work on a plain array: integer keys on a label-indexed Series
            # are a deprecated positional fallback in pandas.
            scores = affinity_df.iloc[idx].to_numpy(dtype=float)
            best_first = np.argsort(scores, kind='stable')[-3:][::-1]
            top_skills = [{'skill': labels[pos], 'score': float(scores[pos])} for pos in best_first]
            team_members.append({
                'name': affinity_df.index[idx],
                'avg_affinity': float(scores.mean()),
                'top_skills': top_skills,
            })

        skill_analysis = []
        for col_pos, (skill, coverage) in enumerate(zip(labels, skill_coverages)):
            if team_positions:
                team_scores = affinity_df.iloc[team_positions, col_pos].to_numpy(dtype=float)
                best = int(np.argmax(team_scores))  # first member wins a tie
                expert = affinity_df.index[team_positions[best]]
                expert_score = float(team_scores[best])
            else:
                # Empty team: nobody covers the skill.
                expert, expert_score = None, 0.0
            coverage = float(coverage)
            skill_analysis.append({
                'skill': skill, 'coverage_score': coverage,
                'level': 'High' if coverage >= 70 else 'Medium' if coverage >= 40 else 'Low',
                'expert': expert, 'expert_score': expert_score
            })
        return {'overall_coverage_score': overall_coverage, 'team_members': team_members, 'skill_analysis': skill_analysis}

    def generate_strategic_analysis(self, coverage_report, skills_list, solicitation_data, model=None):
        """Generate AI-powered gap analysis using Claude API.

        Args:
            coverage_report: result of ``generate_coverage_report``.
            skills_list: accepted for interface compatibility; not used here.
            solicitation_data: mapping whose ``title`` goes into the prompt.
            model: Claude model id; defaults to ``DEFAULT_CLAUDE_MODEL``.

        Returns:
            The analysis text, or a fallback message when no client is
            configured or the API call fails. Never raises for API errors.
        """
        if not self.claude_client:
            return "Claude API not available. Basic analysis only: Review low-coverage skills and consider recruitment."
        print("🤖 Generating strategic analysis with Claude API...")
        # Create a detailed prompt (shortened for brevity, full logic assumed)
        title = (solicitation_data or {}).get('title', 'N/A')
        prompt = f"Analyze this research team's fit for the solicitation titled '{title}'.\n"
        prompt += f"Team has an overall coverage score of {coverage_report['overall_coverage_score']:.2f}.\n"
        low_skills = [s['skill'] for s in coverage_report['skill_analysis'] if s['level'] == 'Low']
        prompt += f"Potential Gaps (Low Coverage): {', '.join(low_skills) if low_skills else 'None'}.\n"
        prompt += "Provide a strategic report covering strengths, weaknesses, and actionable recommendations for the proposal."
        try:
            response = self.claude_client.messages.create(
                model=model or self.DEFAULT_CLAUDE_MODEL, max_tokens=2000, temperature=0.5,
                messages=[{"role": "user", "content": prompt}]
            )
            analysis = response.content[0].text
            print("✅ Strategic analysis generated.")
            return analysis
        except Exception as e:
            return f"Claude API analysis failed: {e}. Fallback: Review low-coverage skills: {low_skills}."

    def create_strategic_report(self, affinity_df, metadata, solicitation_data):
        """Main function to create a comprehensive strategic report.

        Args:
            affinity_df: researchers x skills score matrix.
            metadata: affinity metadata dict, or ``None`` when it could not
                be loaded. Skill names then come from the column labels.
            solicitation_data: solicitation analysis mapping.

        Returns:
            Dict with ``report_metadata``, ``coverage_analysis`` and
            ``strategic_analysis``.
        """
        print("\n🚀 CREATING STRATEGIC REPORT")
        print("=" * 60)
        # load_affinity_matrix() returns None when the metadata file is missing.
        metadata = metadata or {}
        skills_list = metadata.get('skills_checklist') or [
            col.split(': ', 1)[-1] for col in affinity_df.columns
        ]
        team_indices, history = self.dream_team_greedy_algorithm(affinity_df)
        coverage_report = self.generate_coverage_report(affinity_df, team_indices, skills_list)
        strategic_analysis = self.generate_strategic_analysis(coverage_report, skills_list, solicitation_data)

        return {
            'report_metadata': {'generated_at': datetime.now().isoformat(), 'solicitation_id': metadata.get('solicitation_id')},
            'coverage_analysis': coverage_report, 'strategic_analysis': strategic_analysis,
        }

    @staticmethod
    def _md_cell(value):
        """Render ``value`` so it cannot break a Markdown table row."""
        return str(value).replace('|', '\\|').replace('\n', ' ')

    def format_markdown_report(self, strategic_report):
        """Format the strategic report as a human-readable Markdown document.

        Returns the Markdown text: header, team table, skills coverage table
        (lowest coverage first) and the strategic analysis as a blockquote.
        """
        cell = self._md_cell
        meta = strategic_report['report_metadata']
        coverage = strategic_report['coverage_analysis']

        parts = [
            "# NSF Dream Team Strategic Report\n\n",
            f"**Generated:** {meta['generated_at']}\n",
            # The key is normally present with value None, so a .get()
            # default alone would never produce the 'N/A' placeholder.
            f"**Solicitation ID:** `{meta.get('solicitation_id') or 'N/A'}`\n\n",
        ]

        # --- Team Summary Table ---
        parts.append("## 🏆 Recommended Dream Team\n\n")
        parts.append(f"**Overall Team Coverage Score:** **`{coverage['overall_coverage_score']:.2f} / 100`**\n\n")
        parts.append("| Role | Researcher | Avg. Affinity | Top Expertise Areas |\n")
        parts.append("|:---|:---|:---:|:---|\n")
        for i, member in enumerate(coverage['team_members']):
            role = "**Principal Investigator (PI)**" if i == 0 else f"Co-Investigator {i+1}"
            top_skills = ", ".join(cell(s['skill']) for s in member['top_skills'])
            parts.append(f"| {role} | {cell(member['name'])} | `{member['avg_affinity']:.2f}` | {top_skills} |\n")

        # --- Coverage Analysis Table ---
        parts.append("\n## 📊 Skills Coverage Analysis\n\n")
        parts.append("| Skill / Expertise Area | Coverage | Level | Primary Expert |\n")
        parts.append("|:---|:---:|:---|:---|\n")
        for skill in sorted(coverage['skill_analysis'], key=lambda x: x['coverage_score']):
            level_emoji = "🟢" if skill['level'] == 'High' else "🟡" if skill['level'] == 'Medium' else "🔴"
            parts.append(
                f"| {cell(skill['skill'])} | `{skill['coverage_score']:.2f}` | "
                f"{level_emoji} {skill['level']} | {cell(skill['expert'])} |\n"
            )

        # --- Strategic Analysis ---
        parts.append("\n## 🧠 AI-Powered Strategic Analysis\n\n")
        parts.append("> " + strategic_report['strategic_analysis'].replace('\n', '\n> ') + "\n")
        return "".join(parts)

    def save_strategic_report(self, strategic_report, drive_base_path, local_base_path):
        """Saves the strategic report as JSON and Markdown to GDrive and local storage.

        ``drive_base_path`` and ``local_base_path`` are path prefixes; the
        suffixes ``_strategic_report.json`` / ``_strategic_report.md`` are
        appended. Files are written as UTF-8 because the Markdown has emoji.
        """
        print("\n💾 Saving strategic reports...")

        # --- Generate Markdown Content ---
        md_content = self.format_markdown_report(strategic_report)

        # --- Define Paths ---
        json_path = f"{drive_base_path}_strategic_report.json"
        drive_md_path = f"{drive_base_path}_strategic_report.md"
        local_md_path = f"{local_base_path}_strategic_report.md"

        # --- Save Files ---
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(strategic_report, f, indent=2)
        print(f"   📄 Saved full JSON report to Google Drive:\n      {json_path}")

        with open(drive_md_path, 'w', encoding='utf-8') as f:
            f.write(md_content)
        print(f"   📝 Saved Markdown report to Google Drive:\n      {drive_md_path}")

        with open(local_md_path, 'w', encoding='utf-8') as f:
            f.write(md_content)
        print(f"   💻 Saved Markdown report to Colab local storage:\n      {local_md_path}")

    def display_summary(self, strategic_report):
        """Displays a summary of the strategic report in the console."""
        print("\n" + "="*60)
        print("📋 DREAM TEAM STRATEGIC REPORT SUMMARY")
        print("="*60)
        coverage = strategic_report['coverage_analysis']
        print(f"🏆 Recommended Team ({len(coverage['team_members'])} members) --> Overall Score: {coverage['overall_coverage_score']:.2f}/100")
        for i, member in enumerate(coverage['team_members']):
            role = "PI" if i == 0 else f"Co-I {i+1}"
            print(f"   - **{member['name']}** ({role})")

        low_skills = [s for s in coverage['skill_analysis'] if s['level'] == 'Low']
        if low_skills:
            print(f"\n🔴 Identified {len(low_skills)} potential skill gaps (Low Coverage).")
            print("   Review the saved Markdown report for details.")
        else:
            print("\n🟢 Excellent coverage. No significant skill gaps were identified.")

# ==============================================================================
# Main Execution for Cell 3
# ==============================================================================
if __name__ == "__main__":
    # Cell 3 driver: load the artifacts written by Cells 1 and 2, assemble
    # the team, and write the strategic report to Drive and local storage.
    warnings.filterwarnings('ignore')
    print("\n🚀 Starting Pipeline: Cell 3 - Dream Team Assembly & Report Generation")
    print("=" * 60)

    # Step 1: the affinity CSV from Cell 2 must be known and present on disk.
    if 'affinity_csv_path' in locals() and os.path.exists(affinity_csv_path):
        print(f"✅ Received inputs for Run ID: {run_id}")
        try:
            # Step 2: the assembler sets up its Claude client on construction.
            assembler = DreamTeamAssembler()

            # Step 3: read back everything the earlier cells persisted.
            print("📊 Loading data using dynamic paths from previous cells...")
            affinity_df, affinity_metadata = assembler.load_affinity_matrix(
                affinity_csv_path,
                affinity_metadata_path,
            )
            with open(solicitation_output_path, 'r') as f:
                solicitation_data = json.load(f)
            print("✅ Loaded all necessary data.")

            # Step 4: pick the team and generate the analysis.
            strategic_report = assembler.create_strategic_report(
                affinity_df,
                affinity_metadata,
                solicitation_data,
            )

            # Step 5: both destinations share the run id as filename prefix;
            # the local one lives in Colab's temporary storage.
            report_drive_base_path = os.path.join(DATASTORE_PATH, run_id)
            report_local_base_path = f"/content/{run_id}"
            assembler.save_strategic_report(
                strategic_report,
                drive_base_path=report_drive_base_path,
                local_base_path=report_local_base_path,
            )

            # Step 6: console summary and closing banner.
            assembler.display_summary(strategic_report)
            print("\n\n✅✅✅ Pipeline Complete! ✅✅✅")
            print("Check the file browser on the left for the local report or your Drive folder.")

        except Exception as e:
            print(f"❌ An error occurred during dream team analysis in Cell 3: {e}")
    else:
        print("❌ ERROR: Input file from Cell 2 is missing.")
        print("   Please run Cells 1 and 2 successfully before running this cell.")