In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import os
import json
import random
import logging

from pyvis.network import Network
from openai import OpenAI
from dotenv import load_dotenv

In [None]:
# Timestamp that gives every run its own log file. The format contains neither
# spaces nor colons, so no further sanitising is needed for use in a file name.
datetime = pd.to_datetime('today').strftime('%Y-%m-%d-%H-%M-%S')
logfile_name = f"./logs/{datetime}.log"

# logging.basicConfig opens the log file immediately and raises FileNotFoundError
# when the target directory is missing, so make sure it exists first.
os.makedirs(os.path.dirname(logfile_name), exist_ok=True)
logging.basicConfig(filename=logfile_name, level=logging.INFO, force=True)

df = pd.read_csv('./data/faculty_data.csv', encoding='ISO-8859-1')

# Map the spreadsheet headers to short snake_case column names.
# NOTE(review): the keys are matched against the CSV header text, so the unusual
# spellings ("Instutution", "Speacialty") presumably mirror the file - verify
# against the CSV before "correcting" them.
df = df.rename(columns={'Name': 'name',
                   'School': "school",
                    'Degree Program': "degree_program",
                    'UGA Affiliations (e.g. Centers or Institutes etc.)': "uga_affiliations",
                    'Previous Instutution(s)': "previous_institutions",
                    'PhD. Degree': "phd_degree",
                    'Interdisciplinary Areas': "interdisciplinary_areas",
                    'Broad Speacialty Areas / Expertise': "broad_specialties",
                    'Research Keywords': "research_keywords",
                    'Major Tool / Equipment': "equipment",
                    'Potential Sponsors': "potential_sponsors",
                    'UGA Collaborator(s)': "uga_collaborators",
                    'Outside Collaborator(s)': "outside_collaborators",
                    'Global Engagement': "global_engagement",
                    'Memberships': "memberships",
                    'Other Information': "other"
                })

# Rows with an empty name inherit the name from the closest named row above them.
df['name'] = df['name'].ffill()
df['name'] = df['name'].str.strip()

# Collapse all rows of one professor into a single row: the non-null values of every
# column are joined with ", ". After this step 'name' is the index.
df = df.groupby('name').agg(lambda x: ', '.join(x.dropna().astype(str)))

def combine_entries(series):
    """Collapse one column of a professor's rows into a single value.

    Text columns: the non-null cells are joined with ", ", split again on ", " and
    de-duplicated, keeping the first occurrence of every value so the result is the
    same on every run (a plain ``set`` would reorder the values between kernel
    restarts because string hashing is randomised).

    Other columns: the first non-null value is returned, or NaN when the column
    holds no value at all.

    Parameters
    ----------
    series : pandas.Series
        The values of one column within one group.

    Returns
    -------
    str or scalar
        The de-duplicated ", "-joined string for text columns, otherwise the first
        non-null value (NaN if there is none).
    """
    # Accept both the classic object dtype and pandas' dedicated string dtype.
    if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
        tokens = ', '.join(series.dropna()).split(', ')
        # dict.fromkeys removes duplicates while preserving insertion order.
        return ', '.join(dict.fromkeys(tokens))

    present = series.dropna()
    if present.empty:
        return np.nan
    return present.iloc[0]

# Second aggregation pass: run combine_entries over every column of each professor
# and turn the 'name' index back into an ordinary column.
df = (
    df.groupby('name')
    .agg(combine_entries)
    .reset_index()
)

# Only the name and the two research-area columns are sent to the first prompt.
gen_df = df.loc[:, ["name", "interdisciplinary_areas", "broad_specialties"]]
json_data = gen_df.to_json(indent=4, orient="records")

print(df.columns)

In [None]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# API key for the OpenAI client, read from the API_KEY environment variable.
API_KEY = os.getenv("API_KEY")

client = OpenAI(api_key=API_KEY)


# Prompt for the first model call: asks for every professor in `json_data` to be
# assigned to one of 4-5 broad disciplines and for the answer to be returned as a JSON
# object with the keys "insight" and "generated_disciplines".
# Doubled braces ({{ }}) are literal braces inside the f-string; {json_data} is the
# only interpolated value.
prompt = f"""
    You are categorizing engineering professors at the University of Georgia based on their interdisciplinary research areas and broad specialties.

    ## **Task Requirements**
    1. **You must group all professors into exactly 4-5 broad categories.**  
        - Example categories:  
            - "AI, Data Science, and Cyber-Physical Systems"  
            - "Biomedical and Health Engineering"  
            - "Energy, Environment, and Sustainability"  
            - "Materials, Manufacturing, and Robotics"  
            - "Education, Policy, and Social Impact in Engineering"  
        - **Categories must remain broad and standardized** while maintaining meaningful research connections.  
        - You may **slightly adjust** the names but **must not exceed 5 groups**.  

    2. **Strict Assignment Rules:**  
        - **Every professor must be assigned to one and only one category.**  
        - If a professor's research spans multiple areas, **assign them to the closest matching category.**  
        - If a professor's research data is completely empty, assign them `""` (empty string).  

    3. **Category Balance Enforcement:**  
        - If a category has **only one professor**, merge it with the most related category.  
        - If a category has an excessive number of professors, **split only if absolutely necessary** (and still ensure a max of 5 groups).  
        - The **final output must always contain 4-5 categories**, no more, no less.  

    4. **STRICT JSON Output Format:**  
        Your response **MUST BE STRICTLY JSON** and match the format below:  
    ```json
    {{
        "insight": "Some professors had highly interdisciplinary research, requiring careful classification.",
        "generated_disciplines": [
            {{
                "name": "Professor A",
                "discipline": "AI, Data Science, and Cyber-Physical Systems"
            }},
            {{
                "name": "Professor B",
                "discipline": "Biomedical and Health Engineering"
            }},
            {{
                "name": "Professor C",
                "discipline": "Materials, Manufacturing, and Robotics"
            }},
            ...
        ]
    }}
    ```

    **Deviating from this format will be considered incorrect output.**  

    ## **Input JSON Data**
    ```json
    {json_data}
    ```
"""


# First prompt (categorization of professors into disciplines)
completion = client.chat.completions.create(
    model="gpt-4o-mini",
    store=True,
    messages=[{
        "role": "user",
        "content": prompt  # Original categorization prompt
    }]
)

# The message content can be None (for example when the model refuses), so fall back
# to an empty string and let the check below report the problem clearly.
raw_content = (completion.choices[0].message.content or "").strip()

# The reply is expected to be one JSON object, but the model may wrap it in a Markdown
# fence (with or without a "json" tag) or add text around it. Slicing from the first
# "{" to the last "}" copes with all of these instead of assuming an exact wrapper.
json_start = raw_content.find("{")
json_end = raw_content.rfind("}")
if json_start == -1 or json_end < json_start:
    raise ValueError(
        "Model response did not contain a JSON object: " + repr(raw_content[:200])
    )

data = json.loads(raw_content[json_start:json_end + 1])
logging.info(data["insight"])

# One row per professor, indexed by the (whitespace-trimmed) professor name.
disciplines_df = pd.DataFrame(data["generated_disciplines"]).set_index("name")
disciplines_df.index = disciplines_df.index.str.strip()


In [None]:
# The relatedness prompt scores professor pairs on a 'generated_disciplines' field, but
# the disciplines produced by the first model call live in `disciplines_df` and are not
# part of `df` yet. Attach them to the payload (without modifying `df`) so the model
# actually receives the field it is told to use.
discipline_by_name = disciplines_df["discipline"]
# Series.map needs a unique index; keep the first answer if a name was returned twice.
discipline_by_name = discipline_by_name[~discipline_by_name.index.duplicated(keep="first")]

df_json = (
    df.assign(generated_disciplines=df["name"].map(discipline_by_name))
    .to_json(orient="records", indent=4)
)

# Prompt for the second model call: asks for a weighted list of related professors for
# every professor in `df_json`, returned as a JSON object with the keys "insight" and
# "generated_groups". Doubled braces ({{ }}) are literal braces inside the f-string;
# {df_json} is the only interpolated value.
# NOTE(review): 'overlapping_expertise' and 'current_affiliation' are not among the
# column names produced by the rename map in the data-loading cell, so the model has
# to infer them from the other fields - confirm this is intended.
second_prompt = f"""
    You are tasked with categorizing engineering professors at the University of Georgia based on their **relatedness to other professors**. Your goal is to **group professors into strictly defined clusters** based on shared collaborators, research areas, disciplines, and institutional affiliations.

    ## **Instructions**
    1. **You must calculate connection strength using only the following columns:**
        - **'uga_collaborators'** (Shared UGA collaborators)
        - **'outside_collaborators'** (Shared external collaborators)
        - **'interdisciplinary_areas'** (Shared interdisciplinary areas)
        - **'broad_specialties'** (Shared broad specialties)
        - **'generated_disciplines'** (Identical generated discipline)
        - **'overlapping_expertise'** (Number of overlapping expertise)
        - **'current_affiliation'** (Same or highly related affiliation)
        - **'school'** (Same or highly related school)

    2. **Connection weights MUST be assigned using the following STRICT formula:**
        - **Overlapping Expertise (Weight: 3 per shared expertise)** → Example: If two professors share 2 overlapping expertise areas, weight = **6.0**.
        - **Shared UGA Collaborators (Weight: 2 per collaborator)** → Example: If two professors share 3 UGA collaborators, weight = **6.0**.
        - **Shared Outside Collaborators (Weight: 1.5 per collaborator)** → Example: If two professors share 3 outside collaborators, weight = **4.5**.
        - **Shared Interdisciplinary Areas (Weight: 1 per area)** → Example: If two professors share 2 interdisciplinary areas, weight = **2.0**.
        - **Shared Broad Specialties (Weight: 1 per specialty)** → Example: If two professors share 3 broad specialties, weight = **3.0**.
        - **Same Generated Discipline (Weight: 2 if True, 0 if False)**.
        - **Same Current Affiliation / Degree Program (Weight: 0.5 if True, 0 if False)**.
        - **Same School (Weight: 0.3 if True, 0 if False)**.
        - **TOTAL connection strength is the sum of all applicable weights. The final value MUST be rounded to one decimal place.**

    3. **Ensure consistency in reasoning by explicitly listing column contributions for each connection.**  
    **Do NOT generate random relationships**—only professors with nonzero connection weight should be included in the output.

    ## **Example of the Required JSON Output Format**
    Your response **MUST ONLY** be a JSON object in this format:
    ```json
    {{
        "insight": "Some professors had no strong connections due to lack of collaborators or overlapping expertise.",
        "generated_groups": [
            {{
                "name": "Professor A",
                "related_professors": [
                    {{
                        "name": "Professor B",
                        "weight": 6.5,
                        "reasoning": {{
                            "uga_collaborators": 2,
                            "outside_collaborators": 1,
                            "interdisciplinary_areas": 1,
                            "broad_specialties": 0,
                            "generated_disciplines": true,
                            "overlapping_expertise": 0,
                            "current_affiliation": false,
                            "school": false
                        }}
                    }},
                    {{
                        "name": "Professor C",
                        "weight": 3.0,
                        "reasoning": {{
                            "uga_collaborators": 0,
                            "outside_collaborators": 0,
                            "interdisciplinary_areas": 0,
                            "broad_specialties": 0,
                            "generated_disciplines": true,
                            "overlapping_expertise": 0,
                            "current_affiliation": false,
                            "school": false
                        }}
                    }}
                ]
            }},
            ...
        ]
    }}
    ```
    **Failure to follow this format exactly will result in incorrect output. Do not deviate from these instructions.**

    ## **Input Data**
    ```json
    {df_json}
    ```
"""

# Get response for second prompt (relatedness)
completion = client.chat.completions.create(
    model="gpt-4o-mini",
    store=True,
    messages=[{
        "role": "user",
        "content": second_prompt  # Relatedness prompt
    }]
)

# The message content can be None (for example when the model refuses), so fall back
# to an empty string and let the check below report the problem clearly.
raw_content = (completion.choices[0].message.content or "").strip()

# The reply is expected to be one JSON object, but the model may wrap it in a Markdown
# fence (with or without a "json" tag) or add text around it. Slicing from the first
# "{" to the last "}" copes with all of these instead of assuming an exact wrapper.
json_start = raw_content.find("{")
json_end = raw_content.rfind("}")
if json_start == -1 or json_end < json_start:
    raise ValueError(
        "Model response did not contain a JSON object: " + repr(raw_content[:200])
    )

relatedness_data = json.loads(raw_content[json_start:json_end + 1])

# Now create DataFrame
relatedness_df = pd.DataFrame(relatedness_data["generated_groups"])
relatedness_df = relatedness_df.reset_index(drop=True).set_index('name')

# Convert 'related_professors' to a list of tuples (name, weight).
# - Names are stripped just like the index below, so a related name can later be
#   looked up in the name-indexed frames.
# - A group that came back without a list (missing key -> NaN) becomes an empty list
#   instead of crashing the conversion.
relatedness_df["related_professors"] = relatedness_df["related_professors"].apply(
    lambda related: [(rel["name"].strip(), float(rel["weight"])) for rel in related]
    if isinstance(related, list) else []
)

relatedness_df.index = relatedness_df.index.str.strip()
logging.info(relatedness_data["insight"])

print(relatedness_df)

# Log the per-column reasoning the model gave for every reported relation.
for group in relatedness_data['generated_groups']:
    for professor in group.get('related_professors') or []:
        reasoning = professor.get('reasoning')
        logging.info(f"Reasoning for {professor['name']}:\n{reasoning}\n")

logging.info(relatedness_df["related_professors"])

In [None]:
# Merge the dataframes
df = df.reset_index(drop=True).set_index('name')

# The model can return the same professor more than once. Duplicate index labels on
# the right-hand side would multiply rows in a left merge, and a duplicated name makes
# `df.loc[name, column]` return a Series instead of a single value, so keep only the
# first answer per professor.
disciplines_unique = disciplines_df[~disciplines_df.index.duplicated(keep="first")]
relatedness_unique = relatedness_df[~relatedness_df.index.duplicated(keep="first")]

# Left merges keep every professor from the faculty data; professors the model did not
# mention get NaN in the added columns.
df = df.merge(disciplines_unique, left_index=True, right_index=True, how="left")
df = df.merge(relatedness_unique, left_index=True, right_index=True, how="left")

# Clear all blank strings
# df.replace("", np.nan, inplace=True)

In [None]:
# When True, the graph cell builds the network around a single professor.
search_professor = True
# NOTE(review): not read by any code visible in this notebook - presumably the column
# meant for a category-based search; confirm or remove.
category_search_parameter = "discipline"
# Index label (professor name) looked up in `df` and used as the root of the graph.
professor_search_parameter = "Beiwen Li"
# Relations whose weight is below this threshold are skipped when building the graph.
min_weight = 0.2

In [None]:
# `df` is indexed by professor name at this point, so the index has to be written out;
# with index=False the exported rows would not say which professor they belong to.
df.to_csv('csv_test_3.csv', index=True)

In [None]:
import math

def _collect_relatedness_graph(frame, root, root_relations, min_weight):
    """Build the relatedness network around ``root`` by breadth-first expansion.

    Parameters
    ----------
    frame : pandas.DataFrame
        Indexed by professor name, with a 'related_professors' column holding a list
        of (name, weight) tuples, or a non-list placeholder (e.g. NaN) when a
        professor has no relations.
    root : str
        Name of the professor the graph is centred on.
    root_relations : list or other
        The 'related_professors' value of ``root``.
    min_weight : float
        Relations with a smaller weight are skipped.

    Returns
    -------
    networkx.Graph
        One node per reached professor; every professor other than ``root`` is
        connected to the professor through which it was first reached, and the edge
        carries that relation's weight in its 'weight' attribute.

    NOTE(review): a relation between two professors that are both already in the graph
    is not drawn, so the result is always a tree. If the complete relatedness network
    is wanted, edges between already-known nodes need to be added as well - confirm
    the intent.
    """
    graph = nx.Graph()
    graph.add_node(root)

    # A professor counts as seen exactly when their node has been added.
    seen = {root}

    # FIFO of (professor, relations) pairs. An advancing cursor is used instead of
    # list.pop(0), which shifts the whole list on every call.
    pending = [(root, root_relations)]
    cursor = 0
    while cursor < len(pending):
        current, relations = pending[cursor]
        cursor += 1

        # Anything that is not a list (NaN from the left merge, None, ...) means
        # "no relations". isinstance avoids the TypeError math.isnan raises for
        # non-numeric values.
        if not isinstance(relations, list):
            continue

        for related, weight in relations:
            if min_weight > weight or related in seen:
                continue

            graph.add_node(related)
            graph.add_edge(current, related, weight=weight)
            seen.add(related)

            # Expand the newly added professor in a later layer, but only when that
            # can still add someone new. The entry is queued once, not once per
            # unseen neighbour.
            if related in frame.index:
                next_relations = frame.loc[related, 'related_professors']
                if isinstance(next_relations, list) and any(
                    name not in seen for name, _ in next_relations
                ):
                    pending.append((related, next_relations))

    return graph


if search_professor:
    # Find the row corresponding to the professor_search_parameter
    professor_data = df[df.index == professor_search_parameter]

    if not professor_data.empty:
        # Extract the professor's related professors and their weights
        related_professors = professor_data['related_professors'].values[0]

        # Graph of the professor and everyone reachable through their relations
        G = _collect_relatedness_graph(
            df, professor_search_parameter, related_professors, min_weight
        )

        # Show the weight on every edge, both as a label and as a hover title
        for _, _, edge_attrs in G.edges(data=True):
            edge_attrs['label'] = str(edge_attrs['weight'])
            edge_attrs['title'] = str(edge_attrs['weight'])
            edge_attrs['font'] = {'size': 12, 'weight': 'normal', 'color': 'white', 'strokeWidth': 0.}  # Adjust font size, weight, and color

        # Spring layout; the fixed seed makes the node positions reproducible
        pos = nx.spring_layout(G, seed=42)

        # The searched professor is pink, everyone else light green
        node_colors = ["pink" if node == professor_search_parameter else "lightgreen" for node in G.nodes]

        # Draw the graph with better edge styling
        nx.draw(
            G, pos, with_labels=True, node_color=node_colors, node_size=1500,
            font_size=12, font_weight='normal', edge_color='gray', width=2,
            alpha=0.7, edgecolors="black"
        )

        # Draw the edge labels (weights)
        edge_labels = nx.get_edge_attributes(G, 'weight')
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=10)

        # Interactive HTML version of the same graph
        nt = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", select_menu=True, filter_menu=True)
        #nt.show_buttons(filter_=['physics'])

        nt.set_options("""
        const options = {
            "nodes": {
                "color": {
                    "background": "lightgreen",
                    "border": "white",
                    "highlight": {
                        "background": "pink",
                        "border": "pink"
                    },
                    "hover": {
                        "background": "yellow",
                        "border": "white"
                    }
                },
                "font": {
                    "color": "white"
                }
            },
            "edges": {
                "color": {
                    "color": "gray",
                    "highlight": "pink"
                },
                "width": 2,
                "smooth": {
                    "type": "curvedCW",
                    "roundness": 0.2
                }
            },
            "interaction": {
                "hover": true,
                "selectConnectedEdges": true
            },
            "physics": {
                "barnesHut": {
                    "theta": 0.35,
                    "damping": 0.41,
                    "avoidOverlap": 0.1
                },
                "minVelocity": 0.75
            }
        }
        """)

        nt.from_nx(G)

        # Written next to the notebook as "<Professor_Name>.html"
        nt.show(f'{professor_search_parameter.replace(" ", "_")}.html', notebook=False)

        #nt.show('nx.html')

        # Title with the professor's name
        #plt.title(f"Professor: {professor_search_parameter} and Related Professors", fontsize=14)

        # Adjust layout and remove tight_layout to avoid the warning
        #plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)
        #plt.show()

    else:
        print(f"Professor '{professor_search_parameter}' not found in the dataset.")
