In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Location of the PhD thesis dataset (absolute path on a mapped shared drive).
PHD_DATASET_PATH = r'G:\Unidades compartidas\Planetary Wellbeing Mapping at UPF\Dataset\updated_datasets\updated_phd_dataset.csv'

phd_df = pd.read_csv(PHD_DATASET_PATH)

# Rewrite the exact 'field' value 'Social Sciences' to 'Social_Sciences'.
# NOTE(review): presumably this keeps the field label distinct from an
# identically named value in another column (e.g. 'domain') - confirm.
phd_df['field'] = phd_df['field'].replace({'Social Sciences': 'Social_Sciences'})

# Print the column / dtype / non-null summary of the loaded frame.
phd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3163 entries, 0 to 3162
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Author       3163 non-null   object 
 1   Supervisor   3160 non-null   object 
 2   Departament  3163 non-null   object 
 3   Date         3163 non-null   object 
 4   Abstract     3157 non-null   object 
 5   Handle       3163 non-null   object 
 6   Language     3163 non-null   object 
 7   Keyword      3163 non-null   object 
 8   Title        3163 non-null   object 
 9   topic_label  3163 non-null   object 
 10  topic_score  3163 non-null   float64
 11  topic_index  3163 non-null   int64  
 12  subfield     3163 non-null   object 
 13  field        3163 non-null   object 
 14  domain       3163 non-null   object 
dtypes: float64(1), int64(1), object(13)
memory usage: 370.8+ KB


In [4]:
import pandas as pd
import plotly.graph_objects as go
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

# Dash application object; the Sankey callback is registered against it.
app = dash.Dash(__name__)

# `phd_df` is expected to have been loaded by an earlier cell.

# Distinct department names, used to populate the dropdown.
departments = phd_df['Departament'].unique()

# Candidate "top N" values. Not referenced by the layout below; kept in case
# other cells rely on it.
top_n_options = [5, 10, 15]  # You can adjust this list as needed

# --- Individual controls -----------------------------------------------------
_department_options = [{'label': dept, 'value': dept} for dept in departments]

# Department selector; starts on the first department found in the data.
_department_dropdown = dcc.Dropdown(
    id='department-dropdown',
    options=_department_options,
    value=departments[0],
)

# Slider choosing how many top categories per hierarchy level are kept.
# NOTE(review): the default of 10 does not sit on the min=5 / step=10 grid
# (5, 15, 25, ...) - confirm this is intended.
_top_n_slider = dcc.Slider(
    id='top-n-slider',
    min=5,
    max=1000,
    step=10,
    value=10,
    # marks={i: str(i) for i in range(100, 1001)}  # Slider marks (disabled)
)

# Placeholder graph; its figure is supplied by the callback.
_sankey_graph = dcc.Graph(id='sankey-diagram')

# --- Page layout: dropdown, slider, then the diagram ------------------------
app.layout = html.Div(children=[
    _department_dropdown,
    _top_n_slider,
    _sankey_graph,
])

# Callback to update the Sankey diagram based on the selected department and top N values
# Callback to update the Sankey diagram based on the selected department and top N values
@app.callback(
    Output('sankey-diagram', 'figure'),
    [Input('department-dropdown', 'value'),
     Input('top-n-slider', 'value')]
)
def update_sankey(selected_department, top_n):
    """Build the domain -> field -> subfield Sankey figure for one department.

    Parameters
    ----------
    selected_department
        Value of the 'department-dropdown' control; rows of ``phd_df`` whose
        'Departament' column equals it are used.
    top_n
        Value of the 'top-n-slider' control; for each of 'domain', 'field',
        'subfield' and 'topic_label' only the ``top_n`` categories with the
        largest thesis counts are kept.

    Returns
    -------
    plotly.graph_objects.Figure
        A figure holding a single Sankey trace. When no rows survive the
        filtering the trace simply has no nodes and no links.
    """
    # Hierarchy levels drawn as node columns, left to right.
    levels = ['domain', 'field', 'subfield']
    level_colors = {
        'domain': "rgba(31, 119, 180, 0.4)",
        'field': "rgba(128, 0, 128, 0.4)",
        'subfield': "rgba(255, 165, 0, 0.4)",
    }

    # Filter data based on selected department
    filtered_df = phd_df[phd_df['Departament'] == selected_department]

    # One row per (domain, field, subfield, topic_label) with its thesis count
    agg_data = (
        filtered_df
        .groupby(['domain', 'field', 'subfield', 'topic_label'])
        .size()
        .reset_index(name='counts')
    )

    # Keep a row only if its value is among the top_n of *every* column.
    # All rankings are computed on the unfiltered aggregate, then combined.
    keep = pd.Series(True, index=agg_data.index, dtype=bool)
    for column in levels + ['topic_label']:
        top_values = agg_data.groupby(column)['counts'].sum().nlargest(top_n).index
        keep &= agg_data[column].isin(top_values)
    agg_data = agg_data[keep]

    # Nodes are identified by (level, label) rather than by label alone, so a
    # label that occurs at two levels yields two distinct nodes instead of
    # collapsing into one index and producing wrong or self-referencing links.
    node_keys = [
        (level, label)
        for level in levels
        for label in agg_data[level].drop_duplicates()
    ]
    node_indices = {key: position for position, key in enumerate(node_keys)}
    node_labels = [label for _, label in node_keys]
    node_colors = [level_colors[level] for level, _ in node_keys]

    # Links between adjacent levels: domain -> field, then field -> subfield.
    # Counts are summed per (source, target) pair so each pair gives one link.
    sources, targets, values, link_colors = [], [], [], []
    for source_level, target_level in zip(levels, levels[1:]):
        pair_counts = (
            agg_data
            .groupby([source_level, target_level])['counts']
            .sum()
            .reset_index()
        )
        for source_label, target_label, count in pair_counts.itertuples(index=False, name=None):
            sources.append(node_indices[(source_level, source_label)])
            targets.append(node_indices[(target_level, target_label)])
            values.append(int(count))
            # A link takes the colour of its *source* node.
            link_colors.append(level_colors[source_level])

    # Empty labels for each link
    link_labels = [""] * len(sources)

    fig = go.Figure(data=[go.Sankey(
        domain={'x': [0, 1], 'y': [0, 1]},
        orientation='h',
        valueformat='.0f',
        valuesuffix=' Thesis',
        node=dict(
            pad=15,
            thickness=40,
            line=dict(color='black', width=0.2),
            label=node_labels,
            color=node_colors
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            color=link_colors,
            label=link_labels
        )
    )])

    # Update layout to adjust figure size
    fig.update_layout(
        title_text="Sankey Diagram",
        font_size=10,
        width=1200,  # Width of the figure in pixels
        height=600   # Height of the figure in pixels
    )

    return fig  # Return the updated figure


if __name__ == '__main__':
    # `run_server` is the legacy Dash entry point and is rejected by newer
    # Dash releases in favour of `run`. Prefer `run` when the installed
    # version provides it; fall back to `run_server` on older versions.
    _start_server = getattr(app, 'run', None) or app.run_server
    _start_server(debug=True)
