In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px
import time as time
import plotly.graph_objects as go
from scipy import stats
import math as math

In [None]:
path_students = r"/data/phd_students.h5"
path_coordinates = r"/data/coordinates.csv"

start = time.time()
# Load the PhD student table and the per-discipline coordinate matrix.
phdStudents = pd.DataFrame(pd.read_hdf(path_students))
coordinates_df = pd.read_csv(path_coordinates)
# Column 0 is used as the list of discipline names; the remaining columns are
# the numeric coordinates fed to t-SNE.
disciplines = coordinates_df.iloc[:, 0].tolist()
coordinates_df = coordinates_df.iloc[:, 1:]
matrix_coord = coordinates_df.to_numpy()
# 2-D embedding with one row per discipline; random_state is fixed for reproducibility.
embedded = TSNE(n_components=2, learning_rate='auto', random_state=42, perplexity=5).fit_transform(matrix_coord)
nb_sups = 2  # number of name_supervisor{k} / areas_supervisor{k} columns read later
n = len(disciplines)
# One colour per discipline. The palette is cycled so that disc_colors always has
# exactly n entries, even if there are more disciplines than palette colours
# (a plain [:n] slice would silently come up short and break disc_colors[i] lookups).
palette = px.colors.qualitative.Set2 + px.colors.qualitative.Set1 + px.colors.qualitative.Set3
disc_colors = [palette[k % len(palette)] for k in range(n)]
# Empty frame that defines the column layout of the per-student plotting table.
df_to_plot = pd.DataFrame(
    columns=["id_student", "x", "y", "name", "color", "distance_areas_supervisors", "num_pubs_student", "discipline_student_scopus"])

# Seconds spent loading data and computing the embedding.
initializing = time.time() - start
print(initializing) #4.866900205612183

In [None]:
# Build one plotting record per PhD student and assemble the DataFrame in a single
# step (growing a DataFrame row by row with .loc is slow and yields object columns).
records = []
for loop_index, (_, student) in enumerate(phdStudents.iterrows(), start=1):
    if loop_index % 1000 == 0:
        print("Processing student : ", loop_index,"/",len(phdStudents))
    main_disc = student["discipline_student_scopus"]
    student_name = student["name_student"].title()
    # The areas are stored as text; [2:-2] strips two wrapping characters on each
    # side (presumably "[[" and "]]" -- TODO confirm against the HDF file).
    areas = np.array([float(x) for x in student["areas_student"][2:-2].split(", ")])
    nb_pub_student = int(student["num_pubs_student"])

    if nb_pub_student != 0:
        # Position = area-weighted combination of the discipline embedding points.
        coordinates = areas.dot(embedded)
        # Colour of the dominant area, falling back to the main discipline when
        # the area vector sums to zero.
        if areas.sum() > 0:
            color = disc_colors[np.argmax(areas)]
        else:
            color = disc_colors[disciplines.index(main_disc)]
    else:
        # No publications: place the student at the barycenter of the supervisors.
        # Each supervisor's areas are read from the column with the SAME index as
        # the name that passed the validity check, so a missing supervisor1 does
        # not make us read supervisor1's areas in place of supervisor2's.
        supervisors_coords = []
        for sup_idx in range(1, nb_sups + 1):
            sup_name = student[f"name_supervisor{sup_idx}"]
            if not isinstance(sup_name, str) or sup_name in ("nan", ""):
                continue
            sup_areas = np.array(
                [float(x) for x in student[f"areas_supervisor{sup_idx}"][2:-2].split(", ")]
            )
            supervisors_coords.append(sup_areas.dot(embedded))
        if supervisors_coords:
            coordinates = np.mean(supervisors_coords, axis=0)
        else:
            # No usable supervisor: np.mean([]) would be a scalar NaN and break the
            # coordinates[0] lookup below. Use (0, 0), which the plotting section
            # already treats as "no coordinates" and filters out.
            coordinates = np.zeros(embedded.shape[1])
        color = "black"
    records.append({
        "id_student": student["id_scopus_student"],
        "x": coordinates[0],
        "y": coordinates[1],
        "name": student_name,
        "color": color,
        "distance_areas_supervisors": student["distance_areas_supervisors"],
        "num_pubs_student": nb_pub_student,
        "discipline_student_scopus": main_disc
    })
df_to_plot = pd.DataFrame(
    records,
    columns=["id_student", "x", "y", "name", "color", "distance_areas_supervisors", "num_pubs_student", "discipline_student_scopus"],
)
# Seconds elapsed since `start`, i.e. including the initialisation cell.
creating_df = time.time() - start
print(creating_df)

In [None]:
output_path = r"/data"

# Persist the per-student plotting table. A forward slash is used as separator so the
# file lands at /data/df_to_plot_withDist.parquet, the path it is read back from later;
# a backslash is not a path separator on POSIX and would create a differently named file.
df_to_plot.to_parquet(output_path + "/df_to_plot_withDist.parquet", engine="pyarrow")
# Seconds elapsed since `start`, including the write above.
end = time.time() - start

In [None]:
# Report the cumulative elapsed-time checkpoints recorded by the previous cells.
for label, elapsed in (
    ("initializing : ", initializing),
    ("creating df : ", creating_df),
    ("end : ", end),
):
    print(label, elapsed)

In [None]:
####################################################################################################################
# Disciplines
####################################################################################################################

# One row per discipline, positioned at its t-SNE point. The frame is built from a
# list of records in one step rather than enlarged row by row with .loc, which
# keeps proper column dtypes and avoids repeated reallocation.
disc_to_plot = pd.DataFrame(
    [
        {
            "x": embedded[i, 0],
            "y": embedded[i, 1],
            "type": "discipline",
            "name": disc,
            "color": disc_colors[i],
            "size": 30,
            "text": disc,
            "label": disc,
            "marker_symbol": "circle",
            "text_position": "middle center",
            "nb_pubs": 0
        }
        for i, disc in enumerate(disciplines)
    ],
    columns=["x", "y", "type", "name", "color", "size", "text", "label", "marker_symbol", "text_position", "nb_pubs"],
)

# Labelled marker trace for the disciplines; it is overlaid on every figure below.
disc_trace = go.Scattergl(
    name="Disciplines",
    x=disc_to_plot["x"].tolist(),
    y=disc_to_plot["y"].tolist(),
    mode='markers+text',
    marker=dict(
        color=disc_to_plot["color"].tolist(),
        size=disc_to_plot["size"].tolist(),
        symbol=disc_to_plot["marker_symbol"].tolist(),
        line=dict(width=0),
    ),
    text=disc_to_plot["text"].tolist(),
    textposition=disc_to_plot["text_position"].tolist(),
    hoverinfo='text',
    hovertext=disc_to_plot["label"].tolist(),
    opacity=1
)
print("disc_trace done")

In [None]:
# Copy of the student table with missing supervisor distances replaced by 0.
# The column is reassigned explicitly: calling fillna(inplace=True) on a column
# selection is chained assignment, which is deprecated in pandas and has no effect
# on the frame once Copy-on-Write is enabled.
test_df = df_to_plot.copy()
test_df["distance_areas_supervisors"] = test_df["distance_areas_supervisors"].fillna(0)

In [None]:
# Reload the per-student plotting table from the parquet file on disk.
parquet_path = r"/data/df_to_plot_withDist.parquet"
df_to_plot = pd.read_parquet(parquet_path)

In [None]:
# Preview the first 100 rows of the student table.
df_to_plot.iloc[:100]

In [None]:
disc_xs = list(disc_to_plot["x"])
disc_ys = list(disc_to_plot["y"])
# Drop the PhD students located exactly at the origin (0, 0).
at_origin = (df_to_plot["x"] == 0) & (df_to_plot["y"] == 0)
df_to_plot = df_to_plot[~at_origin]
# Drop the PhD students whose x equals some discipline's x AND whose y equals some
# discipline's y (the two matches are tested independently of each other).
on_discipline = df_to_plot["x"].isin(disc_xs) & df_to_plot["y"].isin(disc_ys)
df_to_plot = df_to_plot[~on_discipline]

In [None]:
# Keep only the students that have at least one publication.
has_publications = df_to_plot["num_pubs_student"].gt(0)
df_to_plot = df_to_plot.loc[has_publications]

In [None]:
# Order the students from fewest to most publications (ascending is the default).
df_to_plot = df_to_plot.sort_values("num_pubs_student")

In [None]:
# Number of students left after filtering; also used in the output file names.
sample_size = df_to_plot.shape[0]
print("sample_size", sample_size)

In [None]:
####################################################################################################################
# Scatter plot of PhD students
####################################################################################################################
# Colour every student with the colour of their main discipline.
colors = []
for student_disc in df_to_plot["discipline_student_scopus"]:
    colors.append(disc_colors[disciplines.index(student_disc)])
print("colors assigned")

student_marker = {"color": colors, "size": 3}
phdStudents_go = go.Scattergl(
    name="PhD students",
    x=df_to_plot["x"].tolist(),
    y=df_to_plot["y"].tolist(),
    mode='markers',
    marker=student_marker,
    opacity=1,
    hoverinfo='text',
    # Hovering a point shows the student's publication count.
    hovertext=df_to_plot["num_pubs_student"],
)

# Students first, disciplines drawn on top.
fig_student = go.Figure(phdStudents_go)
fig_student.add_trace(disc_trace)

hidden_ticks = {"showticklabels": False}
# Horizontal legend anchored above the top-right corner of the plot.
top_right_legend = {
    "orientation": "h",
    "yanchor": "bottom",
    "y": 1.02,
    "xanchor": "right",
    "x": 1,
}
fig_student.update_layout(
    showlegend=True,
    xaxis=hidden_ticks,
    yaxis=dict(hidden_ticks),
    legend=top_right_legend,
    template="plotly",
)

# Export as a static image and as interactive HTML, then display inline.
fig_student.write_image(f"./results_new/fig{sample_size}_disc_scatter.png")
fig_student.write_html(f"./results_new/fig{sample_size}_disc_scatter.html")
fig_student.show()

In [None]:
####################################################################################################################
# Publications heatmap
####################################################################################################################
# Mean of log1p(number of publications) per cell of a 150 x 100 grid.
# (Alternative kept for reference: pass the raw num_pubs_student values instead of
# their log1p to bin the untransformed counts.)
log_pubs = list(map(np.log1p, df_to_plot["num_pubs_student"].tolist()))
hist, x_edges, y_edges, binnumber = stats.binned_statistic_2d(
    df_to_plot["x"].tolist(),
    df_to_plot["y"].tolist(),
    log_pubs,
    statistic='mean',
    bins=[150,100],
)
# Cells whose mean is exactly 0 become NaN so they are rendered transparent.
hist = np.where(hist != 0, hist, np.nan)

pubs_heatmap = go.Heatmap(
    name="Publications heatmap",
    # Lower edge of every bin along each axis.
    x = x_edges[:-1],
    y = y_edges[:-1],
    # Transposed so that rows of z follow the y bins.
    z = hist.T,
    zsmooth= 'best',
    colorscale="Plasma",
    hovertext=df_to_plot["num_pubs_student"].tolist(),
    colorbar={"title": 'Number of publications'},
    showscale=True,
    showlegend=True,
)

# Heatmap first, disciplines drawn on top.
fig_pubs_heatmap = go.Figure(pubs_heatmap)
fig_pubs_heatmap.add_trace(disc_trace)

# Horizontal legend anchored above the top-right corner of the plot.
top_right_legend = {
    "orientation": "h",
    "yanchor": "bottom",
    "y": 1.02,
    "xanchor": "right",
    "x": 1,
}
fig_pubs_heatmap.update_layout(
    title="Publications heatmap",
    showlegend=True,
    xaxis={"showticklabels": False},
    yaxis={"showticklabels": False},
    legend=top_right_legend,
    template="plotly",
)

# Export as a static image and as interactive HTML, then display inline.
fig_pubs_heatmap.write_image(f"./results_new/fig{sample_size}_productivity_heatmap_log_smooth.png")
fig_pubs_heatmap.write_html(f"./results_new/fig{sample_size}_productivity_heatmap_log_smooth.html")
fig_pubs_heatmap.show()

In [None]:
####################################################################################################################
# Density heatmap
####################################################################################################################
# Viridis preceded by a fully transparent first stop.
density_colorscale = ["rgba(68, 1, 84,0)", *px.colors.sequential.Viridis]
density_heatmap = go.Histogram2d(
    x=df_to_plot["x"],
    y=df_to_plot["y"],
    nbinsx=150,
    nbinsy=100,
    colorscale=density_colorscale,
    colorbar={"title": "Density"},
    name="Density heatmap",
    showscale=True,
    showlegend=True,
    histnorm="density",
)

# Density first, disciplines drawn on top.
fig_density_heatmap = go.Figure(density_heatmap)
fig_density_heatmap.add_trace(disc_trace)

# Horizontal legend anchored above the top-right corner of the plot.
top_right_legend = {
    "orientation": "h",
    "yanchor": "bottom",
    "y": 1.02,
    "xanchor": "right",
    "x": 1,
}
fig_density_heatmap.update_layout(
    title="Density heatmap",
    showlegend=True,
    xaxis={"showticklabels": False},
    yaxis={"showticklabels": False},
    legend=top_right_legend,
)

# Export as a static image and as interactive HTML, then display inline.
fig_density_heatmap.write_image(f"./results_new/fig{sample_size}_density_heatmap.png")
fig_density_heatmap.write_html(f"./results_new/fig{sample_size}_density_heatmap.html")
fig_density_heatmap.show()

In [None]:
####################################################################################################################
# Supervisor Distance heatmap
####################################################################################################################
# Supervisor distances with missing values replaced by 0. They are taken from
# df_to_plot itself, i.e. the same filtered and sorted rows that provide x and y.
# The earlier test_df copy was made before filtering/sorting, so its values would
# not line up with (or even have the same length as) the coordinates.
sup_distances = df_to_plot["distance_areas_supervisors"].fillna(0).tolist()
# Mean supervisor distance per cell of a 150 x 100 grid.
hist, x_edges, y_edges, binnumber = stats.binned_statistic_2d(
    df_to_plot["x"].tolist(), df_to_plot["y"].tolist(), sup_distances, statistic='mean', bins=[150,100]
)
# Convert 0 values to NaN for transparency
hist = np.where(hist == 0, np.nan, hist)  # Set 0s to NaN
sup_dist_heatmap = go.Heatmap(
    name="Supervisor Distance heatmap",
    # Lower edge of every bin along each axis.
    x = x_edges[:-1],
    y = y_edges[:-1],
    # Transposed so that rows of z follow the y bins.
    z = hist.T,
    colorscale="Plasma",
    # NOTE(review): this is a flat per-student list, whereas z is a 2-D per-bin grid,
    # so the hover text does not describe the hovered cell -- confirm intent.
    hovertext=df_to_plot["distance_areas_supervisors"].tolist(),
    colorbar=dict(title='Supervisor Distance'),
    showscale=True,
    showlegend=True
)
# Heatmap first, disciplines drawn on top.
fig_sup_dist_heatmap = go.Figure(sup_dist_heatmap)
fig_sup_dist_heatmap.add_trace(disc_trace)
fig_sup_dist_heatmap.update_layout(
    title="Supervisor Distance",
    showlegend=True,
    xaxis=dict(showticklabels=False),
    yaxis=dict(showticklabels=False),
    # Horizontal legend anchored above the top-right corner of the plot.
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template="plotly",
)

# Export as a static image and as interactive HTML, then display inline.
fig_sup_dist_heatmap.write_image(f"./results_new/fig{sample_size}_sup_dist_heatmap.png")
fig_sup_dist_heatmap.write_html(f"./results_new/fig{sample_size}_sup_dist_heatmap.html")
fig_sup_dist_heatmap.show()

In [None]:
####################################################################################################################
# Supervisor Distance scatter plot
####################################################################################################################
# Marker colours: supervisor distance with missing values replaced by 0. They are
# taken from df_to_plot itself so each colour belongs to the point it is drawn on;
# the earlier test_df copy predates the filtering and sorting of df_to_plot and is
# therefore not row-aligned with the x/y lists below.
sup_distance_colors = df_to_plot["distance_areas_supervisors"].fillna(0).tolist()
sup_dist_scatter = go.Scattergl(
    name="Supervisor Distance",
    x=df_to_plot["x"].tolist(),
    y=df_to_plot["y"].tolist(),
    mode='markers',
    marker=dict(
        color=sup_distance_colors,
        colorscale='Plasma',
        showscale=True,
        size=7,
        line=dict(width=0)
    ),
    opacity=1,
    hoverinfo='text',
    # Hover shows the raw (unfilled) supervisor distance of the student.
    hovertext=df_to_plot["distance_areas_supervisors"]
    )
# Students first, disciplines drawn on top.
fig_sup_disc_scatter = go.Figure(sup_dist_scatter)
fig_sup_disc_scatter.add_trace(disc_trace)
fig_sup_disc_scatter.update_layout(
    showlegend=True,
    xaxis=dict(showticklabels=False),
    yaxis=dict(showticklabels=False),
    # Horizontal legend anchored above the top-right corner of the plot.
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template="plotly"
)

# Export as a static image and as interactive HTML, then display inline.
fig_sup_disc_scatter.write_image(f"./results_new/fig{sample_size}_sup_disc_scatter.png")
fig_sup_disc_scatter.write_html(f"./results_new/fig{sample_size}_sup_disc_scatter.html")
fig_sup_disc_scatter.show()

In [None]:
####################################################################################################################
# Full figure
####################################################################################################################
# Stack every layer in one figure: students, publications heatmap, density heatmap
# and finally the disciplines.
figure = go.Figure()
for layer in (phdStudents_go, pubs_heatmap, density_heatmap, disc_trace):
    figure.add_trace(layer)

# Horizontal legend anchored above the top-right corner of the plot.
top_right_legend = {
    "orientation": "h",
    "yanchor": "bottom",
    "y": 1.02,
    "xanchor": "right",
    "x": 1,
}
figure.update_layout(
    title="Overlapping Figures",
    showlegend=True,
    xaxis={"showticklabels": False},
    yaxis={"showticklabels": False},
    legend=top_right_legend,
)

# Export as a static image and as interactive HTML, then display inline.
figure.write_image(f"./results_new/fig{sample_size}_all.png")
figure.write_html(f"./results_new/fig{sample_size}_all.html")
figure.show()