In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px
import time as time
import plotly.graph_objects as go
from scipy import stats

In [None]:
# Input files, relative to the notebook's working directory.
path_students = r"../data/phd_students.h5"
path_coordinates = r"../data/coordinates.csv"

# Wall-clock reference for the timing printout at the end of this cell.
start = time.time()

# Student records; the DataFrame() wrap guarantees a DataFrame whatever read_hdf returned.
phdStudents = pd.DataFrame(pd.read_hdf(path_students))

# The first column of the coordinates file holds the discipline names,
# every remaining column is part of the matrix that gets projected.
coordinates_df = pd.read_csv(path_coordinates)
disciplines = coordinates_df.iloc[:, 0].tolist()
coordinates_df = coordinates_df.iloc[:, 1:]
matrix_coord = coordinates_df.to_numpy()

# Project the discipline matrix to 2-D; random_state is fixed for repeatable runs.
embedded = TSNE(n_components=2, learning_rate='auto', random_state=42, perplexity=5).fit_transform(matrix_coord)

nb_sups = 2
n = len(disciplines)

# One colour per discipline. The concatenated palette is finite, so it is repeated
# as often as needed: a plain slice would come up short when there are more
# disciplines than palette entries and break every later disc_colors[i] lookup.
_base_palette = px.colors.qualitative.Set2 + px.colors.qualitative.Set1 + px.colors.qualitative.Set3
disc_colors = (_base_palette * (n // len(_base_palette) + 1))[:n]

# Empty frame describing the per-student plotting table.
df_to_plot = pd.DataFrame(
    columns=["id_student", "x", "y", "name", "color", "distance_areas_supervisors", "num_pubs_student", "discipline_student_scopus"])

# Seconds spent loading the inputs and fitting the projection.
initializing = time.time() - start
print(initializing)

In [None]:
# loop_index = 1
# for i, student in phdStudents.iterrows():
#     if loop_index % 1000 == 0:
#         print("Processing student : ", loop_index,"/",len(phdStudents))
#         # break
#     loop_index = loop_index + 1
#     main_disc = student["discipline_student_scopus"]
#     student_name = student["name_student"].title()
#     areas = np.array([float(x) for x in student["areas_student"][2:-2].split(", ")])
#     nb_pub_student = int(student["num_pubs_student"])
#
#     if nb_pub_student != 0:
#         pubs = areas * nb_pub_student
#         # to int values
#         pubs = [int(x) for x in pubs]
#         # compute coordinates
#         coordinates = areas.dot(embedded)
#         color = disc_colors[np.argmax(areas)] if areas.sum() > 0 else disc_colors[
#             list.index(disciplines, main_disc)]
#     else:
#         # special label
#         label = f"{student_name} ({main_disc}) n'a pas de publications"
#         # give barycenter of supervisors for coordinates
#         supervisors = [student[f"name_supervisor{i}"] for i in range(1, nb_sups + 1)]
#         supervisors = [sup for sup in supervisors if type(sup) == str and sup != "nan" and sup != ""]
#         supervisors_coords = [
#             np.array([float(x) for x in student[f"areas_supervisor{i}"][2:-2].split(", ")]).dot(embedded)
#             for i in range(1, len(supervisors) + 1)
#         ]
#         coordinates = np.mean(supervisors_coords, axis=0)
#         color = "black"
#     df_to_plot.loc[len(df_to_plot)] = {
#         "id_student": student["id_scopus_student"],
#         "x": coordinates[0],
#         "y": coordinates[1],
#         "name": student_name,
#         "color": color,
#         "distance_areas_supervisors": student["distance_areas_supervisors"],
#         "num_pubs_student": nb_pub_student,
#         "discipline_student_scopus": main_disc
#     }
# creating_df = time.time() - start
# print(creating_df)

In [None]:
# output_path = r"../data"
#
# df_to_plot.to_parquet(output_path + r"\df_to_plot_withDist.parquet", engine="pyarrow")
# end = time.time() - start

In [None]:
# print("initializing : ", initializing)
# print("creating df : ", creating_df)
# print("end : ", end)

In [None]:
####################################################################################################################
# Disciplines
####################################################################################################################

# Column order of the discipline marker table.
_disc_columns = ["x", "y", "type", "name", "color", "size", "text", "label", "marker_symbol", "text_position", "nb_pubs"]

# One record per discipline, positioned at its projected coordinates.
# The frame is built in a single call instead of being grown row by row with
# `.loc[len(df)] = ...`, which re-allocates the frame on every insertion.
disc_to_plot = pd.DataFrame(
    [
        {
            "x": embedded[i, 0],
            "y": embedded[i, 1],
            "type": "discipline",
            "name": disc,
            "color": disc_colors[i],
            "size": 30,
            "text": disc,
            "label": disc,
            "marker_symbol": "circle",
            "text_position": "middle center",
            "nb_pubs": 0,
        }
        for i, disc in enumerate(disciplines)
    ],
    columns=_disc_columns,
)

# Pull every column the trace needs out of the discipline table as plain lists.
_disc_lists = {
    col: disc_to_plot[col].tolist()
    for col in ("x", "y", "color", "size", "marker_symbol", "text", "text_position", "label")
}

# Marker appearance: per-discipline colour, size and symbol, no outline.
_disc_marker = dict(
    color=_disc_lists["color"],
    size=_disc_lists["size"],
    symbol=_disc_lists["marker_symbol"],
    line=dict(width=0),
)

# Labelled discipline markers; this trace is added to every figure below.
disc_trace = go.Scattergl(
    name="Disciplines",
    mode='markers+text',
    x=_disc_lists["x"],
    y=_disc_lists["y"],
    marker=_disc_marker,
    text=_disc_lists["text"],
    textposition=_disc_lists["text_position"],
    hovertext=_disc_lists["label"],
    hoverinfo='text',
    opacity=1,
)
print("disc_trace done")

In [None]:
# Load the precomputed per-student plotting table. The cells below read its
# x, y, num_pubs_student and distance_areas_supervisors columns.
df_to_plot = pd.read_parquet(r"../data/df_to_plot_withDist.parquet")

In [None]:
# Preview the first 100 rows of the plotting table.
df_to_plot.head(100)

In [None]:
# # Ignore the phd students with coordinates 0,0
# disc_xs = list(disc_to_plot["x"])
# disc_ys = list(disc_to_plot["y"])
# df_to_plot = df_to_plot[(
#         (df_to_plot["x"]!=0) | (df_to_plot["y"]!=0)
# )]
# # # Ignore the phd students with coordinates on discipline coordinates
# df_to_plot = df_to_plot[(
#         (~df_to_plot["x"].isin(disc_xs)) | (~df_to_plot["y"].isin(disc_ys))
# )]

In [None]:
# Remove students with no publications
# df_to_plot = df_to_plot[df_to_plot["num_pubs_student"] > 0]

In [None]:
# Sort the students by number of publications
# df_to_plot = df_to_plot.sort_values(by=["num_pubs_student"], ascending=True)

In [None]:
# Number of rows in the plotting table; it is embedded in the output file names below.
sample_size = df_to_plot.shape[0]
print("sample_size", sample_size)

In [None]:
# Inspect how many colours the sequential Blues palette offers (the value is only displayed, not stored).
len(px.colors.sequential.Blues)

### Scatter plot of PhD students

In [None]:
def add_scatter_trace(fig, df, name, color):
    """Append a ``Scattergl`` marker trace built from ``df`` to ``fig``.

    Args:
        fig: Plotly figure that receives the trace (modified in place).
        df: Frame providing the ``x`` and ``y`` columns to plot.
        name: Legend entry of the trace; also used as hover text for its points.
        color: Marker colour shared by all points of the trace.
    """
    # Small, outline-free markers in a single colour.
    marker_style = dict(color=color, size=3, line=dict(width=0))
    trace = go.Scattergl(
        x=df["x"],
        y=df["y"],
        name=name,
        mode='markers',
        marker=marker_style,
        hovertext=name,
        hoverinfo='text',
        opacity=1,
    )
    fig.add_trace(trace)

# Discrete colours for the publication-count buckets. The list is copied because
# pop() below consumes it; 15 colours are taken in total (10 + 4 + 1 buckets).
colorscale = px.colors.sequential.Turbo.copy()
custom_scatter = go.Figure()
pubs_count = df_to_plot["num_pubs_student"]

# One trace per exact publication count from 0 to 9.
for i in range(0, 10):
    add_scatter_trace(custom_scatter, df_to_plot[pubs_count == i], f"{i} Pubs", colorscale.pop())
# Five-wide buckets [10, 15), [15, 20), [20, 25) and [25, 30).
for i in range(10, 30, 5):
    add_scatter_trace(custom_scatter, df_to_plot[(i <= pubs_count) & (pubs_count < i + 5)], f"{i} to {i+5} Pubs", colorscale.pop())
# Open-ended last bucket. It starts at 30 so that it matches its "30+" label and
# does not draw the 26-29 students a second time on top of the [25, 30) bucket.
add_scatter_trace(custom_scatter, df_to_plot[pubs_count >= 30], "30+ Pubs", colorscale.pop())

# Discipline markers go on last so they are drawn above the student points.
custom_scatter.add_trace(disc_trace)
custom_scatter.update_layout(
    title="PhD students productivity",
    showlegend=True,
    xaxis=dict(showticklabels=False),
    yaxis=dict(showticklabels=False),
    # Horizontal legend anchored above the top-right corner of the plot area.
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1,
        itemsizing= "constant",
    ),
    template="plotly"
)

# Export a static image and an interactive page, then display inline.
save_path_scatter = f"./results_new/fig{sample_size}_pub_scatter"
custom_scatter.write_image(f"{save_path_scatter}.png")
custom_scatter.write_html(f"{save_path_scatter}.html")
custom_scatter.show()

In [None]:
# Single trace holding every student, coloured continuously by publication count.
# (Earlier variants coloured by discipline or by log1p of the count; both are disabled.)
pubs_per_student = df_to_plot["num_pubs_student"]
student_marker = dict(
    color=pubs_per_student,
    colorscale=px.colors.sequential.Plasma,
    showscale=True,
    size=3,
    line=dict(width=0),
)
phdStudents_go = go.Scattergl(
    name="PhD students",
    x=df_to_plot["x"],
    y=df_to_plot["y"],
    mode='markers',
    marker=student_marker,
    hovertext=pubs_per_student,
    hoverinfo='text',
    opacity=1,
)

# Students first, discipline markers on top.
fig_student = go.Figure(data=[phdStudents_go, disc_trace])

# Horizontal legend anchored above the top-right corner of the plot area.
student_legend = {"orientation": "h", "yanchor": "bottom", "y": 1.02, "xanchor": "right", "x": 1}
fig_student.update_layout(
    title="PhD students productivity",
    showlegend=True,
    xaxis={"showticklabels": False},
    yaxis={"showticklabels": False},
    legend=student_legend,
    template="plotly",
)

# Export is intentionally disabled for this figure: the file stem below is the
# one already used by the bucketed scatter plot.
# save_path_scatter = f"./results_new/fig{sample_size}_pub_scatter"
# fig_student.write_image(f"{save_path_scatter}.png")
# fig_student.write_html(f"{save_path_scatter}.html")
fig_student.show()

### Publications heatmap

In [None]:
# Mean number of publications of the students falling in each cell of a 150 x 100 grid.
# (A log1p-transformed variant of the values was tried earlier and is disabled.)
hist, x_edges, y_edges, binnumber = stats.binned_statistic_2d(
    df_to_plot["x"].tolist(), df_to_plot["y"].tolist(), df_to_plot["num_pubs_student"].tolist(), statistic='mean', bins=[150,100]
)
# Blank out cells whose mean is exactly 0 so they render as transparent
# (cells without any student are already NaN for the 'mean' statistic).
hist = np.where(hist == 0, np.nan, hist)  # Set 0s to NaN

# The heatmap is given one coordinate per cell, which Plotly takes as the cell
# centre. Passing the left bin edges shifted the whole map by half a bin relative
# to the scatter traces, so the bin centres are used instead.
pubs_x_centers = (x_edges[:-1] + x_edges[1:]) / 2
pubs_y_centers = (y_edges[:-1] + y_edges[1:]) / 2

# No hovertext is set: the former per-student list did not match the grid shape
# of z, so the default hover (x, y and the cell mean) is shown instead.
pubs_heatmap = go.Heatmap(
    name="Publications heatmap",
    x = pubs_x_centers,
    y = pubs_y_centers,
    z = hist.T,  # binned_statistic_2d indexes [x_bin, y_bin]; the heatmap wants rows = y
    # zsmooth= 'best',
    # zauto=False,
    # zmax=10,
    colorscale="Plasma",
    colorbar=dict(title='Number of publications'),
    showscale=True,
    showlegend=True
)
fig_pubs_heatmap = go.Figure(pubs_heatmap)
fig_pubs_heatmap.add_trace(disc_trace)
fig_pubs_heatmap.update_layout(
    title="Publications heatmap",
    showlegend=True,
    xaxis=dict(showticklabels=False),
    yaxis=dict(showticklabels=False),
    # Horizontal legend anchored above the top-right corner of the plot area.
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template="plotly",
)

# Export a static image and an interactive page, then display inline.
save_path_pubs = f"./results_new/fig{sample_size}_productivity_heatmap"
fig_pubs_heatmap.write_image(f"{save_path_pubs}.png")
fig_pubs_heatmap.write_html(f"{save_path_pubs}.html")
fig_pubs_heatmap.show()

### Density heatmap

In [None]:
# Viridis with a fully transparent colour prepended as the lowest stop
# (presumably so that empty bins stay invisible - confirm on the rendered figure).
density_colorscale = ["rgba(68, 1, 84,0)"] + px.colors.sequential.Viridis

# 2-D histogram of student positions, normalised to a density.
density_heatmap = go.Histogram2d(
    name="Density heatmap",
    x=df_to_plot["x"],
    y=df_to_plot["y"],
    nbinsx=150,
    nbinsy=100,
    histnorm="density",
    colorscale=density_colorscale,
    colorbar={"title": "Density"},
    showscale=True,
    showlegend=True,
)

# Density first, discipline markers on top.
fig_density_heatmap = go.Figure(data=[density_heatmap, disc_trace])

# NOTE(review): title and file names say "no zero / no uni-disciplinary", but the
# filtering cells earlier in the notebook are commented out - confirm they still apply.
density_legend = {"orientation": "h", "yanchor": "bottom", "y": 1.02, "xanchor": "right", "x": 1}
fig_density_heatmap.update_layout(
    title="Density heatmap no zero no uni-disciplinary",
    showlegend=True,
    xaxis={"showticklabels": False},
    yaxis={"showticklabels": False},
    legend=density_legend,
    template="plotly",
)

# Export a static image and an interactive page, then display inline.
density_stem = f"./results_new/fig{sample_size}_density_heatmap_noZero_noDisc"
fig_density_heatmap.write_image(f"{density_stem}.png")
fig_density_heatmap.write_html(f"{density_stem}.html")
fig_density_heatmap.show()

### Supervisor Distance

In [None]:
####################################################################################################################
# Supervisor Distance heatmap
####################################################################################################################
# Mean supervisor distance of the students falling in each cell of a 150 x 100 grid.
hist, x_edges, y_edges, binnumber = stats.binned_statistic_2d(
    df_to_plot["x"].tolist(), df_to_plot["y"].tolist(), df_to_plot["distance_areas_supervisors"].tolist(), statistic='mean', bins=[150,100]
)
# Blank out cells whose mean is exactly 0 so they render as transparent
# (cells without any student are already NaN for the 'mean' statistic).
hist = np.where(hist == 0, np.nan, hist)  # Set 0s to NaN

# The heatmap is given one coordinate per cell, which Plotly takes as the cell
# centre. Passing the left bin edges shifted the whole map by half a bin relative
# to the scatter traces, so the bin centres are used instead.
sup_x_centers = (x_edges[:-1] + x_edges[1:]) / 2
sup_y_centers = (y_edges[:-1] + y_edges[1:]) / 2

# No hovertext is set: the former per-student list did not match the grid shape
# of z, so the default hover (x, y and the cell mean) is shown instead.
sup_dist_heatmap = go.Heatmap(
    name="Supervisor Distance heatmap",
    x = sup_x_centers,
    y = sup_y_centers,
    z = hist.T,  # binned_statistic_2d indexes [x_bin, y_bin]; the heatmap wants rows = y
    colorscale="Plasma",
    colorbar=dict(title='Supervisor Distance'),
    showscale=True,
    showlegend=True
)
fig_sup_dist_heatmap = go.Figure(sup_dist_heatmap)
fig_sup_dist_heatmap.add_trace(disc_trace)
fig_sup_dist_heatmap.update_layout(
    title="Supervisor Distance",
    showlegend=True,
    xaxis=dict(showticklabels=False),
    yaxis=dict(showticklabels=False),
    # Horizontal legend anchored above the top-right corner of the plot area.
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template="plotly",
)

# Export a static image and an interactive page, then display inline.
fig_sup_dist_heatmap.write_image(f"./results_new/fig{sample_size}_sup_dist_heatmap.png")
fig_sup_dist_heatmap.write_html(f"./results_new/fig{sample_size}_sup_dist_heatmap.html")
fig_sup_dist_heatmap.show()

In [None]:
####################################################################################################################
# Supervisor Distance scatter plot
####################################################################################################################
# Per-student supervisor distance, used for both the marker colour and the hover text.
supervisor_distance = df_to_plot["distance_areas_supervisors"]
sup_dist_marker = {
    "color": supervisor_distance.tolist(),
    "colorscale": 'Plasma',
    "showscale": True,
    "size": 7,
    "line": {"width": 0},
}
sup_dist_scatter = go.Scattergl(
    name="Supervisor Distance",
    x=df_to_plot["x"].tolist(),
    y=df_to_plot["y"].tolist(),
    mode='markers',
    marker=sup_dist_marker,
    hovertext=supervisor_distance,
    hoverinfo='text',
    opacity=1,
)

# Students first, discipline markers on top. This figure is given no title.
fig_sup_disc_scatter = go.Figure(data=[sup_dist_scatter, disc_trace])
sup_disc_legend = {"orientation": "h", "yanchor": "bottom", "y": 1.02, "xanchor": "right", "x": 1}
fig_sup_disc_scatter.update_layout(
    showlegend=True,
    xaxis={"showticklabels": False},
    yaxis={"showticklabels": False},
    legend=sup_disc_legend,
    template="plotly",
)

# Export a static image and an interactive page, then display inline.
sup_disc_stem = f"./results_new/fig{sample_size}_sup_disc_scatter"
fig_sup_disc_scatter.write_image(f"{sup_disc_stem}.png")
fig_sup_disc_scatter.write_html(f"{sup_disc_stem}.html")
fig_sup_disc_scatter.show()

### Full figure

In [None]:
# Stack the student scatter, both heatmaps and the discipline markers in one
# figure; the traces are drawn in the order listed here.
figure = go.Figure(data=[phdStudents_go, pubs_heatmap, density_heatmap, disc_trace])

# Same legend placement as the individual figures; no template is set here.
overlay_legend = {"orientation": "h", "yanchor": "bottom", "y": 1.02, "xanchor": "right", "x": 1}
figure.update_layout(
    title="Overlapping Figures",
    showlegend=True,
    xaxis={"showticklabels": False},
    yaxis={"showticklabels": False},
    legend=overlay_legend,
)

# Export a static image and an interactive page, then display inline.
overlay_stem = f"./results_new/fig{sample_size}_all"
figure.write_image(f"{overlay_stem}.png")
figure.write_html(f"{overlay_stem}.html")
figure.show()

## Temporal evolution

In [None]:
# Load the theses table; the cells below read its author, supervisor
# ("directeurs_these.<i>.*"), defence-date and oai_set_specs columns.
theses = pd.read_csv(r"../data/theses.csv")

In [None]:
# Count, for every thesis, how many of the seven "directeurs_these.<i>.nom"
# columns (i = 0..6) are filled in, and store the result as "nb_sup".
supervisor_name_columns = [f"directeurs_these.{idx}.nom" for idx in range(7)]
theses["nb_sup"] = theses[supervisor_name_columns].notna().sum(axis=1)

### Make dataframe

In [None]:
# Discipline code -> list of DDC class numbers that belong to it.
# The chemical-engineering key is "CENG" (it used to read "CENGI"), which is the
# code the discipline list of the SCOPUS temporal-evolution figure looks up;
# with the misspelt key that series could never receive any thesis.
og_dict = {"COMP": ["000", "004"],
           "PSYC": ["020", "060", "070", "090", "100", "110", "120", "130", "140", "150", "160", "170", "180", "190",
                    "200",
                    "210", "220", "230", "240", "250", "260", "270", "280", "290"],
           "SOCI": ["300", "350", "360", "370", "380", "390"],
           "ECON": ["310", "320", "330", "340"],
           "ARTS": ["400", "410", "420", "430", "440", "450", "460", "470", "480", "490", "700", "710", "720", "730",
                    "740",
                    "750", "760", "770", "780", "790", "800", "810", "820", "830", "840", "850", "860", "870", "880",
                    "890",
                    "900", "910", "920", "930", "940", "944", "950", "960", "970", "980", "990"],
           "MATH": ["500", "510"],
           "PHYS": ["520", "530"],
           "CHEM": ["540"],
           "EART": ["550", "560"],
           "BIOC": ["570", "580", "590"],
           "ENGI": ["600", "620", "670", "680", "690"],
           "MEDI": ["610", "796"],
           "VETE": ["630"],
           "DECI": ["640"],
           "BUSI": ["650"],
           "CENG": ["660"]
           }

# Inverted lookup: DDC class number -> discipline code.
d = {ddc_code: discipline_code for discipline_code, ddc_codes in og_dict.items() for ddc_code in ddc_codes}

# Highest supervisor slot index; slots 0..nb_directeurs exist as "directeurs_these.<i>.*" columns.
nb_directeurs = 6
director_ids = [f"directeurs_these.{i}.idref" for i in range(0, nb_directeurs + 1)]

# Authors without an idref get the placeholder "P<row index>".
missing_author = theses["auteur.idref"].isna()
theses.loc[missing_author, "auteur.idref"] = (
    "P" + pd.Series(theses.index[missing_author]).astype(str)).to_numpy()

# Supervisors that have a name but no idref get the placeholder "D<slot + 1 + row index>".
# The values are handed over as a plain array: assigning the Series itself makes
# pandas align on its fresh 0..k-1 index, which leaves most selected rows as NaN.
# NOTE(review): slot i at row r and slot i+1 at row r-1 produce the same
# placeholder - confirm whether these ids need to be unique.
for i, idref_col in enumerate(director_ids):
    mask = theses[idref_col].isna() & (
            theses[f"directeurs_these.{i}.nom"].notna() | theses[f"directeurs_these.{i}.prenom"].notna()
    )
    theses.loc[mask, idref_col] = ("D" + pd.Series(i + 1 + theses.index[mask]).astype(str)).to_numpy()

# Independent student table with short column names; rename() returns a new
# frame, so the edits below never write into a slice of `theses`.
students = theses[["auteur.idref", "auteur.nom", "auteur.prenom", "date_soutenance", "oai_set_specs", "nb_sup"]].rename(
    columns={
        "auteur.idref": "idref",
        "auteur.nom": "nom",
        "auteur.prenom": "prenom",
        "oai_set_specs": "discipline",
    }
)
# Strip the "ddc:" prefix so that only the class numbers remain.
students["discipline"] = students["discipline"].str.replace("ddc:", "", regex=False)
# Several classes joined by "||" -> "MULT"; otherwise look the class up in `d`
# (None when the value is missing or unknown).
students["discipline"] = students["discipline"].apply(
    lambda x: "MULT" if isinstance(x, str) and "||" in x else d.get(x, None))
# Keep only the text before the first '-' of the defence date (the year).
students["date_soutenance"] = students["date_soutenance"].astype(str).str.split('-').str[0]
# Missing dates became the string "nan" through astype(str); drop those rows.
students = students[students["date_soutenance"] != "nan"]
students.to_parquet(r"../data/students.parquet", engine="pyarrow")

### Make figures

In [None]:
# Drop the rows (not columns) whose defence date is missing.
# NOTE(review): when `students` comes straight from the cell above, the date is
# already a string and "nan" rows were filtered there - this may be a no-op.
time_evolution = students.dropna(subset=["date_soutenance"])

In [None]:
def _count_theses_by(keys):
    """Return the number of rows of ``time_evolution`` per group of ``keys`` as a frame with a ``count`` column."""
    return time_evolution.groupby(keys).size().reset_index(name="count")


# Keep only the text before the first '-' of the defence date (the year).
time_evolution = time_evolution.assign(
    date_soutenance=time_evolution["date_soutenance"].astype(str).str.split('-').str[0]
)

# Theses per year.
overall_time_evolution = _count_theses_by("date_soutenance")
# Theses per year and discipline.
per_disc_time_evolution = _count_theses_by(["date_soutenance", "discipline"])
# Theses per year and number of supervisors.
per_sup_time_evolution = _count_theses_by(["date_soutenance", "nb_sup"])

In [None]:
# Persist the three yearly count tables next to the other data files.
overall_time_evolution.to_parquet(r"../data/overall_time_evolution.parquet", engine="pyarrow")
per_disc_time_evolution.to_parquet(r"../data/per_disc_time_evolution.parquet", engine="pyarrow")
per_sup_time_evolution.to_parquet(r"../data/per_sup_time_evolution.parquet", engine="pyarrow")

In [None]:
# (count table, legend label, line colour) for each series of the figure.
# NOTE(review): the per-discipline and per-supervisor tables hold several rows
# per year, yet each is drawn as one connected line - confirm this is intended.
temporal_series = (
    (overall_time_evolution, 'Overall', 'blue'),
    (per_disc_time_evolution, 'Per Discipline', 'red'),
    (per_sup_time_evolution, 'Per Supervisor', 'green'),
)

figure = go.Figure()
for counts, label, line_colour in temporal_series:
    figure.add_trace(go.Scatter(
        name=label,
        x=counts["date_soutenance"],
        y=counts["count"],
        mode='lines+markers',
        connectgaps=True,
        line={"color": line_colour},
    ))

# Keep the figure as the cell's displayed value.
figure

In [None]:
# Titles, legend and theme of the temporal-evolution figure.
temporal_layout = {
    "title": "Temporal Evolution of Theses",
    "xaxis_title": "Year",
    "yaxis_title": "Number of Theses",
    "showlegend": True,
    "template": "plotly",
}
figure.update_layout(**temporal_layout)

# Export a static image and an interactive page, then display inline.
figure.write_image("./results_new/temporal_evolution.png")
figure.write_html("./results_new/temporal_evolution.html")
figure.show()

In [None]:
import pandas as pd

# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
theses = pd.read_csv(r"C:\Users\sayfe\Desktop\PER\MultidisciplinaryPhD_Aurora\data\raw\theses\theses-soutenues.csv")

# Every column whose name contains "idref".
columns = theses.columns.tolist().copy()
filtered_columns = [name for name in columns if "idref" in name]

# Cast those columns to text in one go (missing values become the string "nan").
theses[filtered_columns] = theses[filtered_columns].astype(str)

# Keep only the text before the first '-' of the defence date (the year).
theses["date_soutenance"] = theses["date_soutenance"].astype(str).str.split('-').str[0]

# Independent snapshot of the table at this stage.
theses_copy = theses.copy(deep=True)

In [None]:
# Drop the rows whose defence date is the string "nan". The explicit copy makes
# `theses` an independent frame, so the column assignment below does not write
# into a slice of the previous frame (SettingWithCopyWarning).
theses = theses[theses["date_soutenance"] != "nan"].copy()
# Turn the "||"-separated oai_set_specs value into a list of its parts.
theses["oai_set_specs"] = theses["oai_set_specs"].apply(lambda oai : str(oai).split("||"))

In [None]:
# One row per (thesis, oai_set_specs entry): unpack the lists built in the previous cell.
theses = theses.explode("oai_set_specs")
# theses.head(100)

In [None]:
# Remove the special characters : [ ] ' ? . " from theses["discipline"].
# A single regex character class strips them anywhere in the text. The earlier
# chain only used `.str.replace` for ":"; the following `.replace(...)` calls
# were Series.replace, which matches whole cell values and so left the other
# characters in place.
theses["discipline"] = theses["discipline"].str.replace(r"[:\[\]'?.\"]", "", regex=True)

In [None]:
# Persist the exploded table.
# NOTE(review): absolute, machine-specific output path.
theses.to_parquet(r"C:\Users\sayfe\Desktop\PER\multdisciplinaryOnlineTool\data\theses-explode.parquet", engine="pyarrow")

In [None]:
import pandas as pd
# Reload the exploded theses table written by the previous section.
# NOTE(review): absolute, machine-specific path.
theses = pd.read_parquet(r"C:\Users\sayfe\Desktop\PER\multdisciplinaryOnlineTool\data\theses-explode.parquet", engine="pyarrow")
# "ddc:<class number>" -> French label of the class. The labels are used as
# data values and as legend names further down.
ddc_dict = {
    "ddc:000": "Informatique, information, généralités",
    "ddc:004": "Informatique",
    "ddc:020": "Bibliothéconomie et sciences de l'information",
    "ddc:060": "Organisations générales et muséologie",
    "ddc:070": "Médias d'information, journalisme, édition",
    "ddc:090": "Manuscrits et livres rares",
    "ddc:100": "Philosophie, psychologie",
    "ddc:110": "Métaphysique",
    "ddc:120": "Epistémologie, causalité, genre humain",
    # NOTE(review): "seudosciences" looks like a typo for "pseudosciences" - confirm before changing the label.
    "ddc:130": "Phénomènes paranormaux, seudosciences",
    "ddc:140": "Les divers systèmes et écoles philosophiques",
    "ddc:150": "Psychologie",
    "ddc:160": "Logique",
    "ddc:170": "Morale (éthique)",
    "ddc:180": "Philosophie de l'Antiquité, du Moyen Âge, de l'Orient",
    "ddc:190": "Philosophie occidentale moderne et philosophies non orientales",
    "ddc:200": "Religion",
    "ddc:210": "Philosophie et théorie de la religion",
    "ddc:220": "Bible",
    "ddc:230": "Théologie chrétienne",
    "ddc:240": "Théologie morale et pratiques chrétiennes",
    "ddc:250": "Eglises locales, ordres religieux chrétiens",
    "ddc:260": "Théologie chrétienne et société, ecclésiologie",
    "ddc:270": "Histoire et géographie du christianisme et de l'Eglise chrétienne",
    "ddc:280": "Confessions et sectes de l'Eglise chrétienne",
    "ddc:290": "Autres religions",
    "ddc:300": "Sciences sociales, sociologie, anthropologie",
    "ddc:310": "Statistiques générales",
    "ddc:320": "Science politique",
    "ddc:330": "Economie",
    "ddc:340": "Droit",
    "ddc:350": "Administration publique. Arts et science militaires",
    "ddc:360": "Problèmes et services sociaux",
    "ddc:370": "Education et enseignement",
    "ddc:380": "Commerce, communications, transports",
    "ddc:390": "Ethnologie",
    "ddc:400": "Langues et linguistique",
    "ddc:410": "Linguistique générale",
    "ddc:420": "Langue anglaise. Anglo-saxon",
    "ddc:430": "Langues germaniques. Allemand",
    "ddc:440": "Langues romanes. Français",
    "ddc:450": "Langues italienne, roumaine, rhéto-romane",
    "ddc:460": "Langues espagnole et portugaise",
    "ddc:470": "Langues italiques. Latin",
    "ddc:480": "Langues helléniques. Grec classique",
    "ddc:490": "Autres langues",
    "ddc:500": "Sciences de la nature et mathématiques",
    "ddc:510": "Mathématiques",
    "ddc:520": "Astronomie, cartographie, géodésie",
    "ddc:530": "Physique",
    "ddc:540": "Chimie, minéralogie, cristallographie",
    "ddc:550": "Sciences de la terre",
    "ddc:560": "Paléontologie. Paléozoologie",
    "ddc:570": "Sciences de la vie, biologie, biochimie",
    "ddc:580": "Plantes. Botanique",
    "ddc:590": "Animaux. Zoologie",
    "ddc:600": "Technologie (Sciences appliquées)",
    "ddc:610": "Médecine et santé",
    "ddc:620": "Sciences de l'ingénieur",
    "ddc:630": "Agronomie, agriculture et médecine vétérinaire",
    "ddc:640": "Economie domestique. Vie familiale",
    "ddc:650": "Gestion et organisation de l'entreprise",
    "ddc:660": "Génie chimique, technologies alimentaires",
    "ddc:670": "Fabrication industrielle",
    "ddc:680": "Fabrication de produits à usages spécifiques",
    "ddc:690": "Bâtiments",
    "ddc:700": "Arts. Beaux-arts et arts décoratifs",
    "ddc:710": "Urbanisme",
    "ddc:720": "Architecture",
    "ddc:730": "Arts plastiques. Sculpture",
    "ddc:740": "Dessin. Arts décoratifs",
    "ddc:750": "Peinture",
    "ddc:760": "Arts graphiques",
    "ddc:770": "Photographie et les photographies, art numérique",
    "ddc:780": "Musique",
    "ddc:790": "Arts du spectacle, loisirs",
    "ddc:796": "Sport",
    "ddc:800": "Histoire et critique littéraires, rhétorique",
    "ddc:810": "Littérature américaine en anglais",
    "ddc:820": "Littératures anglaise et anglo-saxonne",
    "ddc:830": "Littérature allemande",
    "ddc:840": "Littérature de langues romanes. Littérature française",
    "ddc:850": "Littérature italienne",
    "ddc:860": "Littératures espagnole et portugaise",
    "ddc:870": "Littérature latine",
    "ddc:880": "Littérature grecque",
    "ddc:890": "Littératures des autres langues",
    "ddc:900": "Géographie et histoire",
    "ddc:910": "Géographie et voyages",
    "ddc:920": "Biographies générales, généalogie, emblèmes",
    "ddc:930": "Histoire ancienne et préhistoire",
    "ddc:940": "Histoire moderne et contemporaine de l'Europe",
    "ddc:944": "Histoire générale de la France",
    "ddc:950": "Histoire générale de l'Asie, Orient, Extrême-Orient",
    "ddc:960": "Histoire générale de l'Afrique",
    "ddc:970": "Histoire générale de l'Amérique du Nord",
    "ddc:980": "Histoire générale de l'Amérique du Sud",
    "ddc:990": "Histoire générale des autres parties du monde, des mondes extraterrestres. Iles du Pacifique"
}
# revert dict
# disc_to_ddc_dict = {v: k for k, v in ddc_dict.items()}

In [None]:
# map ddc to discipline
def map_ddc_to_discipline(ddc):
    """Return the label stored in ``ddc_dict`` for ``ddc``, or ``None`` when the code is not a key."""
    if ddc in ddc_dict:
        return ddc_dict[ddc]
    return None

# map discipline to ddc
# def map_discipline_to_ddc(discipline):
#     return disc_to_ddc_dict.get(discipline, None)

# Translate every oai_set_specs code into its label (None when the code is unknown).
theses["oai_set_specs"] = theses["oai_set_specs"].map(map_ddc_to_discipline)
# Rows without a label are kept on purpose; the filter below is disabled.
# theses = theses[theses["discipline"].notna()]

# Keep only the author id and name, the defence year and the mapped label.
kept_columns = ["auteur.idref", "auteur.nom", "auteur.prenom", "date_soutenance", "oai_set_specs"]
theses = theses[kept_columns]

In [None]:
# Save the reduced table (author, defence year, mapped discipline label).
# NOTE(review): absolute, machine-specific output path.
theses.to_parquet(r"C:\Users\sayfe\Desktop\PER\multdisciplinaryOnlineTool\data\theses-explode-disc.parquet", engine="pyarrow")

In [None]:
# Number of DDC classes in the mapping; the palette built below must provide at least this many colours.
len(ddc_dict)

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# Qualitative palettes, concatenated in this order and cut to one colour per DDC class.
_qualitative_palettes = (
    px.colors.qualitative.Alphabet,
    px.colors.qualitative.Light24,
    px.colors.qualitative.Set2,
    px.colors.qualitative.Set1,
    px.colors.qualitative.Set3,
    px.colors.qualitative.Plotly,
    px.colors.qualitative.Prism,
)
colors = [shade for palette in _qualitative_palettes for shade in palette][:len(ddc_dict)]

In [None]:
# Check how many colours were actually collected (should equal len(ddc_dict)).
len(colors)

In [None]:
# order dict by values
# ddc_dict = dict(sorted(ddc_dict.items(), key=lambda item: item[1]))

# Build a line graph with the number of theses per year for each DDC discipline.
import plotly.graph_objects as go

# Sort by year through reassignment: `theses` is a column subset of an earlier
# frame, and an in-place sort on such a slice triggers SettingWithCopyWarning.
theses = theses.sort_values(by=["date_soutenance"])
figure = go.Figure()
# ddc_dict values and colours are paired positionally; zip stops at the shorter one.
for discipline, color in zip(ddc_dict.values(), colors):
    # Yearly thesis count for the current discipline.
    discipline_df = theses[theses["oai_set_specs"] == discipline]
    discipline_time_evolution = discipline_df.groupby("date_soutenance").size().reset_index(name="count")
    figure.add_trace(go.Scatter(
        x=discipline_time_evolution["date_soutenance"],
        y=discipline_time_evolution["count"],
        connectgaps=True,
        mode='lines+markers',
        name=discipline,
        line=dict(color=color)
    ))

# Two buttons in the top-right corner that switch the legend on and off.
toggle_button = dict(
    type="buttons",
    direction="right",
    x=1,
    y=1,
    xanchor="right",
    yanchor="top",
    pad=dict(r=10, t=10),
    buttons=[
        dict(
            label="Show Legend",
            method="relayout",
            args=[{"showlegend": True}]
        ),
        dict(
            label="Hide Legend",
            method="relayout",
            args=[{"showlegend": False}]
        )
    ]
)

# The legend starts hidden; the buttons above reveal it.
figure.update_layout(
    height=700,
    showlegend=False,
    updatemenus=[toggle_button],
    title="Temporal Evolution of Disciplines DDC",
    xaxis_title="Year",
    yaxis_title="Number of Theses",
    template="plotly"
)
# Years are strings, so order the categorical x axis explicitly.
figure.update_xaxes(categoryorder='category ascending')

# Export a static image and an interactive page, then display inline.
figure.write_image("./results_new/temporal_evolution_disciplines.png")
figure.write_html("./results_new/temporal_evolution_disciplines.html")
figure.show()

In [None]:
# Reload the student table saved by the "Make dataframe" section.
students = pd.read_parquet(r"../data/students.parquet", engine="pyarrow")

# Discipline codes to draw, one line each.
# NOTE(review): this rebinds the global `disciplines` that the first cells derive
# from the coordinates file - re-running those cells afterwards would see this list.
disciplines = ['AGRI', 'ARTS', 'BIOC', 'BUSI', 'CENG', 'COMP', 'DECI', 'DENT', 'EART', 'ECON', 'ENER', 'ENGI', 'ENVI', 'HEAL', 'IMMU', 'MATE', 'MATH', 'MEDI', 'MULT', 'NEUR', 'NURS', 'PHAR', 'PHYS', 'PSYC', 'SOCI', 'VETE']

# Line graph with the number of theses per year for each discipline code.
figure = go.Figure()
# Codes and colours are paired positionally; `colors` comes from the DDC palette cell.
for discipline, color in zip(disciplines, colors):
    # Yearly thesis count for the current discipline.
    yearly_counts = (
        students[students["discipline"] == discipline]
        .groupby("date_soutenance")
        .size()
        .reset_index(name="count")
    )
    figure.add_trace(go.Scatter(
        name=discipline,
        x=yearly_counts["date_soutenance"],
        y=yearly_counts["count"],
        mode='lines+markers',
        connectgaps=True,
        line={"color": color},
    ))

# Two buttons in the top-right corner that switch the legend on and off.
show_legend_button = {"label": "Show Legend", "method": "relayout", "args": [{"showlegend": True}]}
hide_legend_button = {"label": "Hide Legend", "method": "relayout", "args": [{"showlegend": False}]}
toggle_button = {
    "type": "buttons",
    "direction": "right",
    "x": 1,
    "y": 1,
    "xanchor": "right",
    "yanchor": "top",
    "pad": {"r": 10, "t": 10},
    "buttons": [show_legend_button, hide_legend_button],
}

# The legend starts hidden; the buttons above reveal it.
figure.update_layout(
    title="Temporal Evolution of Disciplines SCOPUS",
    xaxis_title="Year",
    yaxis_title="Number of Theses",
    height=700,
    showlegend=False,
    updatemenus=[toggle_button],
    template="plotly",
)
# Years are strings, so order the categorical x axis explicitly.
figure.update_xaxes(categoryorder='category ascending')

# Export a static image and an interactive page, then display inline.
figure.write_image("./results_new/temporal_evolution_students_disciplines.png")
figure.write_html("./results_new/temporal_evolution_students_disciplines.html")
figure.show()