In [48]:
import pandas as pd
import plotly.express as px
import plotly
import seaborn as sns
import plotly.figure_factory as ff

### Podstawowe parametry sieci

```
V = 1211
E = 15233
Connected components = 4
Avg number of neighbors = 13.004
Diameter = 11
Radius = 6
Characteristic path length = 3.915
Clustering coefficient = 0.465
Network density = 0.011
Network centralization = 0.036
```

In [49]:
# Shared palette: every plot colours faculties with these values.
faculty_colors = [
    "#E58606", "#5D69B1", "#52BCA3", "#99C945",
    "#CC61B0", "#24796C", "#DAA51B", "#2F8AC4",
    "#764E9F", "#ED645A", "#BEAED4", "#FDE725",
]
FACULTIES = [
    "W02", "W03", "W04", "W05", "W06", "W07",
    "W08", "W09", "W10", "W11", "W12", "W13",
]

# Pair each faculty code with the colour at the same list position.
FACULTY_COLOR_MAP = dict(zip(FACULTIES, faculty_colors))

### Ranking osób (wierzchołków) względem miar centralności

In [50]:
# Per-person table: node_id, node_label, faculty, component and the
# centrality measures (degree, betweenness, closeness, eigen).
person_ranking = pd.read_csv(
    filepath_or_buffer="r-complex-networks/people_ranking.csv",
)
person_ranking

Unnamed: 0,node_id,node_label,faculty,component,degree,betweenness_centrality,closeness_centrality,eigen_centrality
0,115,Chlebus Edward,W10,1,111,127765.000000,0.033838,1.132636e-02
1,255,Grosel Jacek,W02,1,111,43289.571429,0.030957,1.569201e-03
2,665,Misiewicz Jan,W11,1,103,83419.000000,0.032079,1.000000e+00
3,1159,Wójcicki Zbigniew,W02,1,97,8782.500000,0.030521,1.497761e-03
4,78,Bożejko Wojciech,W04,1,88,17783.000000,0.032307,4.856717e-03
...,...,...,...,...,...,...,...,...
1204,946,Sokolski Piotr,W10,1,2,0.000000,0.032334,1.299244e-03
1205,1013,Szul-Pietrzak Elżbieta,W11,1,2,0.000000,0.022866,1.319219e-02
1206,1085,Walkowiak-Gall Anita,W04,1,2,0.000000,0.012411,6.472474e-04
1207,1115,Więcek Piotr,W13,4,2,0.000000,1.000000,8.606682e-17


### Rozkłady:

In [51]:
# Histogram of vertex degree, with a box plot drawn as the marginal.
px.histogram(
    data_frame=person_ranking,
    x="degree",
    marginal="box",
)

In [52]:
# Histogram of betweenness centrality; counts are shown on a log-scaled y axis.
px.histogram(
    data_frame=person_ranking,
    x="betweenness_centrality",
    marginal="box",
    log_y=True,
)

In [53]:
# Histogram of closeness centrality; counts are shown on a log-scaled y axis.
px.histogram(
    data_frame=person_ranking,
    x="closeness_centrality",
    marginal="box",
    log_y=True,
)

In [54]:
# Histogram of eigenvector centrality; counts are shown on a log-scaled y axis.
px.histogram(
    data_frame=person_ranking,
    x="eigen_centrality",
    marginal="box",
    log_y=True,
)

### Top N osób z największym gronem kolaborantów

In [55]:
# The 20 rows with the highest degree, largest first.
top_20_by_degree = (
    person_ranking
    .sort_values("degree", ascending=False)
    .iloc[:20]
)

In [56]:
# One bar per person among the 20 highest-degree rows, coloured by faculty.
fig = px.bar(
    data_frame=top_20_by_degree,
    x="node_label",
    y="degree",
    color="faculty",
    color_discrete_map=FACULTY_COLOR_MAP,
)
# Sort the x categories by bar total, tallest first.
fig.update_layout(barmode="stack", xaxis=dict(categoryorder="total descending"))
fig.show()

In [133]:
# Show the first 12 rows of the top-20 table.
top_20_by_degree.iloc[:12]

Unnamed: 0,node_id,node_label,faculty,component,degree,betweenness_centrality,closeness_centrality,eigen_centrality
0,115,Chlebus Edward,W10,1,111,127765.0,0.033838,0.011326
1,255,Grosel Jacek,W02,1,111,43289.571429,0.030957,0.001569
2,665,Misiewicz Jan,W11,1,103,83419.0,0.032079,1.0
3,1159,Wójcicki Zbigniew,W02,1,97,8782.5,0.030521,0.001498
4,78,Bożejko Wojciech,W04,1,88,17783.0,0.032307,0.004857
5,840,Reiner Jacek,W10,1,87,8980.5,0.031391,0.00988
6,583,Lis Robert,W05,1,86,15717.5,0.032087,0.001834
7,605,Madryas Cezary,W02,1,84,16582.0,0.03048,0.002167
8,1214,Łydżba Dariusz,W02,1,84,4665.0,0.028843,0.001308
10,1199,Ziółkowski Grzegorz,W10,1,83,12804.0,0.031221,0.005784


### Top N osób z najmniejszym gronem kolaborantów

In [57]:
# The 20 rows with the lowest degree, smallest first.
# NOTE(review): when many people share the same low degree, which of them make
# the cut is decided by sort order alone — confirm that is acceptable.
bottom_20_by_degree = (
    person_ranking
    .sort_values("degree", ascending=True)
    .iloc[:20]
)

In [58]:
# One bar per person among the 20 lowest-degree rows, coloured by faculty.
fig = px.bar(
    data_frame=bottom_20_by_degree,
    x="node_label",
    y="degree",
    color="faculty",
    color_discrete_map=FACULTY_COLOR_MAP,
)
# Sort the x categories by bar total, shortest first.
fig.update_layout(barmode="stack", xaxis=dict(categoryorder="total ascending"))
fig.show()

### Top N osób z największą liczbą prac (dorobek naukowy)

In [130]:
# Count how many rows of all_weighted.csv list each node in the "source"
# column and expose that count as "num_papers".
# NOTE(review): this is a per-"source" row count of the file — confirm that it
# really equals the number of papers the section heading refers to.
papers_count = (
    pd.read_csv("clean_data/all/all_weighted.csv")["source"]
    .value_counts()
    .rename("num_papers")
    .to_frame()
)
# Attach each person's ranking attributes by node_id, highest count first.
papers_ranking = (
    papers_count
    .join(person_ranking.set_index("node_id"))
    .sort_values("num_papers", ascending=False)
)
papers_ranking

Unnamed: 0,num_papers,node_label,faculty,component,degree,betweenness_centrality,closeness_centrality,eigen_centrality
115,55,Chlebus Edward,W10,1,111,127765.000000,0.033838,1.132636e-02
255,55,Grosel Jacek,W02,1,111,43289.571429,0.030957,1.569201e-03
665,52,Misiewicz Jan,W11,1,103,83419.000000,0.032079,1.000000e+00
1159,48,Wójcicki Zbigniew,W02,1,97,8782.500000,0.030521,1.497761e-03
78,44,Bożejko Wojciech,W04,1,88,17783.000000,0.032307,4.856717e-03
...,...,...,...,...,...,...,...,...
524,1,Kuchta Małgorzata,W13,1,2,0.000000,0.006518,4.894774e-04
300,1,Helowicz Andrzej,W02,1,2,0.000000,0.026452,6.118094e-04
658,1,Mierczyński Janusz,W13,1,2,0.000000,0.000816,7.234331e-04
653,1,Michalik Krzysztof,W13,2,2,0.000000,0.492336,0.000000e+00


In [129]:
# One bar per person for the first 20 rows of the papers ranking, coloured by faculty.
fig = px.bar(
    data_frame=papers_ranking.head(20),
    x="node_label",
    y="num_papers",
    color="faculty",
    color_discrete_map=FACULTY_COLOR_MAP,
)
# Sort the x categories by bar total, tallest first.
fig.update_layout(barmode="stack", xaxis=dict(categoryorder="total descending"))
fig.show()

### Liczba wierzchołków per wydział

Rozmiary wydziałów

In [59]:
# Number of people (graph vertices) per faculty.
# The index and the counts column are named explicitly because the column
# names produced by value_counts().reset_index() differ between pandas
# versions ("index"/"faculty" before 2.0, "faculty"/"count" from 2.0 on);
# relying on "index" fails on pandas >= 2.0.
faculty_sizes = (
    person_ranking["faculty"]
    .value_counts()
    .rename_axis("faculty")
    .reset_index(name="count")
)
fig = px.bar(
    faculty_sizes,
    x="faculty",
    y="count",
    color="faculty",
    color_discrete_map=FACULTY_COLOR_MAP,
)
# Keep the faculties in their canonical order rather than sorted by size.
fig.update_xaxes(categoryorder="array", categoryarray=FACULTIES)
fig.show()

### Rozkład stopni wierzchołka per wydział

In [60]:
# Box plot of vertex degree for each faculty, coloured with the shared palette.
fig = px.box(
    data_frame=person_ranking,
    x="faculty",
    y="degree",
    color="faculty",
    color_discrete_map=FACULTY_COLOR_MAP,
)
# Keep the faculties in their canonical order on the x axis.
fig.update_xaxes(categoryorder="array", categoryarray=FACULTIES)
fig.show()

### Istniejące wydziały - Community structure modularity and split-join distance

| Community detection | Number of communities | Community structure modularity | Split join distance |
|---------------------|----------------------|--------------------------------|---------------------|
| Walktrap            | 45                   | 0.6794                         | 530/184             |
| Eigenvector         | 31                   | 0.6373                         | 523/306             |
| Louvain             | 29                   | 0.6812                         | 600/234             |
| Infomap             | 901                  | 0.02591                        | 1162/130            |
| Spinglass(10)       | 11                   | 0.6589                         | 489/446             |
| Spinglass(11)       | 12                   | 0.6300                         | 547/556             |
| **Faculty**         | **12**               | **0.6850**                     | **---**             |
| Spinglass(12)       | 13                   | 0.6768                         | 495/409             |
| Spinglass(14)       | 14                   | 0.6719                         | 540/462             |


Próbowano powyższych algorytmów klastrowania, ale ostatecznie wybrano spinglass, bo pozwala na ograniczenie z góry liczby komponentów, na które dzielony jest graf.

[Dokumentacja split join](https://search.r-project.org/CRAN/refmans/igraph/html/split_join_distance.html)


First, each set in partition A is evaluated against all sets in partition B. For each set in partition A, the best matching set in partition B is found and the overlap size is calculated. (Matching is quantified by the size of the overlap between the two sets). Then, the maximal overlap sizes for each set in A are summed together and subtracted from the number of elements in A.

The split-join distance will be returned as two numbers, the first is the projection distance of the first partition from the second, while the second number is the projection distance of the second partition from the first. This makes it easier to detect whether a partition is a subpartition of the other, since in this case, the corresponding distance will be zero.

_Tbh nie wiem, która z tych odległości w split-join może być sensowniejsza_

- [Teoria do modularity](https://en.wikipedia.org/wiki/Modularity_(networks)#:~:text=Networks%20with%20high%20modularity%20have,detecting%20community%20structure%20in%20networks.)
- [Modularity v2](https://sparkling-graph.readthedocs.io/en/latest/modularity.html)

### Istniejące wydziały - Odległość między wydziałami

Nie wiem jeszcze, czy da się to policzyć, ale warto wspomnieć, że np. W02 jest oddalone od W10 bardziej niż np. W04, W08 - spróbuję tu wstawić jakąś macierz.

### Spinglass clusters

Metodologia: Główny komponent podzielono za pomocą spinglass, a resztę (pozostałe 3 komponenty) dodano jako osobny klaster (to było W13), bo spinglass nie wspiera klastrowania na niespójnych grafach.

In [93]:
def calculate_iou_scores(persons_df: pd.DataFrame, faculties: list[str] = FACULTIES) -> pd.DataFrame:
    """Score every faculty against every community with intersection-over-union.

    Args:
        persons_df: Table with one row per person; the columns ``node_id``,
            ``faculty`` and ``community_id`` are read.
        faculties: Faculty codes to score. Each becomes one column of the
            result, in the given order.

    Returns:
        A DataFrame with one column per faculty and one row per distinct
        ``community_id``. Rows follow ascending ``community_id`` order but carry
        a positional 0..n-1 index, not the community ids themselves. Each cell
        is ``|faculty ∩ community| / |faculty ∪ community|`` computed over sets
        of ``node_id``, or 0.0 when both sets are empty.
    """
    community_ids = sorted(pd.unique(persons_df["community_id"]))

    # A community's member set does not depend on the faculty, so build each
    # one once here instead of once per (faculty, community) pair.
    community_members = [
        set(persons_df.loc[persons_df["community_id"] == community_id, "node_id"])
        for community_id in community_ids
    ]

    scores = {}
    for faculty in faculties:
        faculty_members = set(persons_df.loc[persons_df["faculty"] == faculty, "node_id"])
        faculty_scores = []
        for members in community_members:
            union_size = len(faculty_members | members)
            # Guard the degenerate case where both sets are empty, which would
            # otherwise raise ZeroDivisionError.
            iou = len(faculty_members & members) / union_size if union_size else 0.0
            faculty_scores.append(iou)
        scores[faculty] = faculty_scores
    return pd.DataFrame(scores)

### Heatmap IoU dla znalezionych klastrów

Na osi x - faktycznie istniejące wydziały, na y - klastry, w środku score IoU

Wersje dla różnej liczby klastrów spinglass dają różne efekty:

In [94]:
# Community assignment stored as "spinglass_extended_9", scored against the
# existing faculties.
communities_sg_9 = pd.read_csv(
    "r-complex-networks/spinglass_extended_9.csv"
)
spinglass_9_iou = calculate_iou_scores(persons_df=communities_sg_9)
spinglass_9_iou

Unnamed: 0,W02,W03,W04,W05,W06,W07,W08,W09,W10,W11,W12,W13
0,0.0,0.015564,0.003096,0.005181,0.0,0.596899,0.0,0.061856,0.109635,0.0,0.0,0.005747
1,0.776119,0.0,0.006494,0.0,0.011696,0.0,0.0,0.0,0.012658,0.0,0.0,0.018987
2,0.056075,0.093023,0.010169,0.0,0.0,0.0,0.0,0.0,0.136531,0.079755,0.079365,0.020548
3,0.011494,0.0,0.3125,0.061856,0.0,0.0,0.005076,0.0,0.002899,0.0,0.21608,0.005376
4,0.0,0.0,0.011268,0.045662,0.0,0.0,0.018433,0.004167,0.464286,0.072398,0.043307,0.004785
5,0.008511,0.069565,0.292887,0.0,0.0,0.0,0.017857,0.02139,0.032362,0.005376,0.014151,0.032258
6,0.0,0.413613,0.012158,0.0,0.0,0.0,0.0,0.32716,0.005865,0.0,0.0,0.0
7,0.007168,0.0,0.063253,0.013636,0.35,0.0,0.295181,0.030702,0.0,0.022124,0.003876,0.073684
8,0.0,0.052419,0.031847,0.251613,0.0,0.0,0.0,0.0,0.003003,0.231707,0.138614,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.391304


In [95]:
# IoU heatmap: faculties on the x axis, proposed clusters on the y axis.
# NOTE(review): the title says 10 while the file is numbered 9 — presumably the
# 9 spinglass communities plus the extra cluster for the remaining components;
# confirm.
px.imshow(
    spinglass_9_iou,
    x=FACULTIES,
    labels={"x": "Faculty", "y": "Proposed cluster"},
    title="Spinglass community structure (10 communities)",
    text_auto=".3f",
    color_continuous_scale="RdBu_r",
)

In [96]:
# Community assignment stored as "spinglass_extended_10", scored against the
# existing faculties.
communities_sg_10 = pd.read_csv(
    "r-complex-networks/spinglass_extended_10.csv"
)
spinglass_10_iou = calculate_iou_scores(persons_df=communities_sg_10)
spinglass_10_iou

Unnamed: 0,W02,W03,W04,W05,W06,W07,W08,W09,W10,W11,W12,W13
0,0.0,0.007905,0.308642,0.0,0.0,0.0,0.005587,0.226994,0.006135,0.0,0.004484,0.030488
1,0.0,0.0,0.0,0.155039,0.0,0.0,0.084615,0.059211,0.146825,0.032895,0.010929,0.0
2,0.007692,0.0,0.274809,0.004926,0.0,0.0,0.0,0.0,0.002915,0.049505,0.276596,0.005435
3,0.732877,0.0,0.009317,0.0,0.010753,0.0,0.0,0.0,0.043614,0.00495,0.0,0.017341
4,0.0,0.823529,0.011834,0.004739,0.0,0.0,0.0,0.056604,0.0,0.018519,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.810526,0.0,0.0,0.056338,0.005988,0.0,0.007143
6,0.004975,0.0,0.003663,0.0,0.691358,0.0,0.0,0.04,0.003534,0.013333,0.0,0.106195
7,0.0,0.0,0.049342,0.277027,0.0,0.0,0.321168,0.01005,0.06129,0.0,0.0,0.017964
8,0.051282,0.015936,0.009524,0.0,0.0,0.0,0.0,0.04712,0.018634,0.281046,0.258427,0.0
9,0.003717,0.0,0.06875,0.009524,0.0,0.0,0.004926,0.009009,0.44856,0.032864,0.0,0.015789


In [97]:
# IoU heatmap: faculties on the x axis, proposed clusters on the y axis.
# NOTE(review): the title says 11 while the file is numbered 10 — presumably
# the spinglass communities plus one extra cluster; confirm.
px.imshow(
    spinglass_10_iou,
    x=FACULTIES,
    labels={"x": "Faculty", "y": "Proposed cluster"},
    title="Spinglass community structure (11 communities)",
    text_auto=".3f",
    color_continuous_scale="RdBu_r",
)

In [98]:
# Community assignment stored as "spinglass_extended_11", scored against the
# existing faculties.
communities_sg_11 = pd.read_csv(
    "r-complex-networks/spinglass_extended_11.csv"
)
spinglass_11_iou = calculate_iou_scores(persons_df=communities_sg_11)
spinglass_11_iou

Unnamed: 0,W02,W03,W04,W05,W06,W07,W08,W09,W10,W11,W12,W13
0,0.0,0.330144,0.008876,0.0,0.0,0.0,0.0,0.051887,0.005731,0.183784,0.122727,0.0
1,0.78626,0.0,0.003279,0.0,0.011976,0.0,0.0,0.0,0.012821,0.0,0.0,0.00641
2,0.0,0.0,0.04811,0.086957,0.0,0.0,0.0,0.0,0.285714,0.0,0.060302,0.0
3,0.0,0.051948,0.238866,0.011494,0.0,0.0,0.0,0.010753,0.03268,0.0,0.104167,0.039735
4,0.009302,0.0,0.313636,0.089041,0.0,0.0,0.006667,0.04908,0.003356,0.0,0.0,0.0
5,0.058036,0.0,0.03,0.034682,0.0,0.0,0.0,0.104046,0.208333,0.056497,0.0,0.019108
6,0.0,0.0,0.017483,0.012579,0.0,0.0,0.085106,0.0,0.194444,0.097403,0.064865,0.007092
7,0.004292,0.021008,0.023411,0.0,0.495575,0.0,0.0,0.08046,0.0,0.022222,0.06,0.082759
8,0.0,0.014085,0.003597,0.013605,0.0,0.916667,0.0,0.0,0.0,0.0,0.0,0.007752
9,0.014218,0.232044,0.007042,0.0,0.0,0.0,0.0,0.166667,0.049645,0.0,0.032258,0.0


In [99]:
# IoU heatmap: faculties on the x axis, proposed clusters on the y axis.
# NOTE(review): the title says 12 while the file is numbered 11 — presumably
# the spinglass communities plus one extra cluster; confirm.
px.imshow(
    spinglass_11_iou,
    x=FACULTIES,
    labels={"x": "Faculty", "y": "Proposed cluster"},
    title="Spinglass community structure (12 communities)",
    text_auto=".3f",
    color_continuous_scale="RdBu_r",
)

In [100]:
# Community assignment stored as "spinglass_extended_12", scored against the
# existing faculties.
communities_sg_12 = pd.read_csv(
    "r-complex-networks/spinglass_extended_12.csv"
)
spinglass_12_iou = calculate_iou_scores(persons_df=communities_sg_12)
spinglass_12_iou

Unnamed: 0,W02,W03,W04,W05,W06,W07,W08,W09,W10,W11,W12,W13
0,0.0,0.0,0.014493,0.2,0.0,0.0,0.051852,0.396552,0.00346,0.0,0.0,0.015504
1,0.0,0.397906,0.01227,0.0,0.0,0.0,0.021277,0.0,0.16041,0.019608,0.0,0.0
2,0.0,0.019231,0.00365,0.0,0.0,0.026144,0.007353,0.0,0.003521,0.366071,0.167742,0.016129
3,0.0,0.02551,0.023256,0.135593,0.0,0.0,0.0,0.0,0.181034,0.0,0.0,0.0
4,0.007812,0.007547,0.067961,0.005025,0.408759,0.0,0.21519,0.0,0.00295,0.024631,0.0,0.077381
5,0.744681,0.040816,0.006329,0.005348,0.011173,0.0,0.0,0.0,0.006135,0.0,0.0,0.005952
6,0.0,0.164894,0.029197,0.041096,0.0,0.018634,0.0,0.0,0.017422,0.019108,0.175,0.023077
7,0.0,0.0,0.259259,0.014286,0.0,0.0,0.072,0.0,0.0,0.056338,0.011364,0.0
8,0.008621,0.0,0.318966,0.08642,0.0,0.0,0.012048,0.0,0.003175,0.0,0.065327,0.032895
9,0.056075,0.017316,0.017065,0.0,0.0,0.636364,0.0,0.065089,0.0,0.0,0.0,0.006757


In [92]:
# IoU heatmap: faculties on the x axis, proposed clusters on the y axis.
# NOTE(review): the title says 13 while the file is numbered 12 — presumably
# the spinglass communities plus one extra cluster; confirm.
px.imshow(
    spinglass_12_iou,
    x=FACULTIES,
    labels={"x": "Faculty", "y": "Proposed cluster"},
    title="Spinglass community structure (13 communities)",
    text_auto=".3f",
    color_continuous_scale="RdBu_r",
)

In [134]:
# Community assignment stored as "spinglass_11_hubs_removed", scored against
# the existing faculties.
communities_sg_11_hubs_removed = pd.read_csv(
    "r-complex-networks/spinglass_11_hubs_removed.csv"
)
spinglass_11_hubs_removed_iou = calculate_iou_scores(
    persons_df=communities_sg_11_hubs_removed,
)
spinglass_11_hubs_removed_iou

Unnamed: 0,W02,W03,W04,W05,W06,W07,W08,W09,W10,W11,W12,W13
0,0.0,0.016064,0.290984,0.005435,0.0,0.0,0.0,0.0,0.003106,0.005208,0.226519,0.012121
1,0.628931,0.0,0.073955,0.009901,0.010204,0.0,0.005102,0.0,0.005882,0.034146,0.004167,0.01087
2,0.0,0.0,0.334884,0.012903,0.0,0.0,0.013514,0.011905,0.017241,0.0,0.005181,0.069231
3,0.004405,0.029915,0.0,0.0,0.0,0.207792,0.0,0.419847,0.00974,0.0,0.055276,0.0
4,0.004739,0.0181,0.021352,0.00641,0.0,0.0,0.0,0.0,0.266094,0.114865,0.0,0.014599
5,0.0,0.0,0.013514,0.287879,0.0,0.0,0.417391,0.0,0.006536,0.022989,0.045455,0.006623
6,0.060185,0.0,0.027027,0.074074,0.0,0.316901,0.012121,0.1,0.003215,0.0,0.055,0.006452
7,0.0,0.676259,0.003401,0.0,0.0,0.0,0.0,0.0,0.0,0.035928,0.0,0.0
8,0.004739,0.046512,0.003497,0.0,0.589474,0.0,0.0,0.011905,0.027875,0.024845,0.0,0.085938
9,0.014218,0.060748,0.014035,0.0,0.0,0.0,0.0,0.0,0.053191,0.246269,0.159763,0.0


In [139]:
# IoU heatmap for the hubs-removed variant: faculties on the x axis, proposed
# clusters on the y axis.
px.imshow(
    spinglass_11_hubs_removed_iou,
    x=FACULTIES,
    labels={"x": "Faculty", "y": "Proposed cluster"},
    title="12 hubs removed - Spinglass community structure - (12 communities)",
    text_auto=".3f",
    color_continuous_scale="RdBu_r",
)