**ΕΡΓΑΣΙΑ ΓΙΑ ΤΟ ΜΑΘΗΜΑ ΘΕΩΡΙΑ ΔΙΚΤΥΩΝ ΜΕ ΘΕΜΑ ΤΗΝ ΕΥΡΕΣΗ ΚΟΙΝΟΤΗΤΩΝ**
<br>ΘΩΜΑΣ ΚΥΡΙΑΚΟΣ ΠΡΑΒΙΝΟΣ 
<br>ΑΕΜ : 9937

In [1]:
import pandas as pd
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import community.community_louvain as community_louvain
from networkx.algorithms.community import greedy_modularity_communities

# Load the dataset and keep only the first 500 articles.
df = pd.read_csv('train.csv')
df = df.head(500)

# Load the sentence transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate one sentence embedding per article from its title
sentences = df['TITLE'].tolist()
embeddings = model.encode(sentences)

# Minimum cosine similarity two articles need in order to be linked by an edge
SIMILARITY_THRESHOLD = 0.2

# Create a NetworkX graph with one node per article, keyed by the article ID
G = nx.Graph()
node_ids = df['ID'].tolist()
for node_id, title in zip(node_ids, df['TITLE'].tolist()):
    G.add_node(node_id, title=title)

# Compute every pairwise cosine similarity in a single call instead of
# calling cosine_similarity once per pair of articles.
# With fewer than two articles there is nothing to compare.
if len(node_ids) > 1:
    similarity = cosine_similarity(embeddings)

    # Walk the strict upper triangle so each unordered pair is visited once
    # and no article is compared with itself.
    for i, source in enumerate(node_ids):
        for j in range(i + 1, len(node_ids)):
            sim = similarity[i, j]
            # Link the two articles, storing the similarity as the edge weight
            if sim > SIMILARITY_THRESHOLD:
                G.add_edge(source, node_ids[j], weight=sim)



In [2]:
from collections import Counter
import numpy as np
import random

# Seed used for the Louvain run so that repeated executions give the same partition.
LOUVAIN_SEED = 0
# The global NumPy seed is still set so that any other code relying on it
# behaves as before.
np.random.seed(LOUVAIN_SEED)

# Use the Louvain algorithm to find the communities in the graph.
# The seed is passed explicitly so the result does not depend on whatever
# else has consumed the global NumPy random state.
partition = community_louvain.best_partition(G, weight='weight', random_state=LOUVAIN_SEED)
print("Modularity: ", community_louvain.modularity(partition, G))

# Count the number of nodes in each community
community_count = Counter(partition.values())

# Group the nodes by community in a single pass over the partition
members_by_community = {}
for node, com in partition.items():
    members_by_community.setdefault(com, []).append(node)

# Print the communities found in the graph, in ascending community-id order
prediction = []
for com in sorted(members_by_community):
    list_nodes = members_by_community[com]
    prediction.append(list_nodes)
    print("Community ", com+1, ": ", list_nodes, ", count: ", community_count[com])

print(prediction)

# Store each node's community id on the graph and export it as GEXF
nx.set_node_attributes(G, partition, 'community')

nx.write_gexf(G, 'graph_title.gexf')

Modularity:  0.3730438520678947
Community  1 :  [1, 2, 21, 27, 35, 40, 45, 46, 49, 54, 59, 61, 66, 69, 82, 96, 97, 101, 108, 109, 112, 117, 125, 151, 169, 178, 179, 192, 193, 198, 205, 206, 211, 217, 220, 221, 226, 227, 232, 234, 238, 240, 247, 250, 254, 255, 256, 258, 261, 270, 272, 275, 277, 280, 287, 294, 299, 303, 313, 314, 317, 318, 322, 329, 330, 337, 345, 347, 355, 360, 361, 366, 381, 384, 397, 401, 405, 407, 408, 410, 418, 426, 430, 432, 434, 436, 437, 439, 445, 451, 453, 464, 468, 476, 477, 485, 498] , count:  97
Community  2 :  [3, 6, 14, 18, 25, 28, 32, 47, 55, 67, 68, 70, 79, 84, 88, 98, 99, 105, 107, 111, 113, 129, 130, 133, 134, 143, 144, 147, 148, 152, 153, 154, 155, 163, 164, 170, 171, 176, 184, 188, 200, 202, 207, 210, 213, 218, 219, 236, 249, 276, 284, 288, 302, 304, 310, 312, 316, 321, 333, 340, 341, 343, 348, 352, 353, 354, 356, 365, 368, 371, 375, 382, 388, 389, 391, 400, 406, 414, 415, 416, 417, 423, 424, 429, 433, 435, 440, 443, 449, 450, 454, 459, 462, 488, 490,

In [3]:
from sklearn.metrics import fowlkes_mallows_score

# Subject columns in priority order: an article flagged in several subjects
# is assigned the first one in this list.
CATEGORY_COLUMNS = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]


def _first_category(row):
    """Return the first subject in CATEGORY_COLUMNS flagged with 1, or 'None'."""
    for name in CATEGORY_COLUMNS:
        if row[name] == 1:
            return name
    return 'None'


# Attach a single ground-truth category to every article
df = df.assign(Category=df.apply(_first_category, axis=1))

category_counts = df['Category'].value_counts()
# Not used in this cell; kept so the name stays available to other cells.
category_groups = df.groupby('Category')
category_lists = [df.loc[df['Category'] == category, 'ID'].tolist() for category in category_counts.index]

# Number the categories 1..K in the order of category_counts
# (value_counts sorts by descending frequency).
category_label = {category: rank for rank, category in enumerate(category_counts.index, start=1)}

# Number the predicted communities 1..M in the order they appear in `prediction`
community_label = {
    node: label
    for label, members in enumerate(prediction, start=1)
    for node in members
}

# Build both label vectors in the DataFrame's row order, so they line up
# whatever the ID values are and however many rows were loaded.
# An article missing from `prediction` gets label 0.
true_categories = [category_label[category] for category in df['Category']]
predicted_communties = [community_label.get(node, 0) for node in df['ID']]

print(f"true:{true_categories}\n\npredicted:{predicted_communties}")

Community_Score = fowlkes_mallows_score(true_categories, predicted_communties)
# Convert score to percentage
percentage = Community_Score * 100
print("Fowlkes-Mallows score as percentage: {:.2f}%".format(percentage))


true:[1, 1, 3, 3, 1, 3, 2, 2, 2, 5, 1, 2, 2, 1, 2, 3, 2, 3, 4, 2, 5, 2, 1, 1, 3, 2, 1, 3, 3, 2, 1, 3, 1, 5, 1, 1, 2, 1, 2, 1, 4, 6, 2, 1, 1, 1, 3, 1, 1, 4, 1, 2, 1, 1, 1, 5, 2, 1, 1, 1, 1, 1, 2, 2, 2, 4, 3, 3, 1, 3, 3, 2, 1, 1, 2, 3, 1, 2, 3, 2, 4, 1, 1, 3, 1, 2, 2, 3, 2, 4, 2, 2, 3, 2, 2, 1, 1, 3, 3, 2, 1, 3, 1, 2, 1, 2, 3, 1, 1, 1, 2, 1, 3, 2, 1, 5, 1, 1, 1, 3, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 2, 3, 3, 4, 2, 3, 2, 5, 1, 1, 2, 3, 2, 3, 2, 3, 3, 5, 1, 1, 3, 3, 3, 3, 2, 3, 1, 1, 1, 1, 2, 1, 3, 2, 4, 2, 2, 1, 3, 1, 5, 3, 1, 1, 1, 4, 1, 1, 1, 1, 2, 2, 3, 4, 2, 1, 3, 1, 2, 2, 1, 1, 3, 3, 5, 1, 1, 2, 3, 2, 3, 1, 4, 1, 1, 3, 1, 2, 3, 1, 4, 3, 4, 1, 2, 4, 2, 3, 2, 1, 2, 5, 1, 1, 3, 5, 1, 1, 4, 2, 1, 1, 1, 1, 3, 2, 4, 2, 2, 2, 3, 4, 3, 2, 1, 1, 3, 3, 1, 3, 1, 1, 1, 1, 4, 1, 1, 1, 3, 5, 1, 2, 4, 1, 1, 4, 2, 6, 1, 3, 4, 3, 2, 1, 3, 1, 1, 1, 1, 4, 2, 1, 3, 1, 4, 4, 3, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 3, 1, 3, 2, 4, 1, 4, 3, 3, 2, 2, 1, 1, 2, 1, 1, 1, 1, 4, 3, 1, 2, 1, 4, 3, 2, 1, 1, 4, 3, 1

In [4]:
# Reload the dataset and keep only the first 500 articles.
df = pd.read_csv('train.csv')
df = df.head(500)

# Load the sentence transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate one sentence embedding per article from its abstract
sentences = df['ABSTRACT'].tolist()
embeddings = model.encode(sentences)

# Minimum cosine similarity two articles need in order to be linked by an edge
SIMILARITY_THRESHOLD = 0.2

# Create a NetworkX graph with one node per article, keyed by the article ID.
# The node attribute is still the article title, not the abstract.
G = nx.Graph()
node_ids = df['ID'].tolist()
for node_id, title in zip(node_ids, df['TITLE'].tolist()):
    G.add_node(node_id, title=title)

# Compute every pairwise cosine similarity in a single call instead of
# calling cosine_similarity once per pair of articles.
# With fewer than two articles there is nothing to compare.
if len(node_ids) > 1:
    similarity = cosine_similarity(embeddings)

    # Walk the strict upper triangle so each unordered pair is visited once
    # and no article is compared with itself.
    for i, source in enumerate(node_ids):
        for j in range(i + 1, len(node_ids)):
            sim = similarity[i, j]
            # Link the two articles, storing the similarity as the edge weight
            if sim > SIMILARITY_THRESHOLD:
                G.add_edge(source, node_ids[j], weight=sim)


# Seed used for the Louvain run so that repeated executions give the same partition.
LOUVAIN_SEED = 0
# The global NumPy seed is still set so that any other code relying on it
# behaves as before.
np.random.seed(LOUVAIN_SEED)

# Use the Louvain algorithm to find the communities in the graph.
# The seed is passed explicitly so the result does not depend on whatever
# else has consumed the global NumPy random state.
partition = community_louvain.best_partition(G, weight='weight', random_state=LOUVAIN_SEED)

# Count the number of nodes in each community
community_count = Counter(partition.values())

# Group the nodes by community in a single pass over the partition
members_by_community = {}
for node, com in partition.items():
    members_by_community.setdefault(com, []).append(node)

# One list of node ids per community, in ascending community-id order
prediction = [members_by_community[com] for com in sorted(members_by_community)]

# Store each node's community id on the graph and export it as GEXF
nx.set_node_attributes(G, partition, 'community')

nx.write_gexf(G, 'graph_abstract.gexf')


from sklearn.metrics import fowlkes_mallows_score

# Subject columns in priority order: an article flagged in several subjects
# is assigned the first one in this list.
CATEGORY_COLUMNS = [
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]


def _first_category(row):
    """Return the first subject in CATEGORY_COLUMNS flagged with 1, or 'None'."""
    for name in CATEGORY_COLUMNS:
        if row[name] == 1:
            return name
    return 'None'


# Attach a single ground-truth category to every article
df = df.assign(Category=df.apply(_first_category, axis=1))

category_counts = df['Category'].value_counts()
# Not used in this cell; kept so the name stays available to other cells.
category_groups = df.groupby('Category')
category_lists = [df.loc[df['Category'] == category, 'ID'].tolist() for category in category_counts.index]

# Number the categories 1..K in the order of category_counts
# (value_counts sorts by descending frequency).
category_label = {category: rank for rank, category in enumerate(category_counts.index, start=1)}

# Number the predicted communities 1..M in the order they appear in `prediction`
community_label = {
    node: label
    for label, members in enumerate(prediction, start=1)
    for node in members
}

# Build both label vectors in the DataFrame's row order, so they line up
# whatever the ID values are and however many rows were loaded.
# An article missing from `prediction` gets label 0.
true_categories = [category_label[category] for category in df['Category']]
predicted_communties = [community_label.get(node, 0) for node in df['ID']]

Community_Score = fowlkes_mallows_score(true_categories, predicted_communties)
# Convert score to percentage
percentage = Community_Score * 100
print("Fowlkes-Mallows score as percentage: {:.2f}%".format(percentage))

Fowlkes-Mallows score as percentage: 56.38%
