In [1]:
import rdflib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score

from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph
import networkx as nx

from knowledge_graph import *
from graph import *
from rdf2vec import RDF2VecTransformer

from rdf_graph_utils import rdf_to_plot, rdf_to_text

from lcs_rdf_graph import LCS

In [3]:
# URI DI TUTTE LE DRUGS
all_drugs_file = pd.read_csv('data/all_drugs.tsv', sep='\t')
all_drugs = [rdflib.URIRef(x) for x in all_drugs_file['drug']]

# PREDICATI DA ESCLUDERE NELL'ESTRAZIONE DEL GRAFO
predicates = pd.read_csv('data/bad_predicates.tsv', sep='\t')
predicates = [rdflib.URIRef(x) for x in predicates['predicate']]

# ESTRAZIONE DI UNINFORMATIVE TRIPLES E STOPPING PATTERNS DAI FILE
preds = pd.read_csv('data/uninformative.tsv', sep='\t')
preds = [rdflib.URIRef(x) for x in preds['uninformative']]

stop_patterns = pd.read_csv('data/stop_patterns.tsv', sep='\t')
stop_patterns = [x for x in stop_patterns['stopping_patterns']]

In [4]:
print(end='Loading data... ', flush=True)
g = rdflib.Graph()

g.parse('data/drugbank_complete_nt.nt', format="nt")
print('OK')

Loading data... OK


In [5]:
# TRASFORMAZIONE DELL'INTERO GRAFO IN KNOWLEDGE GRAPH (contenente matrice di adiacenze, necessario per rdf2vec)
kg = rdflib_to_kg(g, label_predicates=predicates)

In [6]:
# PER OGNI DRUG VIENE ESTRATTO IL SUB-GRAPH SE PRESENTE NELLA NOSTRO KB
i = 1
j = 1

kv = []
drugs = []
graphs = []
for drug in all_drugs:
    try:
        g = extract_instance(kg, drug, 4)
        graphs.append(g)
        drugs.append(drug)
        kv.append( {'graph': g, 'resource': drug} )
        i += 1
    except Exception as e:
        j += 1
        
print('ok:' + str(i))
print('not imported: ' + str(j))

ok:56


In [None]:
# ESTRAZIONE DEGLI EMBEDDINGS TRAMITE RDF2VEC E CLUSTERING
# Embeddings
transformer = RDF2VecTransformer(wl=False, max_path_depth=4, vector_size=15, walks_per_graph=8000)
embeddings = transformer.fit_transform(graphs, all_drugs)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline 


distortions = []
K = range(5,100)
for k in K:
    kmeanModel_3 = KMeans(n_clusters=k)
    preds = kmeanModel_3.fit_predict(embeddings)
    distortions.append(kmeanModel_3.inertia_)
    centers = kmeanModel_3.cluster_centers_

    score = silhouette_score(embeddings, preds)
    print("For n_clusters = {}, silhouette score is {})".format(k, score))
    
plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# CLUSTERING K-MEANS CON 7 CLUSTER
kmeans = KMeans(n_clusters=25)
k = kmeans.fit(embeddings)
y_kmeans = kmeans.predict(embeddings)

In [None]:
# STAMPO PER OGNI RISORSA IL CLUSTER A CUI CORRISPONDE, IN MODO DA POTERNE PRELEVARE PER IL MOMENTO
# DUE APPARTENENTI ALLO STESSO CLUSTER
k = 0
for y in y_kmeans:
    print(str(k) + ': ' + str(y))
    k += 1

In [7]:
# SELEZIONO I GRAFI RELATIVI A DUE DRUGS RISULTANTI NELLO STESSO CLUSTER
drug1 = drugs[48]
drug2 = drugs[50]

graph1 = graphs[48]
graph2 = graphs[50]

print(drug1)
print(drug2)

http://bio2rdf.org/drugbank:DB00182
http://bio2rdf.org/drugbank:DB00191


In [None]:
# CREAZIONE DEL GRAFO OTTENUTO DAL LCS
rdflib_x_Tx = LCS(graph1, graph2, depth=2, stop_patterns=stop_patterns, uninformative_triples=preds)
rdflib_x_Tx.find()

In [None]:
# RAPPRESENTAZIONE GRAFICA IN PLOT DEL GRAFO
rdf_to_plot(rdflib_x_Tx)

# SALVATAGGIO DEL GRAFO IN FORMATO 'NT'
rdf_to_text(rdflib_x_Tx, 'data/outputs', 'nt')

In [None]:
# SALVATAGGIO DEL GRAFO IN FORMATO 'NT'
rdf_to_text(rdflib_x_Tx, 'data/outputs', 'turtle')

In [None]:
# CHECK LENGTH  OF CLUSTERS
el_num = [0] * 25
for i in y_kmeans:
    el_num[i] += 1

for i in range(25):
    print(i, el_num[i])
    

Sezione per la rappresentazione grafica 2D e 3D dei cluster.

In [None]:
# ESTRAZIONE DELLE COMPONENTI PRINCIPALI PER POTER RAPPRESENTARE GRAFICAMENTE LA DISTINZIONE IN CLUSTER 
# 2 PC -> RAPPRESENTAZIONE 2D
# 3 PC -> RAPPRESENTAZIONE 3D

# PCA (2)
pca = PCA(n_components=2)
pca = pca.fit_transform(embeddings)

principalDf = pd.DataFrame(data=pca, columns=['pc1', 'pc2'])

In [None]:
# KMEANS SU LE 2 PC PER POTER INDIVIDUARE I PUNTI CENTRALI PER LA RAPPRESENTAZIONE GRAFICA (STEP NON NECESSARIO)
kmeans2 = KMeans(n_clusters=7)
kmeans2.fit(principalDf)

In [None]:
# rappresentazione grafica
plt.scatter(principalDf['pc1'], principalDf['pc2'], c=y_kmeans, s=50, cmap='viridis')

centers = np.asarray(kmeans2.cluster_centers_)
plt.scatter(centers[:,0], centers[:,1], c='black', s=200, alpha=0.5)

In [None]:
# PCA (3)
pca = PCA(n_components=3)
pca = pca.fit_transform(embeddings)

principalDf = pd.DataFrame(data=pca, columns=['pc1', 'pc2', 'pc3'])

kmeans2 = KMeans(n_clusters=7)
kmeans2.fit(principalDf)

In [None]:
fig = pyplot.figure()
ax = Axes3D(fig)
ax.scatter(principalDf['pc1'], principalDf['pc2'], principalDf['pc3'], c=y_kmeans, s=50, cmap='viridis')

centers = np.asarray(kmeans2.cluster_centers_)
ax.scatter(centers[:,0], centers[:,1], centers[:,2], c='black', s=200, alpha=0.5)

Test

In [None]:
for d in drugs:
    if d == rdflib.term.URIRef("http://bio2rdf.org/drugbank:DB00407"):
        print("ardeparin: " + str(drugs.index(d)))

for d in drugs:
    if d == rdflib.term.URIRef("http://bio2rdf.org/drugbank:DB01109"):
        print("heparin: " + str(drugs.index(d)))

for d in drugs:
    if d == rdflib.term.URIRef("http://bio2rdf.org/drugbank:DB00182"):
        print("a: " + str(drugs.index(d)))
    if d == rdflib.term.URIRef("http://bio2rdf.org/drugbank:DB00191"):
        print("b: " + str(drugs.index(d)))