# Semantic Similarity with Python

    
### Set up

In [1]:
import ssmpy
import pandas as pd

In [2]:
# ssmpy config
# Per the ssmpy (DiShIn) README, these flags live in the `ssmpy.ssm` module.
# Assigning to `ssmpy.mica` / `ssmpy.intrinsic` would only rebind attributes on the
# package object, so the flags are set on `ssmpy.ssm` instead.
ssmpy.ssm.mica = True  # True: use MICA; False: use DCA
ssmpy.ssm.intrinsic = False  # False: extrinsic IC; True: intrinsic IC

### Functions

In [3]:
# IC calculation function
def ic(term):
    """Return the information content (IC) of an ontology term.

    The term name is resolved with ``ssmpy.get_id`` and the resulting id is
    passed to ``ssmpy.information_content``.

    Parameters
    ----------
    term : str
        Term name as passed to ``ssmpy.get_id`` (the notebook calls this with
        underscore-form ids such as "GO_0055114").

    Returns
    -------
    The value returned by ``ssmpy.information_content``, or the string
    'NA in OWL (OBSOLETE)' when that call raises ``TypeError``; in that case a
    notice naming the term is also printed.
    """
    # The id lookup stays outside the try block: only a TypeError raised by the
    # IC computation itself is treated as "term missing from the ontology".
    term_id = ssmpy.get_id(term)
    try:
        return ssmpy.information_content(term_id)
    except TypeError:
        # e.g. "GO_0055114" is absent from GO because it is obsolete
        placeholder = 'NA in OWL (OBSOLETE)'
        print("'{}' term not found in the ontology because may be OBSOLETE".format(term))
        return placeholder


### Load the semantic base structure
##### Download GO and annotations 

In [4]:
%%bash

# One-time download of the inputs used to build the semantic base: the GO ontology
# (go.owl) and the gzipped GOA UniProt annotation file, which is then unpacked.
# All commands are commented out, so running this cell does nothing;
# uncomment them to fetch the files again.
#wget http://purl.obolibrary.org/obo/go.owl
#wget http://geneontology.org/gene-associations/goa_uniprot_all_noiea.gaf.gz
#gunzip goa_uniprot_all_noiea.gaf.gz

In [7]:
# Create the semantic base from go.owl and the unpacked GAF annotation file
# (presumably writing go.db, the file loaded later with ssmpy.semantic_base).
# Left commented out because the call was already run in an earlier session;
# uncomment to rebuild the database.
#ssmpy.create_semantic_base("go.owl", "go.db", "http://purl.obolibrary.org/obo/", "http://www.w3.org/2000/01/rdf-schema#subClassOf", "goa_uniprot_all_noiea.gaf")

loading the ontology go.owl
calculating transitive closure at distance: 1
calculating transitive closure at distance: 2
calculating transitive closure at distance: 3
calculating transitive closure at distance: 4
calculating transitive closure at distance: 5
calculating transitive closure at distance: 6
calculating transitive closure at distance: 7
calculating transitive closure at distance: 8
calculating transitive closure at distance: 9
calculating transitive closure at distance: 10
calculating transitive closure at distance: 11
calculating transitive closure at distance: 12
calculating transitive closure at distance: 13
calculating transitive closure at distance: 14
calculating transitive closure at distance: 15
calculating transitive closure at distance: 16
calculating the frequency from file goa_uniprot_all_noiea.gaf
calculating the descendents
calculating the hierarchical frequency
the end


In [5]:
# Load the semantic base data structure from the local go.db file so that the
# ssmpy calls made later in the notebook (get_id, information_content) can use it.
ssmpy.semantic_base("go.db")

### Load input data

In [6]:
# read input data
# The input CSV is located by joining a directory prefix (which already ends
# with "/") and the file name.
path = "/home/nur/workspace/duchenne-paper-analyses/semantic-similarity/"
in_f_name = "termSummary10-GOBP-MaxSize5000-Summary.csv"
in_f = "{}{}".format(path, in_f_name)

data = pd.read_csv(in_f)

# Report (rows, columns), then preview the first two rows.
print(data.shape)
data.head(2)

(155, 11)


Unnamed: 0,Representing term id,Representing term name,Representing term size,Representing term rank,Represented term number,Eleni-GOBP.csv term rank,Freddie-GOBP.txt term rank,Nazli-GOBP.txt term rank,MOGAMUN-GOBP.csv term rank,pathfindR-GOBP.csv term rank,EnrichNet-GOBP.csv term rank
0,GO:0006952,defense response,1823,1,184,1,7,1,9,2.0,5.0
1,GO:0007165,signal transduction,6290,1,1,100,1,91,578,,


### Compute the IC for the input data and save output

In [7]:
# compute IC
# Work on a copy restricted to the expected input columns, in their output order.
data_ss = data[[
    'Representing term id', 'Representing term name',
    'Representing term size', 'Representing term rank',
    'Represented term number', 'Eleni-GOBP.csv term rank',
    'Freddie-GOBP.txt term rank', 'Nazli-GOBP.txt term rank',
    'MOGAMUN-GOBP.csv term rank', 'pathfindR-GOBP.csv term rank',
    'EnrichNet-GOBP.csv term rank',
]].copy()

# ic() is called with underscore-form ids (':' replaced by '_').
ic_keys = data_ss['Representing term id'].apply(lambda term_id: term_id.replace(':', '_'))

# Place the IC values as the third column, right after the term name.
data_ss.insert(2, 'Semantic Similarity (IC)', ic_keys.apply(ic))

# Store the ids back in colon form, derived from the underscore-form keys.
data_ss['Representing term id'] = ic_keys.apply(lambda term_id: term_id.replace('_', ':'))

print(data_ss.shape)
data_ss.head(2)

'GO_0055114' term not found in the ontology because may be OBSOLETE
'GO_0042107' term not found in the ontology because may be OBSOLETE
(155, 12)


Unnamed: 0,Representing term id,Representing term name,Semantic Similarity (IC),Representing term size,Representing term rank,Represented term number,Eleni-GOBP.csv term rank,Freddie-GOBP.txt term rank,Nazli-GOBP.txt term rank,MOGAMUN-GOBP.csv term rank,pathfindR-GOBP.csv term rank,EnrichNet-GOBP.csv term rank
0,GO:0006952,defense response,4.01619,1823,1,184,1,7,1,9,2.0,5.0
1,GO:0007165,signal transduction,3.50282,6290,1,1,100,1,91,578,,


In [8]:
# save file to CSV
# Written to the current working directory; index=False keeps the DataFrame's
# row index out of the file.
data_ss.to_csv(
    './termSummary10-GOBP-MaxSize5000-Summary-with-semantic-similarity2.csv',
    index=False,
)

In [9]:
# save file to xlsx
# Export data_ss (the table with the IC column) to an Excel workbook using the
# xlsxwriter engine, which must be installed in the kernel's environment.
# The `with` block saves and closes the workbook on exit; ExcelWriter.save()
# is not available in pandas >= 2.0.
with pd.ExcelWriter('./termSummary10-GOBP-MaxSize5000-Summary-with-semantic-similarity2.xlsx', engine='xlsxwriter') as writer:
    data_ss.to_excel(writer, sheet_name='Sheet1')

ModuleNotFoundError: No module named 'xlsxwriter'

In [10]:
# Importing xlsxwriter first makes a missing engine package fail immediately
# with a clear ModuleNotFoundError.
import xlsxwriter

# Export data_ss (the table with the IC column) to an Excel workbook.
# The `with` block saves and closes the workbook on exit; ExcelWriter.save()
# is not available in pandas >= 2.0.
with pd.ExcelWriter('./termSummary10-GOBP-MaxSize5000-Summary-with-semantic-similarity2.xlsx', engine='xlsxwriter') as writer:
    data_ss.to_excel(writer, sheet_name='Sheet1')

ModuleNotFoundError: No module named 'xlsxwriter'

In [None]:
# The Excel export uses engine='xlsxwriter', which needs the XlsxWriter package.
# If it is missing, install it into the kernel's environment, e.g.:
# %pip install XlsxWriter

In [12]:
# Importing xlsxwriter first makes a missing engine package fail immediately
# with a clear ModuleNotFoundError.
import xlsxwriter

# Export data_ss (the table with the IC column) to an Excel workbook.
# The `with` block saves and closes the workbook on exit; ExcelWriter.save()
# is not available in pandas >= 2.0.
with pd.ExcelWriter('./termSummary10-GOBP-MaxSize5000-Summary-with-semantic-similarity2.xlsx', engine='xlsxwriter') as writer:
    data_ss.to_excel(writer, sheet_name='Sheet1')