In [1]:
import tabula
import pdfplumber
import pandas as pd
import nltk
from nltk import word_tokenize
import re
from gensim.parsing.preprocessing import remove_stopwords
from service import Service
import docx

In [2]:
# Open the service-catalogue PDF; the resulting handle is passed to extract_all_tables.
# NOTE(review): the backslash separator in the path is Windows-specific, and the
# handle is never closed in the visible cells — confirm whether that matters here.
pdf = pdfplumber.open(r"Service Catalogues\SQ15-000107_A (4).pdf")
pdf

<pdfplumber.pdf.PDF at 0x23988656e88>

In [3]:
def extract_all_tables(pdf):
    """Collect every table found in ``pdf`` as a pandas DataFrame.

    Parameters
    ----------
    pdf : object exposing ``pages``; each page must provide ``extract_tables()``
        returning a list of tables, where a table is a list of rows and the
        first row holds the column headers.

    Returns
    -------
    dict
        Maps a key to a DataFrame built from one table. The first table on
        page ``i`` is stored under ``'table<i>'``; any further table on the
        same page is stored under ``'table<i>_<j>'`` (j = 1, 2, ...), so that
        several tables on one page no longer overwrite each other.
    """
    dictionary_of_tables = {}
    for page_number, page in enumerate(pdf.pages):
        table_number = 0
        for table in page.extract_tables():
            # An empty table has no header row to take the column names from.
            if not table:
                continue
            key = 'table' + str(page_number)
            if table_number:
                # Keep the historical key for the first table, suffix the rest.
                key += '_' + str(table_number)
            dictionary_of_tables[key] = pd.DataFrame(table[1:], columns=table[0])
            table_number += 1

    return dictionary_of_tables

In [4]:
# Extract the tables of the opened PDF into a dict of DataFrames and display it.
dictionary_of_tables = extract_all_tables(pdf)
dictionary_of_tables

{'table4':                                              Service  \
 0  Australian \nGovernment \nDisaster Recovery \n...   
 1             Business Continuity \nPlan Development   
 2                      Business Continuity \nTesting   
 3                      Business Criticality \nReview   
 4               Business Impact \nAnalysis workshops   
 5                Monitoring \nbusiness \ndisruptions   
 6                                            Whispir   
 
                                          Description  \
 0  Represent partner departments on or provide as...   
 1  Provide assistance to customers to develop bus...   
 2  Develop and facilitate business continuity sce...   
 3  Review of critical activities by assessing the...   
 4  Conduct workshops to identify critical busines...   
 5  Provides a central point for the coordination ...   
 6  Whispir is an online communications tool utili...   
 
                         Service Level  
 0            As agreed with cust

In [5]:
def filtering(string):
    """Normalise a text cell before it is embedded.

    Steps, in order:
      1. collapse every run of whitespace (newlines, tabs, spaces) into a
         single space, so words separated only by a line break stay separate;
      2. delete every character that is not an ASCII letter or a space;
      3. pass the result through gensim's ``remove_stopwords``.

    Parameters
    ----------
    string : str
        Raw text, possibly containing line breaks from table extraction.

    Returns
    -------
    str
        The cleaned text.
    """
    # Replace (rather than delete) line breaks: deleting them glued together
    # words that were separated only by a newline.
    string = re.sub(r'\s+', ' ', string)
    string = re.sub('[^a-zA-Z ]+', '', string)
    string = remove_stopwords(string)

    return string

In [6]:
def get_services(dictionary_of_tables):
    """Build one ``Service`` object per row of every service table.

    Parameters
    ----------
    dictionary_of_tables : dict
        Maps a key to a DataFrame, as produced by ``extract_all_tables``.

    Returns
    -------
    list
        ``Service`` instances, each created from the row's ``'Service'`` value
        (as a string) and its ``'Description'`` value after ``filtering``.
        Tables that lack either column are skipped instead of raising KeyError.
    """
    required_columns = {'Service', 'Description'}

    list_of_services = []
    for df in dictionary_of_tables.values():
        # Not every table in a document has to be a service table.
        if not required_columns.issubset(df.columns):
            continue
        for name, description in zip(df['Service'], df['Description']):
            list_of_services.append(Service(str(name), filtering(str(description))))

    return list_of_services

In [7]:
# Turn every extracted table row into a Service object and display the list.
list_of_services = get_services(dictionary_of_tables)
list_of_services

[<service.Service at 0x2399796e2c8>,
 <service.Service at 0x23997926f88>,
 <service.Service at 0x23988457408>,
 <service.Service at 0x23997926288>,
 <service.Service at 0x2399796eec8>,
 <service.Service at 0x23997953e08>,
 <service.Service at 0x23997953ec8>,
 <service.Service at 0x23997915b88>,
 <service.Service at 0x23997953a08>,
 <service.Service at 0x23988691908>,
 <service.Service at 0x23997953608>,
 <service.Service at 0x23988691a48>,
 <service.Service at 0x239979751c8>,
 <service.Service at 0x239979755c8>,
 <service.Service at 0x23997975448>,
 <service.Service at 0x23997975688>,
 <service.Service at 0x23997975088>,
 <service.Service at 0x23997975048>,
 <service.Service at 0x23997975c88>,
 <service.Service at 0x239979758c8>,
 <service.Service at 0x23997975848>,
 <service.Service at 0x23997975c08>,
 <service.Service at 0x23997975388>,
 <service.Service at 0x23997975b08>,
 <service.Service at 0x23997975dc8>,
 <service.Service at 0x23997975348>,
 <service.Service at 0x23997975f88>,
 

In [8]:
# Load the goals document and use the text of each paragraph as one goal.
# NOTE(review): a blank paragraph would become an empty goal string — confirm
# the document contains none. The backslash path separator is Windows-specific.
doc = docx.Document(r"Service Catalogues\Government-Goals.docx")
list_of_goals = [i.text for i in doc.paragraphs]
list_of_goals

['Equip our people and Australian businesses with the skills necessary to deliver world-leading digital services.',
 'Adopt better ways of working that bring people together quickly and efficiently and reduce risk.',
 'Collaborate with other sectors, including small and medium-sized enterprises, community organisations and academia.',
 'Develop sustainable platforms that we can share across government.',
 'Deliver value for people and businesses by managing costs and risks.',
 'Services will be smart and adapt to the data you choose to share.',
 'Policy and services will draw on data and analytics.',
 'Advanced technologies will improve decision-making and be transparent and auditable.',
 'Earn your trust through being strong custodians of your data.',
 'Customers will be able to access all government services digitally.',
 'Customers will have seamlessly integrated services that support your needs and life events.',
 'Customers will be able to choose a secure and easy to use digital i

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch 
import pandas as pd
import numpy as np
import sent2vec

In [10]:
# Class names for the NLI output. roberta_probability defines its own copy of
# this list, so this module-level one is not what that function reads.
labels = ['Contradiction','Neutral','Entailment']
# Load the pretrained "roberta-large-mnli" tokenizer and classification model.
tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
#print(len(tokenizer))
model_rob = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
def roberta_probability(sentence1,sentence2):
    """Score a sentence pair with the module-level ``model_rob`` NLI model.

    Parameters
    ----------
    sentence1, sentence2 : str
        The two texts, encoded together as one pair by ``tokenizer``. The
        pair is truncated ('longest_first') to at most 512 tokens.

    Returns
    -------
    dict
        Keys ``'Contradiction'``, ``'Neutral'`` and ``'Entailment'`` mapped to
        the softmax probability of that class, expressed in percent (0-100).
    """
    input_ids = tokenizer.encode(
        sentence1,
        sentence2,
        padding=True,
        max_length=512,
        truncation='longest_first',
        return_tensors="pt",
    )

    # Pure inference: skip autograd bookkeeping, and go through __call__
    # rather than .forward so that module hooks are honoured.
    with torch.no_grad():
        logits = model_rob(input_ids, return_dict=True).logits

    probs = torch.nn.functional.softmax(logits, dim=1).tolist()[0]

    # NOTE(review): assumes the model's class indices 0/1/2 correspond to
    # contradiction/neutral/entailment — confirm via model_rob.config.id2label.
    labels = ['Contradiction','Neutral','Entailment']

    return {label: prob * 100 for label, prob in zip(labels, probs)}

In [12]:
#from sentence_transformers import SentenceTransformer
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
# TF-Hub handle of the Universal Sentence Encoder (version 4).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" 
# NOTE: despite its name, sbert_model holds the module loaded from module_url
# (the Universal Sentence Encoder), not a sentence-transformers model.
sbert_model = hub.load(module_url)
print ("module %s loaded" % module_url)

INFO:absl:Using C:\Users\indva\AppData\Local\Temp\tfhub_modules to cache modules.


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [17]:
# Corpus to embed: every service description, followed by every goal sentence.
sentences = [i.description for i in list_of_services] + list_of_goals

In [18]:
def cosine(u, v):
    """Return the cosine similarity of vectors ``u`` and ``v``.

    Computed as the dot product divided by the product of the two Euclidean
    norms. No special handling is done for zero-length vectors.
    """
    dot_product = np.dot(u, v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return dot_product / norm_product

In [19]:
# Embed the whole corpus in a single model call and show how many embeddings came back.
sentence_embeddings = sbert_model(sentences)
len(sentence_embeddings)

293

In [20]:
# Pairwise cosine similarity between all sentence embeddings, as a list of
# lists: semantic_matrix[a][b] is the similarity of embedding a and embedding b.
semantic_matrix = [
    [cosine(row_vec, col_vec) for col_vec in sentence_embeddings]
    for row_vec in sentence_embeddings
]

In [21]:
# Display the pairwise cosine-similarity matrix (one inner list per sentence).
semantic_matrix

[[1.0,
  0.14969154,
  0.07737297,
  0.196591,
  0.093639,
  0.21185067,
  0.13704687,
  0.19984691,
  0.27060387,
  0.13049574,
  0.10902286,
  0.1520962,
  0.20129938,
  0.14668061,
  0.15685597,
  0.1333605,
  0.20792054,
  0.10011783,
  0.16359633,
  0.17213283,
  0.07202378,
  0.031416845,
  -0.040356364,
  -0.073372245,
  0.031904854,
  0.024384225,
  -0.027805569,
  0.044665966,
  0.113942735,
  0.25010675,
  0.24035467,
  0.0075060595,
  0.31636432,
  0.073655054,
  0.25411552,
  0.0745895,
  0.24118681,
  0.15291563,
  0.099918656,
  0.15038003,
  0.07167351,
  0.09893313,
  0.09075075,
  0.34732002,
  0.17749013,
  0.22583951,
  0.10520295,
  0.14254545,
  -0.010966608,
  0.056458756,
  0.0670486,
  0.008479748,
  0.0860083,
  0.046366856,
  0.14000389,
  0.047015805,
  0.21018448,
  0.069788486,
  0.07793725,
  0.13488263,
  0.16247779,
  0.03159623,
  0.23754963,
  0.19437014,
  0.07211836,
  0.113957606,
  0.2081495,
  0.068407975,
  -0.0058073965,
  0.058796886,
  0.09603

In [27]:
import nmslib

# Number of threads handed to nmslib's knnQueryBatch by get_knns.
NTHREADS = 8
def create_index(a):
    """Build and return an nmslib index over ``a`` in the 'cosinesimil' space.

    ``a`` is handed unchanged to ``addDataPointBatch``; the index is created
    with nmslib's default parameters.
    """
    nn_index = nmslib.init(space='cosinesimil')
    nn_index.addDataPointBatch(a)
    nn_index.createIndex()
    return nn_index

def get_knns(index, vecs, k=3, num_threads=None):
    """Query ``index`` for the ``k`` nearest neighbours of every vector in ``vecs``.

    Parameters
    ----------
    index : object providing ``knnQueryBatch(vecs, k=..., num_threads=...)``.
    vecs : batch of query vectors, passed through unchanged.
    k : int, default 3
        Number of neighbours requested per query.
    num_threads : int or None, default None
        Threads used for the batch query; ``None`` falls back to the
        module-level ``NTHREADS`` constant (the previous fixed behaviour).

    Returns
    -------
    zip
        The per-query results transposed with ``zip(*...)``: when each result
        is an ``(ids, distances)`` pair this yields all id arrays first, then
        all distance arrays.
    """
    if num_threads is None:
        num_threads = NTHREADS
    per_query_results = index.knnQueryBatch(vecs, k=k, num_threads=num_threads)
    return zip(*per_query_results)

# Build the nearest-neighbour index over every sentence embedding.
nn_wvs = create_index(sentence_embeddings)


def to_frame(x):
    """Stack per-query results into a DataFrame, dropping the first column.

    NOTE(review): dropping column 0 presumably removes each query's match
    with itself; with duplicate sentences the first hit may be a different
    row — confirm. Also assumes every query returned the same number of hits.
    """
    return pd.DataFrame(np.array(x)[:, 1:])


# Query the index with the indexed vectors themselves (k=5 hits per sentence).
idxs, dists = map(to_frame, get_knns(nn_wvs, sentence_embeddings, k=5))

# Long-format neighbour table: v1 = query row, v2 = neighbour index, dist = distance.
# `drop(columns=...)` replaces the positional `axis` argument, which newer
# pandas releases no longer accept.
catted = (
    pd.concat([idxs.stack().to_frame('idx'), dists.stack().to_frame('dist')], axis=1)
    .reset_index()
    .drop(columns='level_1')
    .rename(columns={'level_0': 'v1', 'idx': 'v2'})
)

In [28]:
# Display the neighbour table: v1 = query row, v2 = neighbour index, dist = distance from the index.
catted

Unnamed: 0,v1,v2,dist
0,0,9,0.846514
1,0,3,0.870476
2,0,5,0.912144
3,0,8,0.921245
4,1,0,0.759696
...,...,...,...
1167,291,5,0.686457
1168,292,9,0.434168
1169,292,11,0.438988
1170,292,5,0.769189


In [61]:
def create_query_vecs(goals):
    """Embed each goal separately with ``sbert_model``.

    Every goal is sent to the model as a one-element batch and the first
    (only) row of the result is kept.

    Returns a list with one embedding per entry of ``goals``, in order.
    """
    return [sbert_model([goal])[0] for goal in goals]

In [62]:
# Embed every goal sentence and show how many query vectors were produced.
query_vecs = create_query_vecs(list_of_goals)
len(query_vecs)

13

In [63]:
def create_similarity_dict(query_vecs,sentences):
    """Compute the cosine similarity of every goal vector to every sentence.

    Parameters
    ----------
    query_vecs : sequence of embeddings, one per goal.
    sentences : iterable of str
        Texts to compare against; each is embedded with ``sbert_model``.

    Returns
    -------
    dict
        ``'Goal<n>'`` (n = position in ``query_vecs``) mapped to a list of
        similarities, one per sentence, in the order of ``sentences``.
    """
    # Embed each sentence once up front: the embedding does not depend on the
    # goal, so recomputing it inside the goal loop only multiplied model calls.
    sentence_vecs = [sbert_model([sentence])[0] for sentence in sentences]

    similarity_dict = {}
    for goal_index, query_vec in enumerate(query_vecs):
        similarity_dict['Goal' + str(goal_index)] = [
            cosine(query_vec, sentence_vec) for sentence_vec in sentence_vecs
        ]

    return similarity_dict

In [64]:
# Similarity of every goal vector to every sentence in the corpus, keyed by 'Goal<n>'.
similarity_dict = create_similarity_dict(query_vecs,sentences)

In [65]:
def get_topN_similarity(similarity_dict,N):
    """Keep the ``N`` highest-scoring entries of each similarity list.

    Parameters
    ----------
    similarity_dict : dict mapping a key to a list of scores.
    N : int, number of entries to keep per key.

    Returns
    -------
    dict
        Same keys, each mapped to a list of ``(position, score)`` tuples
        ordered from highest to lowest score, where ``position`` is the
        index of the score in the original list.
    """
    top_similarity = {}
    for goal_key, scores in similarity_dict.items():
        # Ascending order of positions by score; the reversed tail is the top N.
        ascending_positions = sorted(range(len(scores)), key=lambda pos: scores[pos])
        best_positions = ascending_positions[:-N-1:-1]
        top_similarity[goal_key] = [(pos, scores[pos]) for pos in best_positions]

    return top_similarity

# For every goal keep the 10 most similar sentences as (sentence index, similarity) pairs.
goals_service_sim = get_topN_similarity(similarity_dict,10)
goals_service_sim

{'Goal0': [(39, 0.33283296),
  (178, 0.31909424),
  (198, 0.31481802),
  (180, 0.29052478),
  (36, 0.28780162),
  (0, 0.25991604),
  (44, 0.24864486),
  (1, 0.24030387),
  (172, 0.23467952),
  (236, 0.23307623)],
 'Goal1': [(79, 0.19348551),
  (62, 0.16177097),
  (171, 0.16068625),
  (193, 0.1584723),
  (113, 0.15780307),
  (105, 0.15780307),
  (88, 0.15780307),
  (75, 0.15780307),
  (70, 0.15780307),
  (121, 0.15521763)],
 'Goal2': [(156, 0.18536364),
  (97, 0.1522958),
  (94, 0.14913452),
  (194, 0.13367702),
  (54, 0.1302267),
  (188, 0.12720749),
  (1, 0.12422733),
  (233, 0.12336959),
  (60, 0.122653745),
  (143, 0.11872651)],
 'Goal3': [(62, 0.3077644),
  (206, 0.27241912),
  (20, 0.25716785),
  (1, 0.22206455),
  (61, 0.21235633),
  (60, 0.20910949),
  (134, 0.20336473),
  (209, 0.20292324),
  (231, 0.20050247),
  (22, 0.19275998)],
 'Goal4': [(59, 0.21732989),
  (94, 0.20297053),
  (141, 0.19977704),
  (265, 0.169562),
  (1, 0.16780242),
  (274, 0.1627271),
  (160, 0.16262588),

In [66]:
def get_postcond(sents):
    """Concatenate the sentences selected by ``sents`` into one text.

    ``sents`` is an iterable of indices into the module-level ``sentences``
    list. Each selected sentence is followed by a full stop and a space;
    an empty ``sents`` gives the empty string.
    """
    return ''.join(sentences[position] + '.' + ' ' for position in sents)
        
    

In [67]:
# For each goal, and for each of its top-matching sentences, look up that
# sentence's neighbours in `catted` and join the neighbour texts into one string.
search_vectors = {
    goal_key: [
        get_postcond(catted.loc[catted.v1 == match[0], 'v2'].tolist())
        for match in matches
    ]
    for goal_key, matches in goals_service_sim.items()
}
search_vectors

{'Goal0': ['Overseas Management provides advice support Pay Conditions activities associated employees working overseas. Advice domestic international travel policy travel rates incidental expenses frequently asked questions. Provision international mail freight processing handling. Provides setup modification removal access International phone calls. ',
  'Provision mail services National Office State Offices. Provision day mail processing handling delivery services. Processing distribution incoming mail. Provision classified mail processing handling services. ',
  'Practical advice support training digital accessibility align products services policy legislative requirements. Advice basic interpretation Commonwealth Procurement Grant Frameworks requirements including advice undertaking procurement grant activities access proforma documentation. Linux based cloud hosting environment websites web services external cloud infrastructure Infrastructure tailored support primary website pla

In [68]:
def get_cluster_prob(search_vectors):
    """Score every neighbour text of a goal for entailment of that goal.

    Parameters
    ----------
    search_vectors : dict
        Maps ``'Goal<n>'`` to a list of texts; ``<n>`` is the goal's index
        into the module-level ``list_of_goals``.

    Returns
    -------
    dict
        Same keys, each mapped to the list of ``'Entailment'`` percentages
        returned by ``roberta_probability(text, goal)``, one per text.
    """
    prob_dict = {}
    for k, v in search_vectors.items():
        # Parse the whole numeric suffix. Reading only the last character
        # mapped 'Goal10', 'Goal11', ... onto goals 0, 1, ...
        goal_index = int(re.search(r'\d+$', k).group())
        goal = list_of_goals[goal_index]
        prob_dict[k] = [roberta_probability(text, goal)['Entailment'] for text in v]

    return prob_dict
        

In [69]:
# Entailment percentage of each goal given each of its neighbour texts.
prob_dict = get_cluster_prob(search_vectors)
prob_dict

{'Goal0': [0.6953915115445852,
  2.2957801818847656,
  10.878563672304153,
  4.472941532731056,
  4.748795926570892,
  0.6565249059349298,
  7.43485763669014,
  0.7959520444273949,
  4.304185509681702,
  3.7830431014299393],
 'Goal1': [6.9458238780498505,
  5.345562472939491,
  6.797633320093155,
  9.250569343566895,
  2.748616971075535,
  2.748616971075535,
  2.748616971075535,
  2.748616971075535,
  2.748616971075535,
  4.847944900393486],
 'Goal2': [0.8688276633620262,
  7.596217840909958,
  1.1807339265942574,
  1.5518910251557827,
  0.6490922067314386,
  1.367800123989582,
  0.09387473692186177,
  0.8397049270570278,
  0.41078776121139526,
  1.5280102379620075],
 'Goal3': [1.2887376360595226,
  0.7132899481803179,
  4.136630147695541,
  0.7171529345214367,
  1.3629775494337082,
  1.5738625079393387,
  0.9567162953317165,
  9.334921836853027,
  5.96855953335762,
  3.3278629183769226],
 'Goal4': [37.286177277565,
  61.951398849487305,
  29.920318722724915,
  34.405407309532166,
  10