## Concept/Domain coverage analysis: Matching with App Domains

This notebook extracts corpus keywords and analyses the concepts emerging from three sources: source code, documentation, and tests.

## Show topics

In [1]:
import pandas as pd

# Uncomment to stop pandas from truncating long cell values in previews:
# pd.set_option('display.max_colwidth', None)

# Topic-modelling results, loaded as-is from the CSV export.
topics_df = pd.read_csv("topics_res_df.csv")
# Optional column subset:
# topics_df = topics_df[["project_name", "module", "code_topics", "test_topics", "doc_topics"]]

# Preview the five rows at positions 20 through 24.
topics_df.iloc[20:25]

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics
20,03_flink,flink-clients,8,9,"[(0, [('jar', 0.10477287), ('url', 0.06690521)...","[['jar', 'url', 'entry', 'program', 'setting',...","[(0, [('cluster', 0.11344488), ('factory', 0.0...","[['jar', 'url', 'entry', 'program', 'setting',...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."
21,03_flink,flink-connectors,9,9,"[(0, [('split', 0.17729564), ('source', 0.0911...","[['split', 'source', 'reader', 'hive', 'partit...","[(0, [('kafka', 0.047333054), ('partition', 0....","[['split', 'source', 'reader', 'hive', 'partit...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."
22,03_flink,flink-container,5,6,"[(0, [('cluster', 0.043478318), ('application'...","[['cluster', 'application', 'entry', 'line', '...","[(0, [('application', 0.08333337), ('standalon...","[['cluster', 'application', 'entry', 'line', '...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."
23,03_flink,flink-contrib,9,8,"[(0, [('event', 0.015625862), ('edit', 0.01559...","[['event', 'edit', 'diff', 'timestamp', 'chann...","[(0, [('context', 0.04166667), ('source', 0.04...","[['event', 'edit', 'diff', 'timestamp', 'chann...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."
24,03_flink,flink-core,7,9,"[(0, [('key', 0.10738443), ('comparator', 0.09...","[['key', 'comparator', 'value', 'normalize', '...","[(0, [('map', 0.13233617), ('integer', 0.10897...","[['key', 'comparator', 'value', 'normalize', '...",9,"[(0, [('scala', 0.0055014915), ('processing', ...","[['scala', 'processing', 'java', 'intellij', '..."


## Show annotated modules

In [2]:
# Load the module annotations and keep only the four columns used below.
anno_df = pd.read_csv("module_annotation.csv")[["project", "module", "top", "labels"]]
anno_df.head()

Unnamed: 0,project,module,top,labels
0,dubbo,dubbo-configcenter,big data,"['big data', 'instant messaging', 'user interf..."
1,dubbo,dubbo-remoting,server,"['server', 'instant messaging', 'web service',..."
2,dubbo,dubbo-spring-boot,microservices,"['microservices', 'web service', 'instant mess..."
3,dubbo,dubbo-serialization,serialization,"['serialization', 'database', 'file system', '..."
4,dubbo,dubbo-native,web server,"['web server', 'instant messaging', 'web servi..."


In [3]:
# Column dtypes and non-null counts of the annotation table.
anno_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 483 entries, 0 to 482
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   project  483 non-null    object
 1   module   483 non-null    object
 2   top      483 non-null    object
 3   labels   483 non-null    object
dtypes: object(4)
memory usage: 15.2+ KB


## Take a subset of df as test

In [4]:
# Worked example: narrow the topic table down to skywalking's apm-protocol module.

topics_sub_df = topics_df.loc[topics_df["module"].eq("apm-protocol")]
topics_sub_df

Unnamed: 0,project_name,module,code_num_topics,test_num_topics,code_shown_topics,code_topics,test_shown_topics,test_topics,doc_num_topics,doc_shown_topics,doc_topics
17,02_skywalking,apm-protocol,8,1,"[(0, [('command', 0.0175439), ('serializable',...","[['command', 'serializable', 'deserializable',...","[(0, [('command', 0.11111111), ('complete', 0....","[['command', 'serializable', 'deserializable',...",9,"[(0, [('trace', 0.0060752206), ('support', 0.0...","[['trace', 'support', 'metric', 'mail', 'nativ..."


In [5]:
# Annotation row(s) for the same example module.
anno_sub_df = anno_df.loc[anno_df["module"].eq("apm-protocol")]
anno_sub_df

Unnamed: 0,project,module,top,labels
26,skywalking,apm-protocol,server,"['server', 'plot', 'instant messaging', 'websi..."


In [6]:
# pd.set_option('display.max_colwidth', None)

# Join the topic row with its annotation row. The key is named explicitly:
# "module" is the only column the two frames share, so this matches what the
# implicit merge() selected, but it stays stable if either table later gains
# another same-named column. For this row "project" repeats "project_name"
# without the numeric prefix, so it is dropped after the join.
module_df = topics_sub_df.merge(anno_sub_df, on="module").drop(columns=["project"])
module_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   project_name       1 non-null      object
 1   module             1 non-null      object
 2   code_num_topics    1 non-null      int64 
 3   test_num_topics    1 non-null      int64 
 4   code_shown_topics  1 non-null      object
 5   code_topics        1 non-null      object
 6   test_shown_topics  1 non-null      object
 7   test_topics        1 non-null      object
 8   doc_num_topics     1 non-null      int64 
 9   doc_shown_topics   1 non-null      object
 10  doc_topics         1 non-null      object
 11  top                1 non-null      object
 12  labels             1 non-null      object
dtypes: int64(3), object(10)
memory usage: 236.0+ bytes


## LLM Matching of App Domains (AD) to Concepts

### Use embeddings, then calculate semantic similarity to match concepts with domains

- StackOverflow w2v
- BERT
- ELMo

### StackOverflow W2V

In [1]:
import os

from gensim.models.keyedvectors import KeyedVectors
from nltk import word_tokenize

# Location of the pre-trained StackOverflow word2vec binary. Set the
# SO_W2V_PATH environment variable to point at your own copy; the fallback is
# the path this notebook was originally run with, so existing setups keep working.
SO_W2V_PATH = os.environ.get(
    "SO_W2V_PATH",
    'C:/Users/biadge/OneDrive - BP/PhD/extraction/SO_vectors_200.bin',
)

# The vectors are stored in the binary word2vec format, hence binary=True.
so_w2v_model = KeyedVectors.load_word2vec_format(SO_W2V_PATH, binary=True)

In [2]:
import numpy as np
from scipy import spatial

# Every key in the loaded model's vocabulary, materialised as a set for
# membership tests (gensim >= 4.0 exposes the key list as index_to_key).
index2word_set = set(so_w2v_model.index_to_key)

def avg_feature_vector(sentence, model=None, num_features=200, index2word_set=None):
    """Return the mean embedding of the in-vocabulary words of ``sentence``.

    Parameters
    ----------
    sentence : str or iterable of str
        Either a whitespace-separated string (``"instant messaging"``) or an
        already tokenised sequence (``["command", "task"]``). A plain string
        is split on whitespace first; iterating it directly would walk it
        character by character and average letter vectors instead of words.
    model : mapping from word to vector, optional
        Anything supporting ``model[word]``. Defaults to the notebook-level
        ``so_w2v_model``, looked up at call time so that a re-loaded model is
        picked up without re-running this cell.
    num_features : int
        Length of the returned vector; must match the vector length of
        ``model``.
    index2word_set : container of str, optional
        Vocabulary used for the membership test. When omitted, membership is
        checked against ``model`` itself, so the vocabulary always matches
        the model actually being queried.

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(num_features,)``; all zeros when no word of
        ``sentence`` is in the vocabulary.
    """
    if model is None:
        model = so_w2v_model
    vocabulary = model if index2word_set is None else index2word_set
    # Strings are tokenised; any other iterable is taken to be tokens already.
    words = sentence.split() if isinstance(sentence, str) else sentence

    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in vocabulary:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    # Skip the division when nothing matched, leaving the zero vector intact.
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

AttributeError: The index2word attribute has been replaced by index_to_key since Gensim 4.0.0.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

In [None]:
def sow2v_sim(x, y):
    """Cosine similarity between the averaged embeddings of ``x`` and ``y``.

    Each argument is embedded with ``avg_feature_vector``; the result is one
    minus SciPy's cosine distance between the two resulting vectors.
    """
    embedded_x, embedded_y = avg_feature_vector(x), avg_feature_vector(y)
    return 1 - spatial.distance.cosine(embedded_x, embedded_y)

In [None]:
# Smoke test of the similarity helper on two single-word inputs.
sow2v_sim('king', 'queen')

In [12]:
# Take the code topics and candidate domain labels from the single merged row
# (index label 0). Both values are still the string form read from the CSV.
concept_list = module_df.loc[0, 'code_topics']
domains = module_df.loc[0, 'labels']

In [13]:
# Display the raw value: one string holding a list of topic word lists.
concept_list

"[['command', 'serializable', 'deserializable', 'task', 'number', 'builder', 'profile', 'duration', 'max', 'network'], ['command', 'builder', 'number', 'runtime', 'unsupported', 'path', 'add', 'serializable', 'serialize', 'trace'], ['trigger', 'ebpf', 'fix', 'gson', 'extension', 'process', 'update', 'target', 'task', 'command'], ['task', 'profile', 'command', 'duration', 'time', 'min', 'max', 'endpoint', 'count', 'dump'], ['command', 'discovery', 'uuid', 'number', 'deserializable', 'serializable', 'key', 'value', 'pair', 'deserialize'], ['command', 'serializable', 'deserializable', 'profile', 'number', 'duration', 'builder', 'task', 'unsupported', 'uuid'], ['setting', 'integer', 'network', 'rule', 'max', 'size', 'request', 'require', 'response', 'sample'], ['command', 'serializable', 'deserializable', 'number', 'builder', 'task', 'profile', 'unsupported', 'deserializer', 'max']]"

In [14]:
# Display the raw candidate-domain labels (likewise a single string).
domains

"['server', 'plot', 'instant messaging', 'website', 'file system', 'web server', 'database', 'command-line interface', 'World Wide Web', 'package management system', 'application performance management', 'client', 'web service', 'File Transfer Protocol', 'shell tool', 'user interface', 'telecommunications network', 'HTTP server', 'computer configuration', 'data binding', 'big data', 'extract, transform, load', 'object detection', 'data', 'security', 'web application', 'regular expression', 'data structure', 'web application security', 'smart contract', 'statistics', 'machine translation', 'social network', 'pattern matching', 'network monitoring', 'microservices', 'network security', 'time series', 'continuous integration', 'analytics', 'automation', 'object–relational mapping', 'HTTP client', 'neural machine translation', 'password manager', 'back end', 'operating system', 'WebSocket', 'embedded system', 'game server', 'font', 'evolutionary algorithm', 'data visualization', 'face dete

### using apm-protocol module as an example

In [15]:
import ast

# Parse the stringified list of lists into real Python lists, one per topic.
cl = ast.literal_eval(concept_list)

In [16]:
# Collects one [concept_words, llm_response] pair per topic.
match_res = []

# NOTE(review): `llm` and `system_prompt` are not defined in the visible cells;
# confirm they are created earlier in the notebook before running this loop.
for con in cl:
    # `domains` is passed as the unparsed string taken from the CSV.
    domain_res = llm(system_prompt.format(concepts=con, domains=domains))
    # Appending inside the loop keeps partial results if the run is interrupted.
    match_res.append([con, domain_res])

Llama.generate: prefix-match hit


["server", "plot", "instant messaging", "website", "file system", "web server", "database", "command line interface", "World Wide Web", "package management system", "application performance management", "client", "web service", "File Transfer Protocol", "shell tool", "user interface", "telecommunications network", "HTTP server", "computer configuration", "data binding", "big data", "extract transform load", "object detection", "data", "security", "web application", "regular expression", "data structure", "web application security", "smart contract", "statistics", "machine translation", "social network", "

KeyboardInterrupt: 

In [None]:
# Show full cell contents instead of truncating long strings.
pd.set_option('display.max_colwidth', None)

# One row per topic: its concept words and the LLM's answer, with trailing
# newlines stripped from the answer.
domain_res_df = (
    pd.DataFrame(match_res, columns=['concept', 'domain'])
    .assign(domain=lambda frame: frame['domain'].str.rstrip('\n'))
)
# domain_res_df['domain'] = domain_res_df['domain'].apply(lambda x: ast.literal_eval(x))
domain_res_df

Unnamed: 0,concept,domain
0,"[command, serializable, deserializable, task, number, builder, profile, duration, max, network]","[""server"", ""network"", ""instant messaging""]"
1,"[command, builder, number, runtime, unsupported, path, add, serializable, serialize, trace]","[""server"", ""network"", ""messaging""]"
2,"[trigger, ebpf, fix, gson, extension, process, update, target, task, command]","[""server"", ""web server"", ""website""]"
3,"[task, profile, command, duration, time, min, max, endpoint, count, dump]","[""server"", ""web server"", ""website""]"
4,"[command, discovery, uuid, number, deserializable, serializable, key, value, pair, deserialize]","[""server"", ""web server"", ""website""]"
5,"[command, serializable, deserializable, profile, number, duration, builder, task, unsupported, uuid]","[""server"", ""web server"", ""website""]"
6,"[setting, integer, network, rule, max, size, request, require, response, sample]","[""server"", ""network"", ""messaging""]"
7,"[command, serializable, deserializable, number, builder, task, profile, unsupported, deserializer, max]","[""server"", ""web server"", ""website""]"


In [None]:
# domain_res_top_df = domain_res_df['domain'].apply(lambda x: x[:3] if len(x) > 4 else x)
# domain_res_top_df