# Propagation

This notebook is created for research and testing our propagation algorithm. Install the dependent packages listed in `environment.yml` using conda.

## Define Functions

### Database  methods

In [174]:
import pymongo
from bson.objectid import ObjectId

# Connection string for a MongoDB server on the local machine (port 27017).
host = "mongodb://localhost:27017/?directConnection=true"
client = pymongo.MongoClient(host)
# Handle to the `rampvis` database; the helper functions below read its collections.
db = client.rampvis

In [175]:
def get_vis(vis_id):
    '''
    Fetch one VIS function document from the `onto_vis` collection.

    vis_id: string form of the document's ObjectId.
    Returns whatever `find_one` yields for that `_id` (the matching document).
    '''
    vis_collection = db['onto_vis']
    return vis_collection.find_one({'_id': ObjectId(vis_id)})

In [176]:
def find_example_data_of_vis(vis_id):
    '''
    Return the example data streams bound to a VIS function.

    Looks up the pages in `onto_page` whose `visId` equals `vis_id`, takes the
    `dataIds` list of the last matching page (earlier pages are overwritten,
    as before), and loads the corresponding documents from `onto_data`.

    vis_id: id of the VIS function, as the string stored in `onto_page.visId`.
    Returns a list of dicts, one per data stream, in which the Mongo `_id` is
    replaced by a string `id` field. The list is empty when no page references
    the VIS function or the page has no `dataIds`.
    '''
    cursor = db['onto_page'].find({'visId': vis_id})

    data_ids = []
    for doc in cursor:
        # A page without a `dataIds` field yields None; fall back to an empty
        # list so the ObjectId conversion below cannot fail on it.
        data_ids = doc.get('dataIds') or []

    print(data_ids)
    # Find example data with data_ids
    res = db['onto_data'].find(
        {'_id': {'$in': [ObjectId(d) for d in data_ids]}})

    data = []
    for d in res:
        # Build a copy without `_id` (instead of deleting from the cursor
        # document) and expose the id as a plain string under `id`.
        stream = {key: value for key, value in d.items() if key != '_id'}
        stream['id'] = str(d['_id'])
        data.append(stream)

    return data


# find_example_data_of_vis("61c2ae8114fc7399b44f17c4")

### Search methods
Connect to the search engine (Elasticsearch). The search engine indexes the Ontology database, where we can perform complex searches. The search engine also periodically synchronizes with the Ontology database to pick up any addition, deletion or update to the Ontology.

In [177]:
from elasticsearch import Elasticsearch

# Elasticsearch node on the local machine; the port is passed separately below.
# NOTE: this rebinds `host`, which earlier held the MongoDB connection string.
host = ['http://localhost']
es = Elasticsearch(host, port=9200)

# Stop here with an explicit error if the search engine does not answer the ping.
if not es.ping():
    raise ValueError("Search engine connection failed")

### Plotting functions

Function to plot matrix as a heatmap.

In [178]:
import matplotlib.pyplot as plt
import seaborn as sns


def plot_matrix(matrix, is_dss=True):
    '''
    Plot a similarity matrix as an annotated heatmap.

    matrix: 2-D array exposing `.shape`; the colour scale is fixed to [0, 1].
    is_dss: when True (default) the rows are labelled D1..Dn, i.e. the matrix
            is treated as discovered-vs-discovered (Sdd); when False the rows
            are labelled R1..Rn, i.e. reference-vs-discovered (Srd).
            Columns are always labelled D1..Dm.

    The tick labels are generated from the matrix shape, so the function works
    for any matrix size (a 12 x 12 matrix gets the same D1..D12 labels as before).
    '''
    n_rows, n_cols = matrix.shape
    fig, ax = plt.subplots(figsize=(n_cols * .8, n_rows * .8))

    row_prefix = 'D' if is_dss else 'R'
    x_axis_labels = [f'D{i}' for i in range(1, n_cols + 1)]  # labels for x-axis
    y_axis_labels = [f'{row_prefix}{i}' for i in range(1, n_rows + 1)]  # labels for y-axis

    # Passing the labels to heatmap keeps one label per row/column, so the
    # label count always matches the number of ticks.
    sns.heatmap(matrix,
                vmin=0,
                vmax=1,
                cmap="Blues",
                annot=True,
                fmt=".1g",
                linewidths=2,
                xticklabels=x_axis_labels,
                yticklabels=y_axis_labels,
                ax=ax)

    
# To test/debug the plot
#plot_matrix(Dt)

### Similarity measurement algorithms

#### Datatype (boolean)

Compute a pairwise similarity between example data streams ($A$) and search results ($B$) based on the data type field.
The similarity function is defined as,

\begin{equation}
  T(A_x, B_y) =  
      \begin{cases}
        1, & \text{if data type of both } A_x \text{ and } B_y \text{ are similar} \\
        0, & \text{otherwise}
      \end{cases}
\end{equation}



In [179]:
from sklearn.metrics.pairwise import euclidean_distances

def pairwise_boolean_similarity(X, Y):
    '''
    Pairwise boolean similarity between the rows of X and the rows of Y.

    The similarity is 1 minus the Euclidean distance, with negative values
    clipped to 0: identical rows score 1, rows at distance >= 1 score 0.
    X & Y are Bag of Words
    '''
    distances = euclidean_distances(X, Y)
    return (1 - distances).clip(min=0)

#### Keyword similarity (percentage of overlap)

Compute a pairwise similarity between example data streams ($A$) and the data streams returned by search results ($B$) based on the keywords field. The keywords field contains a set of keywords without repetition; therefore, we use the Jaccard similarity measurement, which is defined as,
 
\begin{equation}
K(A_x, B_y) = \frac{| A_x \cap B_y |} {| A_x \cup B_y |}
\end{equation}

In [180]:
from sklearn.metrics import jaccard_score

def pairwise_jaccard_similarty(X, Y):
    '''
    Pairwise Jaccard similarity between the items of X and the items of Y.

    Returns a len(X) x len(Y) matrix whose entry (i, j) is
    jaccard_score(X[i], Y[j]).
    X & Y are Bag of Words
    '''
    sim = np.zeros(shape=(len(X), len(Y)))

    for row, item_x in enumerate(X):
        for col, item_y in enumerate(Y):
            sim[row][col] = jaccard_score(item_x, item_y)

    return sim

#### Description (Tf-IDF & Cosine)

Compute a pairwise similarity between example data streams ($A$) and search results ($B$) based on the description field. The description is expected to be free-form text; therefore, we use the cosine similarity measurement. The similarity function is defined as,

\begin{equation}
D(A_x, B_y) 
= { A_x  B_y \over \| A_x\| \| B_y\|} 
= \frac{ \sum_{i=1}^{n}{A_i B_i} }{ \sqrt{\sum_{i=1}^{n}{(A_i)^2}} \sqrt{\sum_{i=1}^{n}{(B_i)^2}} }
\end{equation}


In [181]:
from sklearn.metrics.pairwise import cosine_similarity

def pairwise_cosine_similarity(X, Y):
    '''
    Pairwise cosine similarity between the rows of X and the rows of Y.

    Thin wrapper that returns the result of sklearn's cosine_similarity(X, Y).
    X & Y are Bag of Words
    '''
    return cosine_similarity(X, Y)

#### Weighted average

Aggregate by element-wise multiplication of the similarity matrices computed above: keyword, datatype, and description (boosts the ranking).

\begin{equation}
S1(A_x, B_y) = T(A_x, B_y) \times  [\alpha K(A_x, B_y) + \beta D(A_x, B_y)]
\end{equation}

$\alpha$ and $\beta$ are scalar constants that define the influence of the keywords and description fields in the similarity measurement.

Weighted average

In [182]:
def weighted_average1(T, K, D = None, A = None, alpha=0.8, beta=0.1, gamma=0.1):
    '''
    Aggregate similarity matrices, gated element-wise by the data-type matrix.

    Computes T * (alpha * K + beta * D + gamma * A).

    T: data-type similarity matrix; its shape is used for the zero defaults.
    K: keyword similarity matrix.
    D: description similarity matrix; treated as all zeros when None.
    A: additional similarity matrix (the notebook passes the API endpoint
       similarity); treated as all zeros when None.
    alpha, beta, gamma: scalar weights of K, D and A respectively.
    '''
    description_term = np.zeros(T.shape) if D is None else D
    extra_term = np.zeros(T.shape) if A is None else A

    blended = alpha * K + beta * description_term + gamma * extra_term
    return T * blended

Cluster it is different (TODO: describe)

\begin{equation}
S2(A_x, B_y) = \alpha K(A_x, B_y) + \beta D(A_x, B_y)
\end{equation}

In [183]:
def weighted_average2(K, D = None, A = None, alpha=0.8, beta=0.1, gamma=0.1):
    '''
    Aggregate similarity matrices as a plain weighted sum (no data-type gate).

    Computes alpha * K + beta * D + gamma * A.

    K: keyword similarity matrix; its shape is used for the zero defaults.
    D: description similarity matrix; treated as all zeros when None.
    A: additional similarity matrix (the notebook passes the API endpoint
       similarity); treated as all zeros when None.
    alpha, beta, gamma: scalar weights of K, D and A respectively.
    '''
    description_term = np.zeros(K.shape) if D is None else D
    extra_term = np.zeros(K.shape) if A is None else A

    return alpha * K + beta * description_term + gamma * extra_term

## Search Datastreams

Given a VIS function and its example data streams in an ordered list, search for equivalent lists of data streams in the ontology that can be visualized using the VIS function. 
The search results are grouped into ordered lists and returned. The ontology manager checks if the example list matches the search result and propagates to create new pages. The steps are as follows: 

1. An ontology manager selects a VIS function. 
2. The system finds the example data streams bound to the selected VIS function. 
3. Then the ontology manager formulates a search query based on the available keywords and data types. 
4. The search engine retrieves a list of data streams matching the search criteria. 
5. A grouping algorithm groups the search results into possible matches.




<!-- 
**Output**

1. Group of data streams ranked
$$
D_1^1, D_2^1, D_3^1, \dots, D_m^1 \\
D_1^2, D_2^2, D_3^2, \dots, D_m^2 \\
\dots \\
D_1^n, D_2^n, D_3^n, \dots, D_m^n \\
$$

1. Group of links
$$
L_1^1, L_2^1, L_3^1, \dots, L_o^1 \\
L_1^2, L_2^2, L_3^2, \dots, L_o^2 \\
\dots \\
L_1^p, L_2^p, L_3^p, \dots, L_o^p \\
$$
 -->
 


### Example data streams 

Find the example data streams of a vis function. For example, VIS function `RegionalOverview`.

In [184]:

vis_id = "61c2ae8114fc7399b44f17c4" # DashboardMSOA
# "61b5e450b10fcfb0e5465851" # DashboardUTLA
# "604b3df699dcc1275fa024d0" # StackedBarChartWith6Places

get_vis(vis_id) 

{'_id': ObjectId('61c2ae8114fc7399b44f17c4'),
 'function': 'DashboardMSOA',
 'type': 'dashboard',
 'description': 'Covid-19 data on an MSOA level (Middle Layer Super Output Area)',
 'dataTypes': ['multi_timeseries']}

Reference

In [185]:
 R = find_example_data_of_vis(vis_id)

['61ebccf3e295021059c84322', '61ebccf4d51ba72c3279e78f']


In [186]:
len(R)

2

In [187]:
print(R)

[{'urlCode': 'API_GOVUK', 'endpoint': 'https://api.coronavirus.data.gov.uk/v2/data?areaType=msoa&metric=newCasesBySpecimenDateRollingSum&metric=newCasesBySpecimenDateRollingRate&metric=newCasesBySpecimenDateChangePercentage&format=json&areaCode=E02000961', 'dataType': 'timeseries', 'keywords': 'phe, msoa, group1, e02000961', 'description': '', 'date': datetime.datetime(2022, 1, 22, 9, 22, 59, 488000), 'id': '61ebccf3e295021059c84322'}, {'urlCode': 'API_GOVUK', 'endpoint': 'https://api.coronavirus.data.gov.uk/v2/data?areaType=msoa&metric=cumVaccinationFirstDoseUptakeByVaccinationDatePercentage&metric=cumVaccinationSecondDoseUptakeByVaccinationDatePercentage&metric=cumVaccinationThirdInjectionUptakeByVaccinationDatePercentage&format=json&areaCode=E02000961', 'dataType': 'timeseries', 'keywords': 'phe, msoa, group2, e02000961', 'description': '', 'date': datetime.datetime(2022, 1, 22, 9, 23, 0, 476000), 'id': '61ebccf4d51ba72c3279e78f'}]


List of example data streams.

### Search data streams

We can formulate and perform complex searches query using AND/OR and filter operation. 

For example, suppose we want to show an overview of all available Scotland regional COVID-19 data (e.g., Cumulative, ICU, Hospital Confirmed, and Suspected). The search query must find all Scotland raw data streams, for any available region, and filter the data to time-series and cumulative time-series, that is 

$$
MUST(k_a, k_b, ...) \ \Upsilon \ SHOULD(k_i, k_j, ...) \ \Upsilon \ FILTER(k_p, k_q, ...) \ \Upsilon \ MUST\_NOT(k_x, k_y, ...)
$$

- consistent number of (unconstrained) keywords: fixed 2, or as long as same, 
- number of matching (unconstrained) keywords: at least 2

**Stream Matching**

- `Should target` will not contribute to similarity score of example and target streams.
- `Must` can be removed
- `Should example` should boost the similarity.

**Grouping**

- Should target will not contribute to similarity score of example and target streams.
- Must can be removed
- Should example should boost the similarity.

In [188]:
# DashboardUTLA
# filter_keys = ["timeseries"]
# minimum_should_match = 1
# must_keys = ["phe", "utla"]
# must_not_keys = []
# should_keys =  ["group1", "group2"]
# should_keys_stop = ["group1", "group2"]


# DashboardMSOA
filter_keys = ["timeseries"]
minimum_should_match = 1
must_keys = ["phe", "msoa"]
must_not_keys = []
should_keys =  ["group1", "group2"]
should_keys_stop = ["group1", "group2"]


# StackedBarChartWith6Places
# filter_keys = ["timeseries"]
# minimum_should_match = 1
# must_keys = ["ons", "england", "mortality", "weekly", "local_authority", "place_of_death", "covid_deaths", "phong"]
# must_not_keys = ["all_deaths", "adur"]
# should_keys =  ["care_home", "elsewhere", "home", "hospice", "hospital", "other_communal_establishment"]
# should_keys_stop = ["care_home", "elsewhere", "home", "hospice", "hospital", "other_communal_establishment"]

Convert all keywords to lower-case

In [189]:
must_keys = [d.lower() for d in must_keys]
should_keys = [d.lower() for d in should_keys]
filter_keys = [d.lower() for d in filter_keys]
must_not_keys = [d.lower() for d in must_not_keys]

In [190]:
must_clause = [{'match': {'keywords': d}} for d in must_keys]
should_clause = [{'match': {'keywords': d}} for d in should_keys]
filter_clause = [{'term': {'dataType': d}} for d in filter_keys]
must_not_clause = [{'match': {'keywords': d}} for d in must_not_keys]

# TODO must_not_clause = [{'term': {'dataType': d}} for d in filter_keys] 

In [191]:
query = {
    "bool": {
        "must": must_clause,
        "should": should_clause,
        "must_not": must_not_clause,
        "minimum_should_match": minimum_should_match,
        "filter": {
            "bool": {
                "should": filter_clause
            }
        },
     }
}

query

{'bool': {'must': [{'match': {'keywords': 'phe'}},
   {'match': {'keywords': 'msoa'}}],
  'should': [{'match': {'keywords': 'group1'}},
   {'match': {'keywords': 'group2'}}],
  'must_not': [],
  'minimum_should_match': 1,
  'filter': {'bool': {'should': [{'term': {'dataType': 'timeseries'}}]}}}}

In [192]:
# Run the boolean query against the `rampvis.onto_data` index, asking for up to
# 15000 hits in a single response.
res = es.search(index='rampvis.onto_data',
                size=15000,
                body={"query": query})

# Flatten every hit into its source document plus the hit's `_id` as `id`.
D = [{**d['_source'],  'id': d['_id']}
                   for d in res["hits"]["hits"]]

Discovered data streams

In [193]:
len(D)

13582

In [194]:
import random
random.shuffle(D)

Filter example data streams if present in the search result. Also, filter the data streams that are already propagated.

In [195]:
# Keep only the search hits whose `id` does not match any example stream in R.
# NOTE(review): the text above also mentions removing already-propagated
# streams; this line does not do that.
D = [d1 for d1 in D if not next((d2 for d2 in R if d2['id'] == d1['id']), None) ]

In [196]:
# data_search
len(D)

13580

# Group and Rank

The grouping algorithms 

1. **Step-1** Create a similarity matrix, $SimMatA$, between each example data streams and search results, 
2. **Step-2** Create a similarity matrix, $SimMatB$, between each search results,
3. **Step-3** Cluster and group the searched data streams, $ClsMatB$ based on the similarity matrix, $SimMatB$, and 
3. **Step-4** Sort each group within the clusters of $ClsMatB$ based on similarity matrix $SimMatA$.

Merge example data streams at the top and searched data streams to a single array/matrix

In [197]:
import numpy as np

# Merge example (reference) data and searched data. The reference streams come
# first, so positions [0, rows) of RD are R and positions [rows, rows + cols) are D;
# the later slices `[0: rows]` / `[rows: ]` rely on this order.
RD = np.concatenate((R, D))
rows = len(R)
cols = len(D)

print(f'Example no. of data streams, rows: {rows} \nData streams returned by search result, cols: {cols} ')
print(f'Shape of merged data: {RD.shape}')

Example no. of data streams, rows: 2 
Data streams returned by search result, cols: 13580 
Shape of merged data: (13582,)


### Srd : Compute similarity between reference and discovered

Compute a pairwise similarity matrix between example data streams and search results (data streams). We calculate similarities between fields (e.g., keywords, description, and data type) and then aggregate the scores.

#### Datatype

In [198]:
datatype = [d['dataType'] for d in RD]

In [199]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(lowercase=True)
bow_datatype = count_vectorizer.fit_transform(datatype)

Srd_type = pairwise_boolean_similarity(bow_datatype[0: rows], bow_datatype[rows: ])
Srd_type.shape

(2, 13580)

In [157]:
# In fact, cosine distance will also give similar result
# sim_datatype = cosine_similarity(bow_datatype[0: len(data_ex)], bow_datatype[len(data_ex): ])

In [158]:
# datatype
# count_vectorizer.get_feature_names()
# bow_datatype.toarray()

In [None]:
plot_matrix(Srd_type)

Figure 1. Pairwise similarity matrix based on the data-type field, where the x-axis shows the searched data streams and the y-axis shows the example data streams.

#### Keywords

Compute pairwise similarity between example and search results
- `must_keys` - All search results will have the must keys, therefore they can be removed.
- `should_keys` (TBD)
  - `should` keys taken from the example streams will give a stream-to-stream match, e.g., (Example D1,  Search D1)
  - new `should` keys should give a group/cluster match, e.g., (Search D1, Search D2) 
  

In [200]:
keywords = [d['keywords'] for d in RD]
Srd_keywords =None

In [201]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 

# count_vectorizer = CountVectorizer(lowercase=True)
count_vectorizer = CountVectorizer(lowercase=True, stop_words=text.ENGLISH_STOP_WORDS) # .union(must_keys)
bow_keywords = count_vectorizer.fit_transform(keywords)
Srd_keywords = pairwise_jaccard_similarty(bow_keywords[0: rows].toarray(), bow_keywords[rows: ].toarray())

Srd_keywords.shape

(2, 13580)

 Debug

In [64]:
# keywords[0: len(data_ex)]
# keywords[len(data_ex): ]
# bow_keywords.toarray()[0: len(data_ex)]
# bow_keywords.toarray()[len(data_ex): ]
# count_vectorizer.get_feature_names()

In [None]:
plot_matrix(Srd_keywords)

Figure 2. Pairwise similarity matrix based on the keywords field, where the x-axis shows the searched data streams and the y-axis shows the example data streams.

### Description

If description field is there!

In [69]:
description = [d['description'] for d in RD]
Srd_description = None

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
bow_description = tfidf_vectorizer.fit_transform(description)

Srd_description = pairwise_cosine_similarity(bow_description[0: rows], bow_description[rows: ])

Srd_description.shape

Debug

In [270]:
# bow_description.toarray()[len(ex_data): ]
# bow_description.toarray()[0: len(ex_data)]
# tfidf_vectorizer.get_feature_names()

In [271]:
# len(tfidf_vectorizer.get_feature_names())

In [272]:
# bow_description.toarray()

In [273]:
# plot_matrix(bow_description.toarray())

In [None]:
plot_matrix(Srd_description)

Figure 3. Pairwise similarity matrix based on the description field, where the x-axis shows the searched data streams and the y-axis shows the example data streams.

#### API Endpoint

In [202]:
endpoints = [d['endpoint'] for d in RD]
Srd_api = None

In [203]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
bow_description = tfidf_vectorizer.fit_transform(endpoints)

Srd_api = pairwise_cosine_similarity(bow_description[0: rows], bow_description[rows: ])
Srd_api.shape

(2, 13580)

In [None]:
plot_matrix(Srd_api)

#### Aggregated Srd

In [204]:
alpha = 0.6
beta = 0.2
gamma = 0.2

# Srd = weighted_average1(Srd_type, Srd_keywords, Srd_description, Srd_api, alpha, beta, gamma)
Srd = weighted_average1(Srd_type, Srd_keywords, None, Srd_api, alpha, beta, gamma)
Srd.shape

(2, 13580)

In [None]:
plot_matrix(Srd)

Figure 4. Srd

### Sdd: compute similarity between discovered

#### Datatype (N/A)

Compute a pair wise similarity between each search results ($B$) based on the data type field, using the equation $1$.

We must not include it, because a cluster might contain different data types, e.g., timeseries and cum-timeseries.
Including it would set the similarity value to 0.

#### Keywords

Compute a pair wise similarity between each search results ($B$) based on the keyword field, using the equation $2$.

- Should target will not contribute to similarity score of example and target streams.
- Must can be removed
- Should example should boost the similarity.

In [205]:
keywords = [d['keywords'] for d in D]
Sdd_keywords = None

In [206]:
from sklearn.feature_extraction import text


# count_vectorizer = CountVectorizer(lowercase=True, stop_words=['scotland', 'raw'])
count_vectorizer = CountVectorizer(lowercase=True, stop_words=text.ENGLISH_STOP_WORDS.union(must_keys)) # 
bow = count_vectorizer.fit_transform(keywords)

#K = pairwise_jaccard_similarty(bow.toarray(), bow.toarray())
Sdd_keywords = pairwise_cosine_similarity(bow.toarray(), bow.toarray())
Sdd_keywords.shape

(13580, 13580)

In [None]:
# count_vectorizer.get_feature_names()
# bow_keywords.toarray()
plot_matrix(Sdd_keywords)

### Description
If the description field exists!


Compute a pair wise similarity between each search results ($B$) based on the description, using the equation $3$.

In [128]:
description = [d['description'] for d in D]
Sdd_description = None

In [None]:
tfidf_vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
bow = tfidf_vectorizer.fit_transform(description)

Sdd_description = pairwise_cosine_similarity(bow, bow)
Sdd_description.shape

In [None]:
# tfidf_vectorizer.get_feature_names()
plot_matrix(Sdd_description)

#### API endpoint

Compute a pairwise similarity between each search result ($B$) based on the endpoint, using the equation $3$.

In [207]:
endpoint = [d['endpoint'] for d in D]
Sdd_api = None

In [208]:
tfidf_vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
bow = tfidf_vectorizer.fit_transform(endpoint)

Sdd_api = pairwise_cosine_similarity(bow, bow)
Sdd_api.shape

(13580, 13580)

In [None]:
# tfidf_vectorizer.get_feature_names()
plot_matrix(Sdd_api)

### Aggregated Sdd

In [209]:
alpha = 0.8
beta = 0.1
gamma = 0.1

# Sdd = weighted_average2(Sdd_keywords, Sdd_description, Sdd_api, alpha, beta, gamma)
Sdd = weighted_average2(Sdd_keywords, None, Sdd_api, alpha, beta, gamma)
Sdd.shape

(13580, 13580)

In [None]:
#np.fill_diagonal(M2, 0) # it will create a negative value when we shall invert the matrix
plot_matrix(Sdd)

Figure 5. Pairwise aggregated similarity matrix between searched data stream.

### Clustering Sdd

#### Spectral Clustering (CPU)

In [210]:
# One cluster per expected group: each group should hold as many discovered
# streams as there are reference streams, so divide and truncate to an integer.
# Raises ZeroDivisionError when R is empty.
n_clusters = int(len(D) / len(R))

print(f'Size of cluster = {len(R)}')
print(f'Number of clusters = {n_clusters}')

Size of cluster = 2
Number of clusters = 6790


In [None]:
from sklearn.cluster import SpectralClustering

# clustering = SpectralClustering(n_clusters=n_clusters, assign_labels="discretize", random_state=0).fit(sim_agg)
# NOTE(review): `Sdd` is a pairwise similarity matrix, but no `affinity` argument
# is passed here, so the estimator's default affinity is applied to it —
# confirm whether `affinity='precomputed'` was intended.
clustering = SpectralClustering(n_clusters=n_clusters).fit(Sdd)

# One cluster label per row of Sdd.
clustering.labels_.shape

#### K-Means Clustering (GPU)

In [211]:
# GPU
from cuml.cluster import KMeans
import cudf
import pandas as pd

def np2cudf(df):
    '''
    Convert a 2-D numpy array to a cuDF dataframe.

    Each column of the array becomes one dataframe column; the resulting
    columns are named by their position as strings ('0', '1', ...).
    '''
    n_features = df.shape[1]
    host_frame = pd.DataFrame({'fea%d' % col: df[:, col] for col in range(n_features)})

    gpu_frame = cudf.DataFrame()
    for position, name in enumerate(host_frame.columns):
        gpu_frame[str(position)] = host_frame[name]
    return gpu_frame


Sdd_pdf = np2cudf(Sdd)
kmeans_float = KMeans(n_clusters=n_clusters).fit(Sdd_pdf)

MemoryError: std::bad_alloc: CUDA error at: /home/saifulkhan/anaconda3/envs/cuml/include/rmm/mr/device/cuda_memory_resource.hpp

In [96]:
clustering.labels_.shape

(432,)

**Verify the clusters**

In [90]:
def group_data_streams(S, labels, similarity=None):
    '''
    Group data streams by cluster label and rank the groups by total score.

    S: sequence of data stream dicts; S[i] corresponds to column i of the
       similarity matrix.
    labels: cluster label for each stream, same length and order as S.
    similarity: 2-D array with one row per reference stream and one column
       per stream in S. When None, the notebook-level matrix `Srd` is used,
       which is what this function always read before the parameter existed.

    For every stream, the best-matching reference row (`idx`) and its
    similarity (`score`) are attached to a copy of the stream. Streams with
    the same label form a group, sorted by `idx`. The group score is the sum
    of its members' scores rounded to 3 decimals.

    Returns a list of {'score': ..., 'group': [...]} dicts, highest score first.
    '''
    if similarity is None:
        # Backward-compatible default: the reference-vs-discovered matrix
        # defined at notebook level.
        similarity = Srd

    members_by_label = dict()

    for i, d in enumerate(S):
        # Column i holds the similarity of stream i to every reference stream.
        column = similarity[:, i]
        idx = int(np.argmax(column))
        score = float(column[idx])

        # Member scores are stored unrounded for every member (previously only
        # the first member of each group was rounded); rounding happens once,
        # on the group total.
        members_by_label.setdefault(labels[i], []).append(
            {**d, 'score': score, 'idx': idx})

    groups = []
    for members in members_by_label.values():
        ordered = sorted(members, key=lambda member: member['idx'])
        group_score = sum(member['score'] for member in ordered)
        groups.append({'score': round(group_score, 3), 'group': ordered})

    groups.sort(key=lambda g: g['score'], reverse=True)

    return groups

Print some relevant meta-data of the clusters for manual verification and debugging.

In [91]:
group_data_streams(D, clustering.labels_)

[{'score': 30.028,
  'group': [{'dataType': 'timeseries',
    'date': '2022-01-22T11:06:12.162Z',
    'description': '',
    'endpoint': 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=newCasesBySpecimenDate&metric=newDeaths28DaysByDeathDate&metric=newPeopleVaccinatedFirstDoseByVaccinationDate&metric=newPeopleVaccinatedSecondDoseByVaccinationDate&metric=cumVaccinationThirdInjectionUptakeByVaccinationDatePercentage&format=json&areaCode=W06000013',
    'keywords': 'phe, utla, group1, w06000013',
    'urlCode': 'API_GOVUK',
    'id': '61ebe52424ff9587e26de82e',
    'score': 0.19975189818254194,
    'idx': 0},
   {'dataType': 'timeseries',
    'date': '2022-01-22T11:06:20.974Z',
    'description': '',
    'endpoint': 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=newCasesBySpecimenDate&metric=newDeaths28DaysByDeathDate&metric=newPeopleVaccinatedFirstDoseByVaccinationDate&metric=newPeopleVaccinatedSecondDoseByVaccinationDate&metric=cumVaccinationThirdInj

### Grouping and Ranking

In [133]:
def brute_force_group(Dss, m):
    '''
    Assign a group label to every column of a pairwise similarity matrix.

    Dss: 2-D array of similarities between data streams.
    m: number of streams per group.

    Rows are processed in order. Each row that has not been marked yet opens
    a new group made of the m columns with the largest values in that row;
    those columns are marked as visited and receive the group's label, even
    if an earlier group had already labelled them.

    Returns a list with one label per column (None for columns never picked).
    '''
    n = Dss.shape[1]

    visited = [False] * n
    labels = [None] * n
    group_counter = 0

    for i in range(len(Dss)):
        if visited[i]:
            continue

        # Indices of the m largest values of row i, largest first.
        top_m = Dss[i].argsort()[-m:][::-1]
        for member in top_m:
            visited[member] = True
            labels[member] = group_counter

        group_counter += 1

    return labels

In [None]:
def brute_force_rank(Des, labels, S):
    '''
    Order the members of every group by their best-matching example stream.

    Des: 2-D array with one row per example stream and one column per
         searched stream.
    labels: group label of each searched stream (one per column of Des).
    S: sequence of searched stream objects, indexed like the columns of Des.

    For each column, the row with the highest value decides the slot of that
    stream inside its group (a later stream mapping to the same slot replaces
    the earlier one). Each group keeps a running score: the first member's
    score, then the average of the stored score and each new member's score.

    Prints the members of every group and returns the mapping
    label -> {'score': ..., 'group': [...]}.
    '''
    m = Des.shape[0]
    n = Des.shape[1]

    group_map = dict()

    for i in range(n):
        # column vector
        col_vec = Des[:, i]

        # index of example data stream that has the closest match
        idx_max = np.argmax(col_vec, axis=None, out=None)
        # degree of closeness
        score = col_vec[idx_max]

        group_object = group_map.get(labels[i])
        if group_object is None:
            group = [None] * m
            group[idx_max] = S[i]
            group_map[labels[i]] = {'score': score, 'group': group}
        else:
            # The averaged score used to be computed and then discarded;
            # store it so the group's score reflects all its members.
            group_object['score'] = (group_object['score'] + score) / 2
            group_object['group'][idx_max] = S[i]

    for k in group_map:
        print('\n')
        for l in group_map[k]['group']:
            print(l)

    return group_map
    


# NOTE(review): `Dss`, `Des` and `S` are not assigned anywhere in the visible
# notebook (the cells above build `Sdd`, `Srd` and `D`) — confirm what they
# should refer to before running this cell on a fresh kernel.
labels = brute_force_group(Dss, Des.shape[0])
labels
brute_force_rank(Des, labels, S)

In [135]:
def group(Dss, m):
    '''
    Partition the streams of a pairwise similarity matrix into groups of m.

    Dss: 2-D array of similarities between data streams.
    m: number of streams per group.

    Rows are scanned in order. Every row not yet marked as visited
    contributes one group: the indices of the m largest values in that row,
    largest first. Those indices are then marked as visited. The row and the
    chosen indices are printed for debugging.

    Returns the list of index arrays, one per group.
    '''
    n = Dss.shape[0]
    visited = [False] * n
    G = []

    for i in range(n):
        if visited[i]:
            continue

        row = Dss[i]
        # indices of the m largest values of the row, in descending order
        max_m_idx = row.argsort()[-m:][::-1]
        print('i = ', i, '\n', row)
        G.append(max_m_idx)
        print(max_m_idx)

        for position in range(m):
            visited[max_m_idx[position]] = True

    return G


G = group(Dss, Des.shape[0])
G

i =  0 
 [1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1.]
[11  8  1  0]
i =  2 
 [0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0.]
[7 6 3 2]
i =  4 
 [0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0.]
[10  9  5  4]


[array([11,  8,  1,  0]), array([7, 6, 3, 2]), array([10,  9,  5,  4])]

In [None]:
def rank(G, Des, S):
    '''
    Order the members of each group by their best-matching example stream.

    G: list of groups, each a sequence of column indices into Des; the size
       of the first group is taken as the size of every group.
    Des: 2-D array with one row per example stream and one column per
         searched stream.
    S: sequence of searched stream objects, indexed like the columns of Des.

    For every member, the row of Des with the highest value in the member's
    column decides its slot in the ranked group. The per-group sums of those
    highest values are printed.

    Returns a list of ranked groups (lists of stream objects, None where no
    member landed in a slot).
    '''
    # group size, taken from the first group
    m = len(G[0])

    G_ranked = []
    scores = []

    for members in G:
        group_ranked = [None] * m
        total_score = 0

        for j in range(m):
            # column of Des belonging to this member
            col_idx = members[j]
            col_vec = Des[:, col_idx]

            # row index of the highest score in that column
            row_idx = np.argmax(col_vec)
            total_score = total_score + col_vec[row_idx]

            group_ranked[row_idx] = S[col_idx]

        scores.append(total_score)
        G_ranked.append(group_ranked)

    print(scores)
    return G_ranked
            
# NOTE(review): this rebinds `R`, which earlier in the notebook holds the
# example (reference) data streams; cells that use `len(R)` would see the
# ranked groups instead if they are re-run after this cell.
R = rank(G, Des, S)
R

In [None]:
labels = brute_force_group(Dss, Des.shape[0])
labels

In [None]:
group_data_streams(S, labels)

#### Other Experiments

In [74]:
#
# K-Means custering
#

from sklearn.cluster import KMeans

clustering = KMeans(n_clusters=n_clusters).fit(Dss)
print(clustering.labels_)

[0 2 2 1 0 2 1 1 0 0 1 2]


In [None]:
group_data_streams(searched, clustering.labels_)

In [None]:
#
# Agglomerative custering
#

from sklearn.cluster import AgglomerativeClustering

clustering = AgglomerativeClustering(affinity='precomputed', n_clusters=n_clusters, linkage='complete').fit(S)
print(clustering.labels_)

In [None]:
#
# LSA
#

from sklearn.decomposition import TruncatedSVD
import pandas as pd

keywords = [d['keywords'] for d in data_search]
bow_keywords = count_vectorizer.fit_transform(keywords)

count_vectorizer.get_feature_names()

bow_keywords.toarray()

# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=4, algorithm='randomized', n_iter=100, random_state=122)
lsa = svd_model.fit_transform(bow_keywords)


pd.options.display.float_format = '{:,.16f}'.format
topic_encoded_df = pd.DataFrame(lsa, columns = ["topic_1", "topic_2", "topic_3", "topic_4"])

print(topic_encoded_df)