# MoEBERT code

In [1662]:
#!g1.1
%pip install transformers

Defaulting to user installation because normal site-packages is not writeable


In [2016]:
#!g1.1
#import tensorflow_hub as hub
import tensorflow as tf
import numpy as np
import pandas as pd
#from keras_preprocessing import sequence
#from keras_preprocessing.text import Tokenizer

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Layer, Input

from transformers import BertTokenizer, TFBertModel, BertConfig
from transformers.models.bert.modeling_tf_bert import TFBertLayer
from transformers import BertConfig

In [2017]:
#!g1.1
encoder = TFBertModel.from_pretrained("model", from_pt=True)
tokenizer = BertTokenizer.from_pretrained('model')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'bert.embeddings.position_ids', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [2018]:
#!g1.1
from tensorflow.keras.layers import MultiHeadAttention, Dropout, LayerNormalization, Dense
from tensorflow.keras.models import Sequential

class BertLayer(Layer):
    """One BERT-style encoder block: multi-head attention followed by a feed-forward net.

    Each of the two sub-blocks is followed by dropout, a residual connection and
    layer normalization.

    Args:
        num_heads: number of attention heads.
        embeddings_dim: hidden size; the per-head size is ``embeddings_dim // num_heads``.
        ff_dim: width of the intermediate feed-forward layer.
        i: index used only to build the names of the sub-layers.
    """

    def __init__(self, num_heads, embeddings_dim, ff_dim, i):
        super(BertLayer, self).__init__()
        self.num_heads = num_heads
        self.key_dim = embeddings_dim // num_heads
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=self.key_dim,  name="encoder_{}/multiheadattention".format(i))
        self.dp1 = Dropout(0.1, name="encoder_{}/att_dropout".format(i))
        self.ln1 = LayerNormalization(epsilon=1e-12, name="encoder_{}/att_layernormalization".format(i))
        self.ffn = Sequential([Dense(ff_dim, activation='gelu'),
                               Dense(embeddings_dim),],
                              name="encoder_{}/ffn".format(i))
        self.dp2 = Dropout(0.1, name="encoder_{}/ffn_dropout".format(i))
        self.ln2 = LayerNormalization(epsilon=1e-12, name="encoder_{}/ffn_layernormalization".format(i))

    def call(self, query, key, value, attention_mask):
        """Apply attention and the feed-forward net, each with a residual + layer norm.

        Returns the block output, which has the same shape as ``query``
        (it is added to ``query`` in the first residual connection).
        """
        # Keras MultiHeadAttention orders its positional arguments as
        # (query, value, key); pass key/value by keyword so they cannot be swapped.
        attended = self.att(query, value=value, key=key, attention_mask=attention_mask)
        attended = self.dp1(attended)
        attended = self.ln1(query + attended)

        transformed = self.ffn(attended)
        transformed = self.dp2(transformed)
        return self.ln2(attended + transformed)

    def set_pretrained(self, layer) :
        """Copy weights from ``layer`` into this block.

        ``layer`` must expose ``attention.self_attention``, ``attention.dense_output``,
        ``intermediate`` and ``bert_output`` (presumably a Hugging Face ``TFBertLayer``
        -- confirm against the caller). This block must already be built, because the
        source attention weights are reshaped to the shapes of ``self.att.weights``.
        """
        att_ws = layer.attention.self_attention.weights + layer.attention.dense_output.dense.weights
        for i in range(len(att_ws)) :
            # Source kernels are reshaped to the layout of the matching Keras attention weight.
            att_ws[i] = tf.reshape(att_ws[i], self.att.weights[i].shape)
        self.att.set_weights(att_ws)
        self.ln1.set_weights(layer.attention.dense_output.LayerNorm.weights)
        self.ffn.layers[0].set_weights(layer.intermediate.weights)
        self.ffn.layers[1].set_weights(layer.bert_output.dense.weights)
        self.ln2.set_weights(layer.bert_output.LayerNorm.weights)

In [2019]:
#!g1.1
from tensorflow.keras.layers import Add

class SatLayer(Layer) :
    """Weighted mixture of ``num_topics`` parallel ``BertLayer`` blocks.

    Every block processes the same input; block ``i``'s output is scaled by
    ``ws[:, i]`` and the scaled outputs are summed.

    Args:
        num_topics: number of parallel blocks.
        num_heads, embeddings_dim, ff_dim: forwarded to each ``BertLayer``.
    """

    def __init__(self, num_topics, num_heads, embeddings_dim, ff_dim):
        super(SatLayer, self).__init__()
        self.bertLayers = []
        self.addLayer = Add()
        self.num_topics = num_topics
        for i in range(num_topics) :
            # The name index is offset by 12 so it differs from the indices given to the main stack.
            self.bertLayers.append(BertLayer(num_heads, embeddings_dim, ff_dim, i + 12))

    def call(self, inputs, attention_mask, ws):
        """Return ``(mixture_output, ws)``.

        ``ws`` is indexed as ``ws[:, i]`` and reshaped to ``(-1, 1, 1)`` so it scales
        the whole output of block ``i`` per example; it is returned unchanged as the
        second element.
        """
        outputs = []
        for i in range(self.num_topics) :
            gate = tf.reshape(ws[:, i], (-1, 1, 1))
            outputs.append(self.bertLayers[i](inputs, inputs, inputs, attention_mask) * gate)

        return self.addLayer(outputs), ws

    def set_pretrained(self, layer) :
        """Initialise every parallel block from the same pretrained ``layer``."""
        # One pass is enough: each block, including the first, receives the same weights.
        for curLayer in self.bertLayers :
            curLayer.set_pretrained(layer)


class TopicBertBase(Model):
    """BERT-style encoder whose stack can contain one gated mixture layer (``SatLayer``).

    With ``use_sat=True`` the stack is ``topic_pos`` regular blocks, then the mixture
    layer, then the remaining regular blocks (``num_layers - 1`` regular blocks in
    total, all created with ``trainable = False``). With ``use_sat=False`` the stack is
    ``num_layers`` regular blocks and the mixture layer is not used in ``call``.

    Args:
        metaModel: callable applied to ``inputs['meta_info']``; its result is passed
            to the mixture layer as the per-block weights.
        num_topics: number of parallel blocks inside the mixture layer.
        topic_pos: number of regular blocks that run before the mixture layer.
        num_layers: total depth of the stack.
        num_heads, embeddings_dim, ff_dim: forwarded to every ``BertLayer``.
        max_len: number of positions embedded in ``call``.
        vocab_size: size of the word-embedding table.
        use_sat: whether the mixture layer is part of the stack.
    """

    def __init__(self, metaModel, num_topics, topic_pos, num_layers, num_heads, embeddings_dim, ff_dim, max_len, vocab_size, use_sat=True):
        super(TopicBertBase, self).__init__()
        self.num_layers = num_layers
        self.topic_pos = topic_pos
        self.max_len = max_len
        self.use_sat=use_sat

        self.metaModel = metaModel

        self.pos_emb = Embedding(max_len, embeddings_dim, name="position_embedding", trainable=False)
        self.word_emb = Embedding(vocab_size, embeddings_dim, name="word_embedding", trainable=False)
        self.seg_emb = Embedding(2, embeddings_dim, name="segment_embedding", trainable=False)
        self.dp = Dropout(0.1, name="encoder_emb_dropout", trainable=False)
        self.ln = LayerNormalization(epsilon=1e-12, name="encoder_emb_layernormalization", trainable=False)
        self.bertLayers = []
        self.satLayer = SatLayer(num_topics, num_heads, embeddings_dim, ff_dim)
        if self.use_sat == False :
            for i in range(num_layers) :
                self.bertLayers.append(BertLayer(num_heads, embeddings_dim, ff_dim, i))
        else :
            # One slot of the stack is taken by the mixture layer, hence num_layers - 1.
            for i in range(num_layers - 1) :
                self.bertLayers.append(BertLayer(num_heads, embeddings_dim, ff_dim, i))
                self.bertLayers[-1].trainable = False

    def call(self, inputs):
        """Encode a batch and return ``(hidden_states, sat_res)``.

        ``inputs`` is a mapping with ``'input_ids'``, ``'token_type_ids'`` and
        ``'attention_mask'``; ``'meta_info'`` is additionally read when
        ``use_sat`` is true. ``sat_res`` is the second value returned by the mixture
        layer, or ``None`` when ``use_sat`` is false.
        """
        outputs = self.word_emb(inputs['input_ids']) + self.seg_emb(inputs['token_type_ids']) + self.pos_emb(tf.range(start=0, limit=self.max_len, delta=1))
        # Zero the embeddings of padded positions.
        outputs *= tf.expand_dims(inputs['attention_mask'], -1)
        # Outer product of the padding mask with itself: entry (q, k) is non-zero only
        # when both positions are real tokens.
        attention_mask = tf.matmul(tf.expand_dims(inputs['attention_mask'], -1), tf.expand_dims(inputs['attention_mask'], -2))

        outputs = self.dp(outputs)
        outputs = self.ln(outputs)

        # Defined up front so the return below is valid on the use_sat == False path too.
        sat_res = None
        if self.use_sat == False :
            for i in range(len(self.bertLayers)) :
                outputs = self.bertLayers[i](outputs, outputs, outputs, attention_mask)
        else :
            for i in range(self.topic_pos) :
                outputs = self.bertLayers[i](outputs, outputs, outputs, attention_mask)

            outputs, sat_res = self.satLayer(outputs, attention_mask, self.metaModel(inputs['meta_info']))

            for i in range(self.topic_pos, self.num_layers - 1) :
                outputs = self.bertLayers[i](outputs, outputs, outputs, attention_mask)

        return outputs * tf.expand_dims(inputs['attention_mask'], -1), sat_res

    def set_pretrained(self, model) :
        """Copy embedding and encoder weights from ``model.bert``.

        ``model`` must expose ``bert.embeddings`` and ``bert.encoder.layer.layers``
        (the ``layers`` attribute of the encoder's layer container is what is indexed
        here). With ``use_sat`` true, source layer ``topic_pos`` initialises the
        mixture layer and later source layers are shifted by one.
        """
        self.pos_emb.set_weights([model.bert.embeddings.position_embeddings])
        self.word_emb.set_weights([model.bert.embeddings.weight])
        self.seg_emb.set_weights([model.bert.embeddings.token_type_embeddings])
        self.ln.set_weights(model.bert.embeddings.LayerNorm.weights)

        if self.use_sat == False :
            for i in range(len(self.bertLayers)) :
                self.bertLayers[i].set_pretrained(model.bert.encoder.layer.layers[i])
        else :
            for i in range(self.topic_pos) :
                self.bertLayers[i].set_pretrained(model.bert.encoder.layer.layers[i])

            self.satLayer.set_pretrained(model.bert.encoder.layer.layers[self.topic_pos])

            for i in range(self.topic_pos, self.num_layers - 1) :
                # Source index i + 1: source layer topic_pos was consumed by the mixture layer.
                self.bertLayers[i].set_pretrained(model.bert.encoder.layer.layers[i + 1])


In [2020]:
#!g1.1
class GraphMoEBERT(Model):
    """``TopicBertBase`` whose mixture weights come from a two-layer dense gating network.

    The gating network maps ``inputs['meta_info']`` through ``Dense(64, gelu)`` and
    ``Dense(num_topics, softmax)``; all other arguments are forwarded unchanged to
    ``TopicBertBase``.
    """

    def __init__(self, num_topics, topic_pos, num_layers, num_heads, embeddings_dim, ff_dim, max_len, vocab_size, use_sat=True):
        super().__init__()
        self.graphNN = Sequential([
            Dense(64, activation="gelu"),
            Dense(num_topics, activation="softmax"),
        ])
        self.MoEBERT = TopicBertBase(
            self.graphNN,
            num_topics,
            topic_pos,
            num_layers,
            num_heads,
            embeddings_dim,
            ff_dim,
            max_len,
            vocab_size,
            use_sat,
        )

    def call(self, inputs):
        """Delegate to the wrapped encoder and return its result as-is."""
        encoded = self.MoEBERT(inputs)
        return encoded

    def set_pretrained(self, model) :
        """Forward pretrained-weight loading to the wrapped encoder."""
        self.MoEBERT.set_pretrained(model)

In [2021]:
#!g1.1
class MLModel(Model) :
    """Masked-language-model head (softmax over the vocabulary) on top of an encoder.

    Args:
        curModel: the wrapped encoder. With ``is_h`` true its output is indexed with
            ``['last_hidden_state']``; otherwise it must return
            ``(hidden_states, sat_res)``.
        vocab_size: size of the output distribution.
        is_h: selects which of the two encoder output conventions is used.
    """

    # Divisor applied to the gate-diversity term in train_step.
    # NOTE(review): 28 equals the number of unordered pairs in a batch of 8 --
    # presumably chosen for batch_size=8; confirm before training with other sizes.
    SAT_LOSS_DIVISOR = 28

    def __init__(self, curModel, vocab_size, is_h=False):
        super(MLModel, self).__init__()
        self.curModel = curModel
        self.is_h=is_h
        self.denseOutput = Dense(vocab_size, activation="softmax")

        # One-hot distribution on token id 0, added at padded positions in call().
        pad_val = np.zeros((1, 1, vocab_size))
        pad_val[:, :, 0] = 1
        self.pad_tensor = tf.constant(pad_val, dtype=tf.float32)
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")

    def call(self, inputs):
        """Return token distributions, plus ``sat_res`` when ``is_h`` is false."""
        outputs = None
        sat_res = None
        if self.is_h :
            outputs = self.denseOutput(self.curModel(inputs)['last_hidden_state'])
        else :
            outputs, sat_res = self.curModel(inputs)
            outputs = self.denseOutput(outputs)

        # pad_tensor has shape (1, 1, vocab) and broadcasts against the mask, so no
        # static batch/sequence sizes are needed to build the padding term.
        pad_positions = 1 - tf.expand_dims(inputs['attention_mask'], -1)
        outputs = outputs + self.pad_tensor * pad_positions
        if self.is_h == True :
            return outputs

        return outputs, sat_res

    def train_step(self, data) :
        """One optimisation step on ``(x, y)``; ``x`` must contain ``"sample_weights"``.

        The optimised loss is the summed compiled loss minus the pairwise L1 distance
        between the ``sat_res`` rows of the batch divided by ``SAT_LOSS_DIVISOR``, so
        larger distances between rows lower the loss.
        """
        x, y = data
        # Shallow copy so that removing the weights does not mutate the caller's mapping.
        x = dict(x)
        sample_weights = x.pop("sample_weights")

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss_sat = 0
            if self.is_h == False :
                y_pred, sat_res = y_pred
                # |row_i - row_j| summed over all ordered pairs counts every unordered
                # pair twice (the diagonal is zero), hence the division by 2.
                pairwise = tf.abs(tf.expand_dims(sat_res, 1) - tf.expand_dims(sat_res, 0))
                loss_sat = tf.reduce_sum(pairwise) / 2

            loss_mlm = self.compiled_loss(y, y_pred, sample_weight=sample_weights)
            loss = tf.reduce_sum(loss_mlm) - loss_sat / self.SAT_LOSS_DIVISOR

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update metrics (includes the metric that tracks the loss)
        self.compiled_metrics.update_state(y, y_pred)
        self.loss_tracker.update_state(loss_mlm, sample_weight=sample_weights)
        # Return a dict mapping metric names to current value
        return {"loss_mlm": self.loss_tracker.result(), "loss_sat": loss_sat}

    def predict(self, data) :
        """Run a forward pass with ``training=False`` and return only the token distributions."""
        x = data

        y_pred = self(x, training=False)
        if self.is_h == False :
            y_pred, sat_res = y_pred

        return y_pred

    @property
    def metrics(self):
        return [self.loss_tracker]


    def set_pretrained(self, model) :
        """Forward pretrained-weight loading to the wrapped encoder."""
        self.curModel.set_pretrained(model)

    def set_trainable_stat(self, flag) :
        """Set the ``trainable`` flag of the wrapped encoder."""
        self.curModel.trainable = flag

In [2022]:
#!g1.1
class BinaryClassificationTaskModel(Model) :
    """Sigmoid classifier applied to the hidden state at sequence position 0.

    Args:
        curModel: the wrapped encoder. With ``is_h`` true its output is indexed with
            ``['last_hidden_state']``; otherwise it must return a pair whose first
            element is the hidden states.
        is_h: selects which of the two encoder output conventions is used.
    """

    def __init__(self, curModel, is_h=False):
        super().__init__()
        self.curModel = curModel
        self.is_h = is_h
        self.denseOutput = Sequential()
        self.denseOutput.add(Dense(1, activation="sigmoid"))

    def call(self, inputs):
        """Return the sigmoid output computed from position 0 of the encoder output."""
        if self.is_h :
            hidden_states = self.curModel(inputs)['last_hidden_state']
        else :
            hidden_states, _ = self.curModel(inputs)
        first_position = hidden_states[:, 0, :]
        return self.denseOutput(first_position)

    def set_pretrained(self, model) :
        """Forward pretrained-weight loading to the wrapped encoder."""
        self.curModel.set_pretrained(model)

    def set_trainable_stat(self, flag) :
        """Set the ``trainable`` flag of the wrapped encoder."""
        self.curModel.trainable = flag

In [None]:
class ClassificationTaskModel(Model) :
    """Softmax classifier applied to the hidden state at sequence position 0.

    Args:
        curModel: the wrapped encoder. With ``is_h`` true its output is indexed with
            ``['last_hidden_state']``; otherwise it may return either the hidden
            states directly or a tuple/list whose first element is the hidden states.
        num_classes: size of the softmax output.
        is_h: selects which of the two encoder output conventions is used.
    """

    def __init__(self, curModel, num_classes, is_h=False):
        super(ClassificationTaskModel, self).__init__()
        self.curModel = curModel
        self.is_h=is_h
        self.denseOutput = Dense(num_classes, activation="softmax")

    def call(self, inputs):
        """Return class probabilities computed from position 0 of the encoder output."""
        if self.is_h :
            hidden_states = self.curModel(inputs)['last_hidden_state']
        else :
            hidden_states = self.curModel(inputs)
            # The encoders defined in this notebook return (hidden_states, sat_res);
            # a tuple cannot be sliced with [:, 0], so take its first element.
            if isinstance(hidden_states, (tuple, list)) :
                hidden_states = hidden_states[0]
        return self.denseOutput(hidden_states[:, 0])

    def set_pretrained(self, model) :
        """Forward pretrained-weight loading to the wrapped encoder."""
        self.curModel.set_pretrained(model)

    def set_trainable_stat(self, flag) :
        """Set the ``trainable`` flag of the wrapped encoder."""
        self.curModel.trainable = flag

In [None]:
class GraphAttention(Layer):
    """Single graph-attention head.

    Args:
        units: output feature size of the linear node transform.
        kernel_initializer: initializer for both weight matrices.
        kernel_regularizer: regularizer for both weight matrices.
    """

    def __init__(
        self,
        units,
        kernel_initializer="glorot_uniform",
        kernel_regularizer=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.units = units
        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
        self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)

    def build(self, input_shape):
        """Create the node-transform kernel and the attention kernel.

        ``input_shape[0]`` is the shape of the node-state input; its last dimension
        is the input feature size.
        """
        self.kernel = self.add_weight(
            shape=(input_shape[0][-1], self.units),
            trainable=True,
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            name="kernel",
        )
        self.kernel_attention = self.add_weight(
            shape=(self.units * 2, 1),
            trainable=True,
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            name="kernel_attention",
        )
        self.built = True

    def call(self, inputs):
        """Aggregate neighbour states with attention weights.

        ``inputs`` is ``(node_states, edges)``. ``edges[:, 0]`` is used as the
        segment (receiving node) id and ``edges[:, 1]`` as the neighbour whose state
        is aggregated. Returns one row per node of ``node_states``.
        """
        node_states, edges = inputs

        # Linearly transform node states
        node_states_transformed = tf.matmul(node_states, self.kernel)

        # (1) Compute pair-wise attention scores: the two endpoint states of each
        # edge are concatenated (via the reshape) and projected to a scalar.
        node_states_expanded = tf.gather(node_states_transformed, edges)
        node_states_expanded = tf.reshape(
            node_states_expanded, (tf.shape(edges)[0], -1)
        )
        attention_scores = tf.nn.leaky_relu(
            tf.matmul(node_states_expanded, self.kernel_attention)
        )
        attention_scores = tf.squeeze(attention_scores, -1)

        # (2) Normalize attention scores per receiving node. Scores are clipped
        # to [-2, 2] before exponentiation.
        attention_scores = tf.math.exp(tf.clip_by_value(attention_scores, -2, 2))
        attention_scores_sum = tf.math.unsorted_segment_sum(
            data=attention_scores,
            segment_ids=edges[:, 0],
            num_segments=tf.reduce_max(edges[:, 0]) + 1,
        )
        # Look up each edge's own segment total. Unlike repeating the totals by
        # per-node edge counts, this stays aligned with `edges` even when the edge
        # list is not sorted by its first column.
        attention_scores_sum = tf.gather(attention_scores_sum, edges[:, 0])
        attention_scores_norm = attention_scores / attention_scores_sum

        # (3) Gather node states of neighbors, apply attention scores and aggregate
        node_states_neighbors = tf.gather(node_states_transformed, edges[:, 1])
        out = tf.math.unsorted_segment_sum(
            data=node_states_neighbors * attention_scores_norm[:, tf.newaxis],
            segment_ids=edges[:, 0],
            num_segments=tf.shape(node_states)[0],
        )
        return out


class MultiHeadGraphAttention(Layer):
    """Runs several ``GraphAttention`` heads and merges their outputs.

    Args:
        units: feature size of each head.
        num_heads: number of independent heads.
        merge_type: ``"concat"`` concatenates head outputs on the last axis; any
            other value averages them.
    """

    def __init__(self, units, num_heads=8, merge_type="concat", **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.merge_type = merge_type
        self.attention_layers = [GraphAttention(units) for _ in range(num_heads)]

    def call(self, inputs):
        """Return the ReLU-activated merged head outputs for ``(node_features, edges)``."""
        atom_features, pair_indices = inputs

        # One output per head, all computed from the same node features and edges.
        head_outputs = []
        for head in self.attention_layers:
            head_outputs.append(head([atom_features, pair_indices]))

        if self.merge_type == "concat":
            merged = tf.concat(head_outputs, axis=-1)
        else:
            # Stack on a new trailing axis and average over it.
            merged = tf.reduce_mean(tf.stack(head_outputs, axis=-1), axis=-1)

        return tf.nn.relu(merged)

In [None]:
class GraphAttentionNetwork(Model):
    """Graph attention network over a fixed graph stored on the model.

    The node features and edge list are given at construction time; ``call``
    receives only the indices of the nodes whose outputs are wanted.

    Args:
        node_features: node feature matrix fed to the preprocessing layer.
        edges: edge list passed to every attention layer.
        hidden_units: per-head feature size.
        num_heads: heads per attention layer.
        num_layers: number of stacked attention layers.
        output_dim: size of the softmax output per node.
    """

    def __init__(
        self,
        node_features,
        edges,
        hidden_units,
        num_heads,
        num_layers,
        output_dim,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.node_features = node_features
        self.edges = edges
        self.preprocess = Dense(hidden_units * num_heads, activation="relu")
        self.attention_layers = [
            MultiHeadGraphAttention(hidden_units, num_heads) for _ in range(num_layers)
        ]
        self.output_layer = Dense(output_dim, activation="softmax")

    def call(self, inds):
        """Return the output rows for the nodes listed in ``inds``, in that order."""
        node_states = self.preprocess(self.node_features)
        for gat_layer in self.attention_layers:
            # Residual connection around every attention layer.
            node_states = gat_layer([node_states, self.edges]) + node_states
        new_fs = self.output_layer(node_states)

        # Each index selects a one-row slice; the slices are stacked along axis 0.
        selected_rows = []
        for i in inds:
            selected_rows.append(new_fs[i:i + 1])
        return tf.concat(selected_rows, 0)

In [None]:
class GATMoEBERT(Model):
    """``TopicBertBase`` whose mixture weights come from a graph attention network.

    The gating network is a ``GraphAttentionNetwork`` over ``node_features`` and
    ``edges`` followed by ``Dense(128, gelu)`` and ``Dense(num_topics, softmax)``;
    the remaining arguments are forwarded unchanged to ``TopicBertBase``.
    """

    def __init__(self, num_topics, topic_pos, num_layers, num_heads, embeddings_dim, ff_dim, max_len, vocab_size, node_features, edges, use_sat=True):
        super(GATMoEBERT, self).__init__()
        # Instantiate the container: `.add` must be called on a Sequential instance,
        # not on the Sequential class itself.
        self.graphNN = Sequential()
        self.graphNN.add(GraphAttentionNetwork(node_features, edges, 100, 8, 3, num_topics))
        self.graphNN.add(Dense(128, activation="gelu"))
        self.graphNN.add(Dense(num_topics, activation="softmax"))
        self.MoEBERT = TopicBertBase(self.graphNN, num_topics, topic_pos, num_layers, num_heads, embeddings_dim, ff_dim, max_len, vocab_size, use_sat)

    def call(self, inputs):
        """Delegate to the wrapped encoder and return its result as-is."""
        return self.MoEBERT(inputs)

    def set_pretrained(self, model) :
        """Forward pretrained-weight loading to the wrapped encoder."""
        self.MoEBERT.set_pretrained(model)

# ArXiv data extraction

In [808]:
#!g1.1
import json

In [201]:
#!g1.1
%pip install gdown --upgrade

Defaulting to user installation because normal site-packages is not writeable
^C
[31mERROR: Operation cancelled by user[0m


In [196]:
#!g1.1
!gdown https://drive.google.com/file/d/1BOi_w-5PWewCzZQdXxeBFKPrlTVB-D72/view?usp=sharing --fuzzy

/bin/sh: 10: gdown: not found


Exception: Process exited with code 127

In [200]:
#!g1.1
import tensorflow

In [199]:
#!g1.1
import os

from cloud_ml.storage.api import Storage

# To retrieve application id and secret:
# 1. Go to link: https://oauth.yandex.ru/client/new
# 2. Choose 'Web services'
# 3. Paste into 'Callback URI': https://oauth.yandex.ru/verification_code
# 4. Set up permissions on yandex disk
#
# Credentials are read from the environment so they are never stored in the
# notebook. Any id/secret pair that was previously committed in this cell should
# be treated as leaked and revoked.
disk = Storage.ya_disk(
    application_id=os.environ["YA_DISK_APPLICATION_ID"],
    application_secret=os.environ["YA_DISK_APPLICATION_SECRET"],
)

# downloading contents of the remote file into the local one
disk.get('path/to/file/within/ya/disk/file.txt', 'path/to/file.txt')

Open the following url to obtain confirmation code: https://oauth.yandex.ru/authorize?response_type=code&client_id=0f1d4d16f51b4bc7a19cb0a2401358f3&display=popup&force_confirm=yes
Enter the confirmation code:  


KeyboardInterrupt: Interrupted by user

In [None]:
!gdown https://drive.google.com/file/d/15md54ItLBygX6knRyG04Kq0tct182Q0v/view?usp=sharing --fuzzy

In [None]:
!gdown https://drive.google.com/file/d/1AJ-8Uiep6j54SQ__hBVd4fT0AqyOE5BH/view?usp=sharing --fuzzy

In [None]:
!gdown https://drive.google.com/file/d/1vIR3uXuIBnlyFDHFEKQnAlLQ9dY-MqXh/view?usp=sharing --fuzzy

In [None]:
!gdown https://drive.google.com/uc?id=1Vxn7HsQQ4s1Faby2J4sX1inUoEJXRdjt --fuzzy

In [None]:
!unzip /content/archive.zip

In [None]:
# Read the whole metadata snapshot into one string; it is split into lines in a later cell.
with open("arxiv-metadata-oai-snapshot.json", "r") as f :
    snapshot_text = f.read()
data = snapshot_text

In [None]:
data2 = data.split("\n")[:-1]

In [None]:
work = json.loads(data2[0])

In [None]:
work["versions"][0]['created'].split()[3]

In [None]:
# Group record indices by their exact "categories" string, keeping only records
# whose first version has a year field (4th token of 'created') of 2001 or later.
cats = {}
for i, record_line in enumerate(data2) :
    work = json.loads(record_line)
    first_version_year = int(work["versions"][0]['created'].split()[3])
    if first_version_year >= 2001 :
        cats.setdefault(work["categories"], []).append(i)

In [None]:
szs = [len(x) for x in list(cats.values())]

In [None]:
sorted(szs)[::-1][535]

In [None]:
# Keep only category strings that have at least 100 records; relvs holds the
# category strings and szs the matching lists of record indices, in the same order.
frequent_items = [(key, value) for key, value in cats.items() if len(value) >= 100]
relvs = [key for key, _ in frequent_items]
szs = [value for _, value in frequent_items]

In [None]:
import networkx as nx

In [None]:
G = nx.Graph()

In [None]:
relvsS = relvs

In [None]:
G.add_nodes_from(range(len(relvsS)))

In [None]:
import tqdm

In [None]:
# For every kept category build the set of integer author ids of its records.
# stoidx maps "author[0] author[1] author[2]" to a dense id, assigned on first sight.
subsS = []
stoidx = dict()
for i in tqdm.tqdm(relvsS) :
    author_ids = set()
    subsS.append(author_ids)
    for j in cats[i] :
        work = json.loads(data2[j])
        for author in work["authors_parsed"] :
            fullName = " ".join((author[0], author[1], author[2]))
            # setdefault evaluates len(stoidx) before inserting, so a new name gets the next free id.
            author_ids.add(stoidx.setdefault(fullName, len(stoidx)))

In [None]:
adjMatr = np.zeros((len(relvsS), len(relvsS)))
def count_jakkard(v, u, cats, data2) :
    """Jaccard similarity between the author-id sets ``subsS[v]`` and ``subsS[u]``.

    ``cats`` and ``data2`` are not used; they are kept so existing calls keep working.
    Returns 0.0 when both sets are empty instead of dividing by zero.
    """
    union_size = len(subsS[v] | subsS[u])
    if union_size == 0 :
        return 0.0
    return float(len(subsS[v] & subsS[u])) / float(union_size)

# Fill the upper triangle of adjMatr with pairwise similarities and connect
# categories whose similarity exceeds 0.01.
for i in tqdm.tqdm(range(len(relvsS))) :
    for j in range(i + 1, len(relvsS)) :
        res = count_jakkard(i, j, cats, data2)
        if res > 0.01 :
            G.add_edge(i, j)
        adjMatr[i][j] = res

In [None]:
import json
subrMatr = json.loads(open("subr_matr.json", "r").read())

In [None]:
import networkx as nx
G = nx.Graph()
G.add_nodes_from(range(len(subrMatr)))

In [None]:
# NOTE(review): `val` is not defined in any visible cell; on a fresh kernel this
# cell would raise NameError -- presumably leftover debugging, confirm and remove.
val

In [None]:
import tqdm
# Similarity cut-off actually applied when adding edges (the comparison previously
# used a literal 0.02 while this constant was set to 0.01 and never read).
THRESHOLD = 0.02

num_nodes = len(subrMatr)
# Only pairs with i < j are visited, so the bar's total is the number of such pairs.
pbar = tqdm.tqdm(total=num_nodes * (num_nodes - 1) // 2)
for i in range(num_nodes) :
    for j in range(i + 1, num_nodes) :
        if float(subrMatr[i][j]) > THRESHOLD :
            G.add_edge(i, j)

        pbar.update(1)
pbar.close()

In [None]:
# Sweep 50 thresholds (0.001 ... 0.050) and record the number of connected
# components of the graph built at each one. The sweep uses its own graph and
# threshold variables so it does not overwrite the main graph `G` or `THRESHOLD`,
# and the inner node loops no longer reuse the sweep's loop variable.
cnt = []
for step in tqdm.tqdm(range(50)) :
    sweep_threshold = 0.001 + 0.001 * step
    sweep_graph = nx.Graph()
    sweep_graph.add_nodes_from(range(len(subrMatr)))
    for i in range(len(subrMatr)) :
        for j in range(i + 1, len(subrMatr)) :
            if float(subrMatr[i][j]) > sweep_threshold :
                sweep_graph.add_edge(i, j)

    cnt.append(nx.number_connected_components(sweep_graph))

In [None]:
trsh = [0.001 + 0.001 * i for i in range(50)]

In [None]:
# Plot the number of connected components (cnt) against the edge threshold (trsh).
# NOTE(review): `plt` is not imported in any visible cell -- presumably
# matplotlib.pyplot; confirm an import exists before this cell runs.
plt.figure(figsize=(10, 10))

plt.plot(trsh, cnt)
plt.ylabel("Components")
plt.xlabel("threshold")

In [None]:
nx.write_adjlist(G, path="graph.adjlist")

In [None]:
len(subrMatr)

In [None]:
len(G.edges)

In [None]:
nx.number_connected_components(G)

In [None]:
nx.write_graphml(G, "graph2.graphml")

In [None]:
!git clone https://github.com/phanein/deepwalk

In [None]:
!pip install -r /content/deepwalk/requirements.txt

In [None]:
%cd deepwalk

In [None]:
!python3 ./setup.py install

In [None]:
!deepwalk --input ../graph.adjlist --number-walks 80 --walk-length 80 --window-size 10 --output ../reddit_0.02.embeddings

In [None]:
embData = adjMatr

In [None]:
# Parse the embeddings file: the first line is "<rows> <dims>", every following
# line is "<node id> <dims floats>". Row `node id` of embData receives the vector.
embData = []
with open("/content/reddit_0.02.embeddings", "r") as f :
    n_text, m_text = f.readline().split()
    n, m = int(n_text), int(m_text)
    embData = np.zeros((n, m))
    for _ in range(n) :
        nums = f.readline().split()
        v = int(nums[0])
        embData[v] = [float(nums[1 + j]) for j in range(m)]

In [None]:
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

In [None]:
import random

In [None]:
def extract_arxiv_data(data2, relvsS, embData, cats) :
    """Collect (abstract, category embedding) pairs and return them shuffled.

    Args:
        data2: sequence of JSON strings, one record per element.
        relvsS: sequence of category keys; position ``i`` selects ``embData[i]``.
        embData: per-category vectors, indexed by position in ``relvsS``.
        cats: mapping from category key to the indices of its records in ``data2``.

    Returns:
        ``(X, metaX)``: two equally long tuples of abstracts and vectors in the
        same shuffled order. Both are empty tuples when no record is selected.
    """
    print("Extracting data")
    pairs = []
    for i in tqdm.tqdm(range(len(relvsS))) :
        for workInd in cats[relvsS[i]] :
            work = json.loads(data2[workInd])
            pairs.append((work["abstract"], embData[i]))

    # zip(*[]) yields nothing to unpack, so handle the empty case explicitly.
    if not pairs :
        return (), ()

    random.shuffle(pairs)
    X, metaX = zip(*pairs)

    return X, metaX

In [None]:
subrs = json.loads(open("proposed_subrs.json", "r").read())

In [None]:
data2 = pd.read_csv("reddit_dump_tiny.csv")

In [None]:
def extract_reddit_data(data2, subrs, embData) :
    """Return the comment bodies and, per row, the embedding of its subreddit.

    Args:
        data2: table with ``"body"`` and ``"subreddit_id"`` columns.
        subrs: sequence of subreddit ids; the position of an id is its row in ``embData``.
        embData: per-subreddit vectors, indexed by position in ``subrs``.

    Returns:
        ``(X, metaX)``: the ``"body"`` column and the ``"subreddit_id"`` column
        mapped to the matching ``embData`` row.
    """
    # Position lookup; if an id is repeated in subrs the last position wins.
    subr_dict = {subr_id: position for position, subr_id in enumerate(subrs)}
    print("Preprocessing reddit data")

    def embedding_for(subreddit_id) :
        return embData[subr_dict[subreddit_id]]

    bodies = data2["body"]
    subreddit_vectors = data2["subreddit_id"].map(embedding_for)
    return bodies, subreddit_vectors

In [None]:
def prepare_arxiv_data(X, metaX, max_len=512) :
    """Tokenize texts and attach their meta vectors.

    Args:
        X: sequence of texts passed to the module-level ``tokenizer``.
        metaX: sequence of equally long vectors, one per text.
        max_len: truncation length and padding multiple.

    Returns:
        ``(encoded_input, y)``: the tokenizer output with a float32
        ``"attention_mask"``, a NumPy ``"input_ids"`` array and a ``"meta_info"``
        array of shape ``(len(X), len(metaX[0]))``; ``y`` is a float64 copy of the
        unmodified ``"input_ids"``.
    """
    encoded_input = tokenizer(X, padding=True, pad_to_multiple_of=max_len, return_tensors="tf", truncation=True, max_length=max_len)
    encoded_input["attention_mask"] = tf.cast(encoded_input["attention_mask"], dtype="float32")
    encoded_input["input_ids"] = encoded_input["input_ids"].numpy()
    encoded_input["meta_info"] = np.zeros((len(X), len(metaX[0])))
    print("Post-processing")
    for i in tqdm.tqdm(range(len(X))) :
        encoded_input["meta_info"][i] = metaX[i]

    # astype returns a new array, so later in-place edits of input_ids do not change y.
    y = encoded_input["input_ids"].astype(np.float64)

    return encoded_input, y

In [None]:
def encode_input(input_str, meta_info=None, do_mask=True) :
    """Tokenize one string into a model input dict, optionally masking its tail.

    Args:
        input_str: the text to encode with the module-level ``tokenizer``.
        meta_info: value stored under ``"meta_info"``; defaults to a fresh
            ``np.zeros((1, 64))`` on every call.
        do_mask: when true, every position from the last attended token onwards is
            replaced by the tokenizer's mask token and the attention mask is set to
            all ones.

    Returns:
        The plain ``dict`` held by the tokenizer output (its ``.data``).
    """
    if meta_info is None :
        # Built per call so callers never share (and cannot corrupt) one default array.
        meta_info = np.zeros((1, 64))

    encoded_input = tokenizer([input_str], padding=True, pad_to_multiple_of=512, return_tensors="tf", truncation=True, max_length=512)
    encoded_input["attention_mask"] = tf.cast(encoded_input["attention_mask"], dtype="float32")
    encoded_input["input_ids"] = encoded_input["input_ids"].numpy()
    encoded_input["meta_info"] = meta_info
    if do_mask == True :
        attended_len = int(tf.reduce_sum(encoded_input["attention_mask"]))
        encoded_input["input_ids"][:, attended_len - 1:] = tokenizer.mask_token_id
        # Assign through the mapping interface: setting an attribute on the
        # tokenizer output would not change the dict returned below.
        encoded_input["attention_mask"] = tf.ones_like(encoded_input["attention_mask"])

    return encoded_input.data

In [None]:
Xdata, metaX = extract_arxiv_data(data2, relvsS, embData, cats)

In [None]:
l = 1000000
r = 1000100
X, y = prepare_arxiv_data(Xdata[l:r], metaX[l:r])

In [None]:
# Training slice [l, r) of the prepared inputs and labels.
l = 0
r = 100000
y_train = y[l:r]
X_train = {
    "attention_mask": tf.cast(X["attention_mask"][l:r], dtype="float32"),
    "input_ids": X["input_ids"][l:r],
    "token_type_ids": X["token_type_ids"][l:r],
    "meta_info": X["meta_info"][l:r],
}

In [None]:
# Test set: every prepared example, each field passed through tf.identity.
y_test = y[:]
X_test = {
    "attention_mask": tf.cast(tf.identity(X["attention_mask"][:]), dtype="float32"),
    "input_ids": tf.identity(X["input_ids"][:]),
    "token_type_ids": tf.identity(X["token_type_ids"][:]),
    "meta_info": tf.identity(X["meta_info"][:]),
}

In [None]:
X_test["input_ids"] = X_test["input_ids"].numpy()

In [None]:
def mask_text(X, max_len=512) :
    """Apply random token masking to ``X["input_ids"]`` in place and add sample weights.

    For every row, attended positions are selected with probability 0.15 and get
    weight 1 in ``X["sample_weights"]``. Special-token positions are then dropped
    from the selection; 80% of the remaining selected positions are replaced by the
    mask token, and 10% of those are overwritten with a random id in
    ``[104, tokenizer.vocab_size)``.

    Args:
        X: mapping with a mutable NumPy ``"input_ids"`` array and an
            ``"attention_mask"`` whose rows support ``.numpy()``.
        max_len: row length of ``"input_ids"``.

    Returns:
        The same mapping ``X``, modified in place.
    """
    X["sample_weights"] = np.zeros_like(X["input_ids"])
    # Loop-invariant: the tokenizer's special ids do not change between rows.
    special_ids = list(tokenizer.all_special_ids)
    for i in range(len(X["attention_mask"])) :
        inp_mask = (np.random.rand(max_len) < 0.15) & X["attention_mask"][i].numpy().astype(bool)
        # NOTE(review): weights are set before special tokens are excluded below, so
        # selected special-token positions keep weight 1 -- confirm this is intended.
        X["sample_weights"][i][inp_mask] = 1
        inp_mask[np.isin(X["input_ids"][i], special_ids)] = False
        inp_mask2 = inp_mask & (np.random.rand(max_len) < 0.80)
        X["input_ids"][i][inp_mask2] = tokenizer.mask_token_id
        random_token_mask = inp_mask2 & (np.random.rand(max_len) < 1 / 10)

        X["input_ids"][i][random_token_mask] = np.random.randint(104, tokenizer.vocab_size, int(np.sum(random_token_mask)))
    return X

In [None]:
import copy

In [None]:
X_train = mask_text(X_train)

In [None]:
# copy.copy is shallow: the copied dict would still share the input_ids array with
# X_test, and mask_text edits that array in place. Give the masked copy its own array.
X_test_m = copy.copy(X_test)
X_test_m["input_ids"] = np.copy(X_test["input_ids"])
X_test_m = mask_text(X_test_m)

In [None]:
# Single-example batch (row 0) including meta_info.
e_test = {
    "attention_mask": tf.cast(X["attention_mask"][0:1], dtype="float32"),
    "input_ids": X["input_ids"][0:1],
    "token_type_ids": X["token_type_ids"][0:1],
    "meta_info": X["meta_info"][0:1],
}

In [None]:
# Single-example batch (row 0) without meta_info.
e_test2 = {
    "attention_mask": tf.cast(X["attention_mask"][0:1], dtype="float32"),
    "input_ids": X["input_ids"][0:1],
    "token_type_ids": X["token_type_ids"][0:1],
}

In [None]:
tf.random.set_seed(42)

In [1106]:
model2 = MLModel(GraphMoEBERT(12, 2, 4, 4, 256, 256 * 4, 512, tokenizer.vocab_size), tokenizer.vocab_size)

NameError: name 'GraphMoEBERT' is not defined

In [1044]:
model = MLModel(encoder, tokenizer.vocab_size, is_h=True)

In [None]:
model3 = ClassificationTaskModel(model2.curModel, len(embData))
model3.compile(tf.keras.optimizers.Adam(learning_rate=2e-5), loss=tf.keras.losses.SparseCategoricalCrossentropy())

In [None]:
model.set_trainable_stat(False)

In [1045]:
y_pred = model(e_test2)
y_pred

<tf.Tensor: shape=(1, 512, 30522), dtype=float32, numpy=
array([[[3.3546999e-05, 3.0846986e-05, 3.2323336e-05, ...,
         3.5555688e-05, 3.9568637e-05, 3.1528245e-05],
        [3.6782716e-05, 3.0768839e-05, 3.8683294e-05, ...,
         3.7187947e-05, 3.1537496e-05, 3.1305201e-05],
        [3.2484884e-05, 3.5742290e-05, 3.2092008e-05, ...,
         3.9296880e-05, 3.1018637e-05, 2.9901610e-05],
        ...,
        [1.0000310e+00, 3.0645322e-05, 3.3079534e-05, ...,
         3.5044017e-05, 3.7424950e-05, 2.9608618e-05],
        [1.0000317e+00, 2.9540355e-05, 3.4866945e-05, ...,
         3.7372785e-05, 3.5249239e-05, 2.8714383e-05],
        [1.0000316e+00, 2.6373915e-05, 3.3662720e-05, ...,
         3.7593363e-05, 4.3962958e-05, 3.2137945e-05]]], dtype=float32)>

In [None]:
y_pred = model2.predict(e_test)
y_pred

In [None]:
model.load_weights("bert_weights_arxiv_1550k.hdf5")

In [None]:
model2.set_pretrained(model.curModel)

In [None]:
for i in range(len(model2.curModel.MoEBERT.bertLayers)) :
    model2.curModel.MoEBERT.bertLayers[i].trainable=False

In [None]:
model2.denseOutput.set_weights(model.denseOutput.weights)

In [758]:
def perplexity(model, x, y, sep_token_id=102, mask_token_id=103) :
    """Masked-LM pseudo-perplexity of the first sequence of a batch.

    The sequence length is taken from the position of the first
    `sep_token_id` found in `y[0]`. Every position from 1 up to and including
    that separator position is, one at a time, overwritten with
    `mask_token_id` in `x["input_ids"][0]`; the model is re-run and the
    cross-entropy of the true label at that position is accumulated. The
    original id is put back after every prediction, even if `predict` raises.

    Args:
        model: object exposing `predict(x)`; the result is indexed as
            `y_pred[0, k]` and used as the prediction for position `k`
            (presumably per-token class probabilities -- confirm against the model).
        x: dict of model inputs. `x["input_ids"]` must support in-place item
            assignment (e.g. a NumPy array); it is modified temporarily.
        y: label ids, indexed as `y[0][i]` / `y[0, k]`.
        sep_token_id: id that terminates the sequence in `y` (default 102).
        mask_token_id: id written into the input at the scored position (default 103).

    Returns:
        `tf.exp` of the mean per-position loss.

    Raises:
        ValueError: if no separator is found after position 0, i.e. there is
            nothing to score (previously this returned 1.0 or divided by zero).
    """
    labels = y[0]
    curLen = 0
    for i in range(len(labels)) :
        if labels[i] == sep_token_id :
            curLen = i + 1
            break
    if curLen < 2 :
        raise ValueError("perplexity: no separator token found after position 0 in y[0]")

    ls = 0
    for k in tqdm.tqdm(range(1, curLen)) :
        prev_id = x["input_ids"][0][k]
        x["input_ids"][0][k] = mask_token_id
        try :
            y_pred = model.predict(x)
        finally :
            # Always undo the temporary mask so the caller's array is left intact.
            x["input_ids"][0][k] = prev_id

        ls += tf.keras.losses.sparse_categorical_crossentropy(y[0, k], y_pred[0, k])

    return tf.exp(ls / (curLen - 1))

In [None]:
model2.compile(tf.keras.optimizers.Adam(learning_rate=2e-5), loss=tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE))

In [None]:
model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5), loss=tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE))

In [None]:
model.set_trainable_stat(True)

In [None]:
tb_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs/MoEBERT_T")

In [None]:
X_train_cls = copy.copy(X_train)
X_train_cls.pop("sample_weights")

In [None]:
# For every training row, record the index of the first embData row that is
# element-wise equal to its meta vector; rows without a match add nothing.
y_train_cls = []
for meta_row in tqdm.tqdm(X_train_cls["meta_info"]) :
    hit = next((idx for idx in range(len(embData)) if np.all(embData[idx] == meta_row)), None)
    if hit is not None :
        y_train_cls.append(hit)

In [None]:
y_train_cls = np.array(y_train_cls)

In [None]:
y_train_cls_s = np.copy(y_train_cls)

In [None]:
rng = np.random.default_rng()

In [None]:
rng.shuffle(y_train_cls_s)
for i in tqdm.tqdm(range(len(X_train_cls["meta_info"]))) :
    X_train_cls["meta_info"][i] = embData[y_train_cls_s[i]]

In [None]:
model3.set_trainable_stat(True)

In [None]:
model3.fit(X_train_cls, y_train_cls_s, batch_size=8, epochs=1)

In [None]:
model.save_weights("bert_weights_arxiv_1550k.hdf5")

In [None]:
# Alternate one MLM step (model2) and one classification step (model3) on the
# same batch of 8 rows, reporting the mean of the most recent losses
# (at most 32 of them).
loss1 = []
loss2 = []
for i in tqdm.tqdm(range(2500, 3750)) :
    rows = slice(i * 8, (i + 1) * 8)

    e_test = {
        "attention_mask": tf.cast(X_train["attention_mask"][rows], dtype="float32"),
        "input_ids": X_train["input_ids"][rows],
        "token_type_ids": X_train["token_type_ids"][rows],
        "meta_info": X_train["meta_info"][rows],
        "sample_weights": X_train["sample_weights"][rows],
    }
    batch_y_train = y_train[rows]

    # Classification batch: same inputs, but with the meta vectors taken from
    # X_train_cls and without sample weights.
    e_test_cls = {key: value for key, value in e_test.items() if key != "sample_weights"}
    e_test_cls["meta_info"] = X_train_cls["meta_info"][rows]
    batch_y_train_cls = y_train_cls_s[rows]

    loss1.append(model2.train_on_batch(e_test, batch_y_train))
    loss2.append(model3.train_on_batch(e_test_cls, batch_y_train_cls))

    lb = min(32, len(loss1))

    print("mlm loss:", np.mean(loss1[-lb:]), "cls loss:", np.mean(loss2[-lb:]))

In [None]:
model2.curModel.set_weights(model3.curModel.weights)

In [None]:
# Copy the shared encoder weights from the MLM model into the classifier.
# `set_weights` only takes the list of weights; the stray `steps_per_epoch`
# keyword (a `fit` argument) made this call fail.
model3.curModel.set_weights(model2.curModel.weights)

In [None]:
model2.fit(X_train, y_train, batch_size=8, epochs=1)

In [None]:
import copy
X_train_bert = copy.copy(X_train)
X_train_bert.pop("meta_info")

In [None]:
model.fit(X_train_bert, y_train, batch_size=8, epochs=1)

In [None]:
model2.save_weights("moebert_weights_arxiv_1650k_v4.hdf5")

In [None]:
import copy
X_test_bert = copy.copy(X_test_m)
X_test_bert.pop("meta_info")

In [None]:
def evaluate_model(model, x, y, sz) :
    """Average weighted cross-entropy of `model` over the first `sz` samples.

    Each sample is predicted on its own; its loss is the sample-weighted sum
    of per-token cross-entropies divided by the sum of the weights (0 when
    all weights are zero). The per-sample losses are then averaged.

    The previous version pushed every sample through one stateful
    `tf.keras.metrics.Mean`, whose call returns the running mean over all
    samples seen so far, and then averaged those running means. For the 0/1
    weights produced by `mask_text` the per-sample value computed here is the
    one that was intended.

    Args:
        model: object exposing `predict(inputs_dict)`.
        x: dict with "attention_mask", "input_ids", "token_type_ids",
            "sample_weights" and optionally "meta_info", all sliceable by row.
        y: label ids, sliceable by row.
        sz: number of leading samples to evaluate.

    Returns:
        Scalar tensor: mean of the per-sample losses.
    """
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    loss = 0
    for i in tqdm.tqdm(range(sz)) :
        e_test = {}
        e_test["attention_mask"] = tf.cast(x["attention_mask"][i:i+1], dtype="float32")
        e_test["input_ids"] = x["input_ids"][i:i+1]
        e_test["token_type_ids"] = x["token_type_ids"][i:i+1]
        if "meta_info" in x :
            e_test["meta_info"] = x["meta_info"][i:i+1]

        y_pred = model.predict(e_test)

        weights = tf.cast(x["sample_weights"][i:i+1], dtype="float32")
        token_loss = loss_fn(y[i:i+1], y_pred, sample_weight=weights)
        # divide_no_nan yields 0 for a sample without any weighted token.
        loss += tf.math.divide_no_nan(tf.reduce_sum(token_loss), tf.reduce_sum(weights))

    return loss / sz

In [None]:
evaluate_model(model2, X_test_m, y_test, 100)

In [None]:
tf.keras.backend.clear_session()

In [None]:
e_test = encode_input("this paper", tf.expand_dims(embData[63], 0))

In [None]:
i = 0
e_test = {}
Xinp = X_train
yinp = y_train
e_test["attention_mask"] = Xinp["attention_mask"][i:i+1]
e_test["input_ids"] = Xinp["input_ids"][i:i+1] #np.copy(yinp[i:i+1]).astype(np.int32)
e_test["token_type_ids"] = Xinp["token_type_ids"][i:i+1]
e_test["meta_info"] = Xinp["meta_info"][i:i+1]
e_test["attention_mask"] = tf.cast(e_test["attention_mask"], dtype="float32")
e_test_y = yinp[i:i+1]

In [None]:
e_test2 = copy.copy(e_test)
e_test2.pop("meta_info")

In [None]:
metaInd = 1
print(relvsS[metaInd])
e_test["meta_info"] = tf.expand_dims(embData[metaInd], 0)

In [None]:
model2(e_test)[1]

In [None]:
y_pred = model2.predict(e_test)

In [None]:
y_pred = model.predict(e_test2)

In [None]:
tf.keras.losses.SparseCategoricalCrossentropy()(e_test_y, y_pred).numpy()

In [None]:
for i in range(100) :
    e_test["meta_info"] = tf.expand_dims(embData[i], 0)
    print(tf.keras.losses.SparseCategoricalCrossentropy()(e_test_y, model2.predict(e_test)).numpy())

In [None]:
perplexity(model2, e_test, e_test_y)

In [None]:
tokenizer.decode(np.squeeze(np.argmax(y_pred, axis=-1)))

In [None]:
tokenizer.decode(np.squeeze(np.argmax(y_pred, axis=-1)))

In [None]:
tokenizer.decode(np.squeeze(e_test["input_ids"]))

In [None]:
tokenizer.decode(e_test_y[0])

In [None]:
"We propose a new method in simple"

In [None]:
for j in range(0, 100) :
    curStr = "We propose a new method in star"
    print(j, relvsS[j])
    for i in range(7, 20) :
        e_test = encode_input(curStr, tf.expand_dims(embData[j], 0))
        y_pred = model2.predict(e_test)
        curStr = tokenizer.decode(np.squeeze(np.argmax(y_pred, axis=-1))[1:i + 2])
    print(curStr)

# Reddit processing

In [2023]:
#!g1.1
# Load the embedding table. The header line holds "<rows> <dims>"; every
# following line holds a 1-based row id followed by <dims> floats.
with open("reddit_0.02.embeddings", "r") as f :
    n, m = (int(tok) for tok in f.readline().split())
    embData = np.zeros((n, m))
    for _ in range(n) :
        nums = f.readline().split()
        v = int(nums[0]) - 1  # ids in the file are 1-based
        embData[v] = [float(nums[1 + j]) for j in range(m)]

In [2024]:
#!g1.1
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

In [2025]:
#!g1.1
import random
import tqdm
import json

In [2026]:
#!g1.1
# Read the subreddit id list; the first JSON entry is dropped by the [1:] slice.
# The context manager closes the file handle, which the one-liner leaked.
with open("proposed_subrs.json", "r") as subrs_file :
    subrs = json.load(subrs_file)[1:]
len(subrs)

4412

In [2027]:
#!g1.1
data2 = pd.read_csv("2001/reddit_dump_tiny.csv")

In [2028]:
#!g1.1
def extract_reddit_data(data2, subrs, embData) :
    """Select comments whose subreddit is known and pair them with embeddings.

    Args:
        data2: DataFrame with "body" and "subreddit_id" columns.
        subrs: sequence of subreddit ids; position `k` corresponds to row `k`
            of `embData`.
        embData: indexable collection of embedding rows.

    Returns:
        (X, metaX): list of comment bodies for rows whose subreddit id occurs
        in `subrs`, and an array holding the matching `embData` row for each
        of those comments.
    """
    subr_dict = {subr_id: row for row, subr_id in enumerate(subrs)}
    print("Preprocessing reddit data")

    subr_column = data2["subreddit_id"]
    metaExists = subr_column.map(lambda sid: sid in subr_dict).values

    kept_bodies = data2["body"].iloc[metaExists]
    kept_ids = subr_column.iloc[metaExists]

    X = list(kept_bodies.values)
    metaX = kept_ids.map(lambda sid: embData[subr_dict[sid]]).values
    return X, metaX

In [2029]:
#!g1.1
def prepare_data(X, metaX, max_len=512) :
    """Tokenize texts and build model inputs plus label ids.

    Uses the notebook-level `tokenizer`. Texts are truncated to `max_len`
    and padded with `pad_to_multiple_of=max_len`.

    Args:
        X: list of input strings.
        metaX: indexable collection with one meta vector per entry of `X`.
        max_len: maximum / padded token length.

    Returns:
        (encoded_input, y): the tokenizer output with a float32
        "attention_mask", NumPy "input_ids" and a float64 "meta_info" array
        stacked from the first `len(X)` entries of `metaX`; and `y`, an
        independent float64 copy of the input ids used as labels.
    """
    encoded_input = tokenizer(X, padding=True, pad_to_multiple_of=max_len, return_tensors="tf", truncation=True, max_length=max_len)
    encoded_input["attention_mask"] = tf.cast(encoded_input["attention_mask"], dtype="float32")
    encoded_input["input_ids"] = encoded_input["input_ids"].numpy()
    print("Post-processing")
    # Stack the meta vectors in one go instead of copying them row by row.
    encoded_input["meta_info"] = np.array([metaX[i] for i in range(len(X))], dtype=np.float64)
    # astype() copies, so later in-place edits of the input ids leave y untouched.
    y = encoded_input["input_ids"].astype(np.float64)

    return encoded_input, y

In [2030]:
#!g1.1
def encode_input(input_str, meta_info = np.zeros((1, 64)), do_mask=True) :
    """Encode one string into the input dict expected by the models.

    Uses the notebook-level `tokenizer`; the text is truncated and padded
    to 512 tokens.

    Args:
        input_str: text to encode.
        meta_info: meta vector stored under "meta_info" as-is. The default
            array is shared between calls, so do not modify it in place.
        do_mask: when true, every position from the last attended token
            (index = number of attended tokens - 1; presumably the [SEP]
            slot -- confirm) to the end is overwritten with the mask token
            and the attention mask is set to all ones.

    Returns:
        Plain dict with "input_ids" (NumPy), "token_type_ids",
        "attention_mask" (float32) and "meta_info".
    """
    encoded_input = tokenizer([input_str], padding=True, pad_to_multiple_of=512, return_tensors="tf", truncation=True, max_length=512)
    encoded_input["attention_mask"] = tf.cast(encoded_input["attention_mask"], dtype="float32")
    encoded_input["input_ids"] = encoded_input["input_ids"].numpy()
    encoded_input["meta_info"] = meta_info
    if do_mask :
        first_masked = int(tf.reduce_sum(encoded_input["attention_mask"])) - 1
        encoded_input["input_ids"][:, first_masked:] = tokenizer.mask_token_id
        # Assign through item access: attribute assignment on the tokenizer
        # output only creates an instance attribute and never reaches `.data`,
        # so the all-ones mask used to be dropped from the returned dict.
        encoded_input["attention_mask"] = tf.ones_like(encoded_input["attention_mask"])

    return encoded_input.data

In [2031]:
#!g1.1
data2 = data2[["body", "subreddit_id"]].dropna()

In [2032]:
#!g1.1
Xdata, metaX = extract_reddit_data(data2, subrs, embData)

Preprocessing reddit data


In [2033]:
#!g1.1
len(Xdata)

868710

In [2034]:
#!g1.1
l = 000000
r = 100
X, y = prepare_data(Xdata[l:r], metaX[l:r])

100%|██████████| 100/100 [00:00<00:00, 122892.00it/s]

Post-processing





In [2035]:
#!g1.1
l = 0
r = 100000
X_train = {}
y_train = y[l:r]
X_train["attention_mask"] = X["attention_mask"][l:r]
X_train["input_ids"] = X["input_ids"][l:r]
X_train["token_type_ids"] = X["token_type_ids"][l:r]
X_train["meta_info"] = X["meta_info"][l:r]
X_train["attention_mask"] = tf.cast(X_train["attention_mask"], dtype="float32")

In [1940]:
#!g1.1
X_test = {}
y_test = y[:]
X_test["attention_mask"] = tf.identity(X["attention_mask"][:])
X_test["input_ids"] = tf.identity(X["input_ids"][:])
X_test["token_type_ids"] = tf.identity(X["token_type_ids"][:])
X_test["meta_info"] = tf.identity(X["meta_info"][:])
X_test["attention_mask"] = tf.cast(X_test["attention_mask"], dtype="float32")

In [1941]:
#!g1.1
X_test["input_ids"] = X_test["input_ids"].numpy()

In [2036]:
#!g1.1
def mask_text(X, max_len=512) :
    """Apply random MLM masking to `X["input_ids"]` and add "sample_weights".

    Per row: each attended, non-special token is selected with probability
    0.15 and gets sample weight 1. Of the selected tokens, 80% are replaced
    by the mask token; 10% of those replaced positions are then overwritten
    with a random id drawn from [104, vocab_size).

    Changes from the previous version:
    * The ids are masked on a private copy. Before, the array was edited in
      place, so arrays sharing memory with it (NumPy slices of the source
      batch, shallow dict copies) were silently masked as well.
    * Special tokens are excluded before the sample weights are set, so
      positions that are never masked no longer receive weight 1.

    Args:
        X: dict with "input_ids" (array-like, `max_len` columns) and
            "attention_mask" (rows must expose `.numpy()`). The dict itself is
            updated: "input_ids" is rebound to the masked copy and
            "sample_weights" is added.
        max_len: number of token positions per row.

    Returns:
        The same dict `X`.
    """
    X["input_ids"] = np.array(X["input_ids"])
    X["sample_weights"] = np.zeros_like(X["input_ids"])
    special_ids = list(tokenizer.all_special_ids)
    for i in range(len(X["attention_mask"])) :
        inp_mask = (np.random.rand(max_len) < 0.15) & X["attention_mask"][i].numpy().astype(bool)
        # Never select [CLS]/[SEP]/[PAD]/... for prediction.
        inp_mask &= ~np.isin(X["input_ids"][i], special_ids)
        X["sample_weights"][i][inp_mask] = 1

        inp_mask2 = inp_mask & (np.random.rand(max_len) < 0.80)
        X["input_ids"][i][inp_mask2] = tokenizer.mask_token_id

        random_token_mask = inp_mask2 & (np.random.rand(max_len) < 1 / 10)
        X["input_ids"][i][random_token_mask] = np.random.randint(104, tokenizer.vocab_size, int(np.sum(random_token_mask)))
    return X

In [2037]:
#!g1.1
import copy

In [2006]:
#!g1.1
X_train = mask_text(X_train)



In [1942]:
#!g1.1
# copy.copy() only copies the dict, so the NumPy "input_ids" array would be
# shared with X_test and masked in place. Give the masked set its own array.
X_test_m = copy.copy(X_test)
X_test_m["input_ids"] = np.copy(X_test["input_ids"])
X_test_m = mask_text(X_test_m)

In [2038]:
#!g1.1
e_test = {}
e_test["attention_mask"] = X["attention_mask"][0:1]
e_test["input_ids"] = X["input_ids"][0:1]
e_test["token_type_ids"] = X["token_type_ids"][0:1]
e_test["meta_info"] = X["meta_info"][0:1]
e_test["attention_mask"] = tf.cast(e_test["attention_mask"], dtype="float32")

In [2039]:
#!g1.1
e_test2 = {}
e_test2["attention_mask"] = X["attention_mask"][0:1]
e_test2["input_ids"] = X["input_ids"][0:1]
e_test2["token_type_ids"] = X["token_type_ids"][0:1]
e_test2["attention_mask"] = tf.cast(e_test2["attention_mask"], dtype="float32")

In [2040]:
#!g1.1
tf.random.set_seed(42)

In [2041]:
#!g1.1
model2 = MLModel(GraphMoEBERT(4, 2, 4, 4, 256, 256 * 4, 512, tokenizer.vocab_size), tokenizer.vocab_size)



In [2042]:
#!g1.1
model = MLModel(encoder, tokenizer.vocab_size, is_h=True)



In [None]:
model.set_trainable_stat(False)

In [2043]:
#!g1.1
y_pred = model(e_test2)
y_pred

<tf.Tensor: shape=(1, 512, 30522), dtype=float32, numpy=
array([[[3.4868066e-05, 3.3568816e-05, 3.4127141e-05, ...,
         3.7567948e-05, 3.3197080e-05, 2.7782582e-05],
        [3.6004571e-05, 3.2312168e-05, 2.9362760e-05, ...,
         3.8603932e-05, 3.3711192e-05, 2.8358889e-05],
        [3.4523102e-05, 3.3608780e-05, 3.3707955e-05, ...,
         4.0261770e-05, 3.2801290e-05, 3.1159710e-05],
        ...,
        [1.0000329e+00, 3.5555397e-05, 3.1964504e-05, ...,
         3.2998643e-05, 3.4561584e-05, 2.5751426e-05],
        [1.0000328e+00, 3.5415105e-05, 3.1839867e-05, ...,
         3.2620399e-05, 3.3585318e-05, 2.5916288e-05],
        [1.0000348e+00, 3.3478045e-05, 3.2927779e-05, ...,
         3.6631329e-05, 3.3802266e-05, 2.6661508e-05]]], dtype=float32)>



In [2044]:
#!g1.1
y_pred = model2.predict(e_test)
y_pred

<tf.Tensor: shape=(1, 512, 30522), dtype=float32, numpy=
array([[[2.8291848e-05, 3.5563466e-05, 3.4831824e-05, ...,
         3.7278267e-05, 3.6776040e-05, 2.6583304e-05],
        [3.0053679e-05, 3.6438047e-05, 3.4744386e-05, ...,
         3.4995610e-05, 3.1913602e-05, 3.3607728e-05],
        [2.9408602e-05, 3.9903214e-05, 3.3586810e-05, ...,
         3.8392103e-05, 3.3976168e-05, 3.1063104e-05],
        ...,
        [1.0000328e+00, 3.2763252e-05, 3.2763252e-05, ...,
         3.2763252e-05, 3.2763252e-05, 3.2763252e-05],
        [1.0000328e+00, 3.2763252e-05, 3.2763252e-05, ...,
         3.2763252e-05, 3.2763252e-05, 3.2763252e-05],
        [1.0000328e+00, 3.2763252e-05, 3.2763252e-05, ...,
         3.2763252e-05, 3.2763252e-05, 3.2763252e-05]]], dtype=float32)>

In [2045]:
#!g1.1
model.load_weights("bert_weights_reddit_3200k_v3.hdf5")

In [1946]:
#!g1.1
model2.load_weights("moebert_2200k_4_topics.hdf5")



In [1980]:
#!g1.1
model.save_weights("bert_weights_reddit_3200k_v3.hdf5")



In [635]:
#!g1.1
from numpy import array

In [None]:
#!g1.1
model2.set_pretrained(model.curModel)

In [1893]:
#!g1.1
for i in range(len(model2.curModel.MoEBERT.bertLayers)) :
    model2.curModel.MoEBERT.bertLayers[i].trainable=False

In [1216]:
#!g1.1
model2.curModel.MoEBERT.bertLayers[-1].trainable=True

In [None]:
#!g1.1
model2.denseOutput.set_weights(model.denseOutput.weights)

In [2046]:
#!g1.1
def perplexity(model, x, y, sep_token_id=102, mask_token_id=103) :
    """Masked-LM pseudo-perplexity of the first sequence of a batch.

    The sequence length is taken from the position of the first
    `sep_token_id` found in `y[0]`. Every position from 1 up to and including
    that separator position is, one at a time, overwritten with
    `mask_token_id` in `x["input_ids"][0]`; the model is re-run and the
    cross-entropy of the true label at that position is accumulated. The
    original id is put back after every prediction, even if `predict` raises.

    Args:
        model: object exposing `predict(x)`; the result is indexed as
            `y_pred[0, k]` and used as the prediction for position `k`
            (presumably per-token class probabilities -- confirm against the model).
        x: dict of model inputs. `x["input_ids"]` must support in-place item
            assignment (e.g. a NumPy array); it is modified temporarily.
        y: label ids, indexed as `y[0][i]` / `y[0, k]`.
        sep_token_id: id that terminates the sequence in `y` (default 102).
        mask_token_id: id written into the input at the scored position (default 103).

    Returns:
        `tf.exp` of the mean per-position loss.

    Raises:
        ValueError: if no separator is found after position 0, i.e. there is
            nothing to score (previously this returned 1.0 or divided by zero).
    """
    labels = y[0]
    curLen = 0
    for i in range(len(labels)) :
        if labels[i] == sep_token_id :
            curLen = i + 1
            break
    if curLen < 2 :
        raise ValueError("perplexity: no separator token found after position 0 in y[0]")

    ls = 0
    for k in tqdm.tqdm(range(1, curLen)) :
        prev_id = x["input_ids"][0][k]
        x["input_ids"][0][k] = mask_token_id
        try :
            y_pred = model.predict(x)
        finally :
            # Always undo the temporary mask so the caller's array is left intact.
            x["input_ids"][0][k] = prev_id

        ls += tf.keras.losses.sparse_categorical_crossentropy(y[0, k], y_pred[0, k])

    return tf.exp(ls / (curLen - 1))

In [1894]:
#!g1.1
model2.compile(tf.keras.optimizers.Adam(learning_rate=1e-5), loss=tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE))

In [2047]:
#!g1.1
model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5), loss=tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE))

In [None]:
model.set_trainable_stat(True)

In [None]:
tb_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs/MoEBERT_T")

In [1792]:
#!g1.1
rng = np.random.default_rng()
rng.shuffle(X_train["meta_info"], axis=0)

In [1924]:
#!g1.1
model2.fit(X_train, y_train, batch_size=8, epochs=1)



<keras.callbacks.History at 0x7ff0eb318a30>



In [2012]:
#!g1.1
import copy
X_train_bert = copy.copy(X_train)
X_train_bert.pop("meta_info")

array([[ 5.6652473e-03,  5.4641550e-03, -5.8550980e-03, ...,
        -8.2636974e-04, -3.7490644e-03, -1.5158061e-03],
       [-6.1945156e-03,  7.4199940e-03,  2.8067748e-03, ...,
        -7.0403786e-03, -1.2712122e-03, -1.6435687e-03],
       [-1.2236797e+00,  1.4707482e+00, -4.3001652e-01, ...,
         8.7342850e-01, -1.0077835e+00,  6.6953330e-02],
       ...,
       [ 4.2996164e-03,  7.1728574e-03,  1.7592588e-03, ...,
         8.0083957e-04,  1.2772186e-03, -2.3365791e-03],
       [ 1.1990578e+00, -9.9521357e-01,  1.0145837e+00, ...,
         2.0887601e-01,  1.0673581e-01,  7.8992110e-01],
       [-3.6683101e-03,  6.6304870e-03, -8.8916125e-04, ...,
        -1.8951952e-03,  4.6053124e-03,  5.7640290e-03]])

In [2014]:
#!g1.1
model.fit(X_train_bert, y_train, batch_size=8, epochs=1)

NameError: name 'model' is not defined

In [None]:
#!g1.1
# Train the plain BERT baseline (`model`) on five consecutive chunks of
# 100000 comments each: tokenize the chunk, mask it, drop the meta vectors
# and fit for one epoch.
for i in range(5):
    # Chunk boundaries into Xdata / metaX.
    l = 100000 * i
    r = l + 100000
    X, y = prepare_data(Xdata[l:r], metaX[l:r])
    X_train = {}
    y_train = y[:]
    X_train["attention_mask"] = X["attention_mask"][:]
    X_train["input_ids"] = X["input_ids"][:]
    X_train["token_type_ids"] = X["token_type_ids"][:]
    X_train["meta_info"] = X["meta_info"][:]
    X_train["attention_mask"] = tf.cast(X_train["attention_mask"], dtype="float32")
    # Adds "sample_weights" and replaces a random subset of the input ids.
    X_train = mask_text(X_train)
    # Shallow copy without "meta_info" for the baseline model.
    X_train_bert = copy.copy(X_train)
    X_train_bert.pop("meta_info")
    
    model.fit(X_train_bert, y_train, batch_size=8, epochs=1)

 22%|██▏       | 21797/100000 [00:00<00:00, 217963.45it/s]

Post-processing


100%|██████████| 100000/100000 [00:00<00:00, 222442.23it/s]


In [2003]:
#!g1.1
np.sum(X_train["attention_mask"]) / 100000

36.41952

In [1927]:
#!g1.1
model2.save_weights("moebert_2200k_12_topics_new.hdf5")



In [1949]:
#!g1.1
import copy
X_test_bert = copy.copy(X_test_m)
X_test_bert.pop("meta_info")

<tf.Tensor: shape=(10000, 64), dtype=float64, numpy=
array([[-3.4292516e-01, -1.8143463e+00, -4.4616110e-01, ...,
        -1.0859780e+00, -2.3399197e-01, -2.4376360e+00],
       [-5.6006855e-01, -4.0428200e-01,  2.6992640e-01, ...,
        -1.4965070e+00, -6.9368047e-01,  4.6438120e-01],
       [-8.5244070e-02, -2.0868374e-01,  3.1962147e-01, ...,
        -8.6283666e-01, -1.7619240e-01, -4.5285925e-01],
       ...,
       [-1.1718458e-04, -4.7975294e-03,  5.0611416e-04, ...,
         6.3879870e-03,  8.3671673e-04,  2.8885321e-03],
       [ 3.8091276e-02, -5.0505024e-01,  4.2351648e-01, ...,
        -3.6511976e-01, -2.3805277e-02,  1.5277122e+00],
       [ 9.3376076e-01,  9.8851970e-01,  4.5015827e-01, ...,
         1.0953324e+00,  5.1993750e-01,  2.9502120e+00]])>

In [1700]:
#!g1.1
def evaluate_model(model, x, y, sz) :
    """Average weighted cross-entropy of `model` over the first `sz` samples.

    Each sample is predicted on its own; its loss is the sample-weighted sum
    of per-token cross-entropies divided by the sum of the weights (0 when
    all weights are zero). The per-sample losses are then averaged.

    The previous version pushed every sample through one stateful
    `tf.keras.metrics.Mean`, whose call returns the running mean over all
    samples seen so far, and then averaged those running means. For the 0/1
    weights produced by `mask_text` the per-sample value computed here is the
    one that was intended.

    Args:
        model: object exposing `predict(inputs_dict)`.
        x: dict with "attention_mask", "input_ids", "token_type_ids",
            "sample_weights" and optionally "meta_info", all sliceable by row.
        y: label ids, sliceable by row.
        sz: number of leading samples to evaluate.

    Returns:
        Scalar tensor: mean of the per-sample losses.
    """
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    loss = 0
    for i in tqdm.tqdm(range(sz)) :
        e_test = {}
        e_test["attention_mask"] = tf.cast(x["attention_mask"][i:i+1], dtype="float32")
        e_test["input_ids"] = x["input_ids"][i:i+1]
        e_test["token_type_ids"] = x["token_type_ids"][i:i+1]
        if "meta_info" in x :
            e_test["meta_info"] = x["meta_info"][i:i+1]

        y_pred = model.predict(e_test)

        weights = tf.cast(x["sample_weights"][i:i+1], dtype="float32")
        token_loss = loss_fn(y[i:i+1], y_pred, sample_weight=weights)
        # divide_no_nan yields 0 for a sample without any weighted token.
        loss += tf.math.divide_no_nan(tf.reduce_sum(token_loss), tf.reduce_sum(weights))

    return loss / sz

def evaluate_perplexity(model, x, y, sz, min_tokens=100) :
    """Mean `perplexity` over the sufficiently long samples among the first `sz`.

    Samples whose attention mask sums to fewer than `min_tokens` are skipped
    and do not count towards the average.

    Args:
        model: model handed to `perplexity`.
        x: dict with "attention_mask", "input_ids", "token_type_ids" and
            optionally "meta_info", all sliceable by row.
        y: label ids, sliceable by row.
        sz: number of leading samples to consider.
        min_tokens: minimum number of attended tokens for a sample to be
            scored (default 100, the previously hard-coded threshold).

    Returns:
        Sum of the per-sample perplexities divided by the number of scored samples.

    Raises:
        ValueError: if no sample reaches `min_tokens` (previously a
            ZeroDivisionError).
    """
    ppl = 0
    evaluated = 0
    for i in tqdm.tqdm(range(sz)) :
        e_test = {}
        e_test["attention_mask"] = tf.cast(x["attention_mask"][i:i+1], dtype="float32")
        e_test["input_ids"] = x["input_ids"][i:i+1]
        e_test["token_type_ids"] = x["token_type_ids"][i:i+1]
        if "meta_info" in x :
            e_test["meta_info"] = x["meta_info"][i:i+1]
        if np.sum(e_test["attention_mask"]) < min_tokens :
            continue
        ppl += perplexity(model, e_test, y[i:i+1])
        evaluated += 1

    if evaluated == 0 :
        raise ValueError("evaluate_perplexity: no sample has at least %d attended tokens" % min_tokens)
    return ppl / evaluated

In [None]:
#!g1.1
evaluate_perplexity(model2, X_test, y_test, 100)

In [None]:
#!g1.1
evaluate_model(model, X_test_bert, y_test, 100)

In [None]:
tf.keras.backend.clear_session()

In [None]:
e_test = encode_input("this paper", tf.expand_dims(embData[63], 0))

In [1867]:
#!g1.1
i = 0
e_test = {}
Xinp = X_test
yinp = y_test
e_test["attention_mask"] = Xinp["attention_mask"][i:i+1]
e_test["input_ids"] = np.copy(yinp[i:i+1]).astype(np.int32)# Xinp["input_ids"][i:i+1] #np.copy(yinp[i:i+1]).astype(np.int32)#
e_test["token_type_ids"] = Xinp["token_type_ids"][i:i+1]
e_test["meta_info"] = Xinp["meta_info"][i:i+1]
e_test["attention_mask"] = tf.cast(e_test["attention_mask"], dtype="float32")
e_test_y = yinp[i:i+1]
tokenizer.decode(np.squeeze(e_test["input_ids"]))

"[CLS] i agree, you have your personal responsibility to be prepared and to not get offended when you're legitimately called out. i am not here to hold your hand. read up on your class, the tactics. every instance has a guide. if you want story do storymode, in master mode with the increase in difficulty and added mechanics you're there with a different goal. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [1873]:
#!g1.1
e_test2 = copy.copy(e_test)
e_test2.pop("meta_info")

<tf.Tensor: shape=(1, 64), dtype=float64, numpy=
array([[-0.00777813,  0.0071466 , -0.00050397, -0.00368676,  0.00344212,
        -0.00148632, -0.0050067 ,  0.00142168,  0.00196621,  0.00148539,
        -0.00403896, -0.00528146,  0.00213338,  0.00769987, -0.00671866,
        -0.0046249 , -0.00664762,  0.00512588, -0.00221587, -0.00375469,
         0.00751253, -0.00462278, -0.00084058,  0.00748602, -0.0015436 ,
        -0.0015507 , -0.00686841, -0.00129243, -0.0068223 ,  0.00515729,
        -0.00690322, -0.00274108,  0.00157256,  0.00236793, -0.00521422,
        -0.00026875,  0.00722196, -0.00140778,  0.00504701, -0.00044573,
        -0.0044426 , -0.00130919, -0.00593262, -0.00469751,  0.00041233,
        -0.00359952,  0.00666726, -0.00623629, -0.00094124, -0.00418846,
         0.00522944,  0.00684066,  0.00297584, -0.00105041,  0.00070968,
         0.00535298,  0.00227766, -0.00506398,  0.0041305 ,  0.00460835,
         0.0015106 ,  0.00425638,  0.00059343,  0.00622853]])>

In [1331]:
#!g1.1
metaInd = 4
print(subrs[metaInd])
e_test["meta_info"] = tf.expand_dims(embData[metaInd], 0)

t5_10e2b7


In [1868]:
#!g1.1
model2(e_test)[1]

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0.27748647, 0.29121837, 0.2512723 , 0.18002288]], dtype=float32)>



In [1869]:
#!g1.1
y_pred = model2.predict(e_test)



In [1870]:
#!g1.1
tf.keras.losses.SparseCategoricalCrossentropy()(e_test_y, y_pred).numpy()

0.69961023



In [1874]:
#!g1.1
y_pred = model.predict(e_test2)



In [1875]:
#!g1.1
tf.keras.losses.SparseCategoricalCrossentropy()(e_test_y, y_pred).numpy()

0.7121982

In [None]:
#!g1.1
for i in range(100) :
    e_test["meta_info"] = tf.expand_dims(embData[i], 0)
    print(tf.keras.losses.SparseCategoricalCrossentropy()(e_test_y, model2.predict(e_test)).numpy())

In [None]:
#!g1.1
perplexity(model2, e_test, e_test_y)

In [None]:
#!g1.1
perplexity(model, e_test2, e_test_y)

In [280]:
#!g1.1
tokenizer.decode(np.squeeze(np.argmax(y_pred, axis=-1)))

'[CLS] holy shit, i had no idea. dooku played the conner in ww2. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [None]:
#!g1.1
tokenizer.decode(np.squeeze(np.argmax(y_pred, axis=-1)))

In [None]:
#!g1.1
tokenizer.decode(np.squeeze(e_test["input_ids"]))

In [None]:
#!g1.1
tokenizer.decode(e_test_y[0])

In [None]:
"We propose a new method in simple"

In [1883]:
#!g1.1
# Greedy left-to-right generation for the first 20 subreddit embeddings.
# NOTE(review): `matcher` is only loaded in the SARC section further down,
# so this cell depends on that cell having been run first.
for j in range(0, 20) :
    # Start from an empty prompt for every subreddit.
    curStr = ""
    print(j, subrs[j])
    if subrs[j] in matcher :
        print(matcher[subrs[j]])
    for i in range(1, 10) :
        # Re-encode the current text (encode_input masks everything from the
        # last attended position on), predict, and keep the arg-max tokens
        # at positions 1 .. i+1 as the new text.
        e_test = encode_input(curStr, tf.expand_dims(embData[j], 0))
        y_pred = model2.predict(e_test)
        curStr = tokenizer.decode(np.squeeze(np.argmax(y_pred, axis=-1))[1:i + 2])
    print(curStr)

0 t5_2qh1i
AskReddit
white white white white white white white white white white
1 t5_3j2jr
CasualUK
white white white white white white white white white white
2 t5_2tycb
OldSchoolCool
white white white white white white white white white white
3 t5_38qlm
UberEATS
white white white white white white white white white white
4 t5_2qt55
gifs
white white white white white white white white white white
5 t5_ac643
FortniteCompetitive
white white white white white white white white white white
6 t5_2qmeb
pokemon
white white white white white white white white white white
7 t5_2rluz
hardstyle
white white white white white white white white white white
8 t5_2qjpg
memes
white white white white white white white white white white
9 t5_2qh3v
bestof
white white white white white white white white white white
10 t5_2qm9d
CFB
white white white white white white white white white white
11 t5_2v08j
NSFWIAMA
white white white white white white white white white white
12 t5_2sgp1
pcmasterrace
white whit



In [None]:
#!g1.1


# Train SARC models


In [384]:
#!g1.1
%bzip2 -d comments.json.bz2

UsageError: Line magic function `%bzip2` not found.


UsageError: Line magic function `%bzip2` not found.

In [1339]:
#!g1.1
import bz2
# Parse the compressed comment dump directly; the context manager closes the
# bz2 handle, which the previous one-liner left open.
with bz2.open("comments.json.bz2", "r") as sarc_file :
    sarcData = json.load(sarc_file)

In [1340]:
#!g1.1
# Load the matcher mapping and close the file afterwards (the handle was leaked before).
with open("reddit_matcher.json", "r") as matcher_file :
    matcher = json.load(matcher_file)

In [1341]:
#!g1.1
# Inverse of `matcher`: value -> key (for duplicate values the last key wins).
matcherr = {value: key for key, value in matcher.items()}

In [1353]:
#!g1.1
# Read all training rows and close the file afterwards (the handle was leaked before).
with open("train-balanced.csv", "r") as train_file :
    lines = train_file.readlines()

In [1354]:
#!g1.1
# Subreddit id -> position in `subrs` (for duplicate ids the last position wins).
subr_dict = {subr_id: position for position, subr_id in enumerate(subrs)}

In [1355]:
#!g1.1
# Build the SARC dataset: comment text, subreddit embedding and score for
# every listed comment whose subreddit has an entry in `matcherr`.
sarcX, sarcMeta, sarcY = [], [], []
for line in lines :
    # Each row has three '|'-separated fields; the first one is unused here.
    _, comm_field, score_field = line.split(sep='|')
    comms = comm_field.split()
    scores = score_field.split()
    for i, comm_id in enumerate(comms) :
        comm = sarcData[comm_id]
        subreddit = comm['subreddit']
        if subreddit not in matcherr :
            continue

        sarcX.append(comm["text"])
        sarcMeta.append(embData[subr_dict[matcherr[subreddit]]])
        sarcY.append(float(scores[i]))

In [1356]:
#!g1.1
def prepare_sarc_data(X, metaX, max_len=512) :
    """Tokenize SARC comments and attach their meta vectors.

    Uses the notebook-level `tokenizer`. Texts are truncated to `max_len`
    and padded with `pad_to_multiple_of=max_len`.

    Args:
        X: list of input strings.
        metaX: indexable collection with one meta vector per entry of `X`.
        max_len: maximum / padded token length.

    Returns:
        The tokenizer output with a float32 "attention_mask", NumPy
        "input_ids" and a "meta_info" array of shape (len(X), len(metaX[0])).
    """
    batch = tokenizer(X, padding=True, pad_to_multiple_of=max_len, return_tensors="tf", truncation=True, max_length=max_len)
    batch["attention_mask"] = tf.cast(batch["attention_mask"], dtype="float32")
    batch["input_ids"] = batch["input_ids"].numpy()

    n_rows = len(X)
    meta = np.zeros((n_rows, len(metaX[0])))
    batch["meta_info"] = meta

    print("Post-processing")
    for row in tqdm.tqdm(range(n_rows)) :
        # `meta` is the very array stored in the batch, so this fills it in place.
        meta[row] = metaX[row]

    return batch

In [1447]:
#!g1.1
l = 100000
r = 200000

In [1448]:
#!g1.1
sarcX_train_tmp = prepare_sarc_data(sarcX[l:r], sarcMeta[l:r])
sarcY_train = np.array(sarcY[l:r])

Post-processing


100%|██████████| 100000/100000 [00:00<00:00, 939460.06it/s]


In [1359]:
#!g1.1
sarcX_test = prepare_sarc_data(sarcX[-1000:], sarcMeta[-1000:])
sarcY_test = np.array(sarcY[-1000:])

Post-processing


100%|██████████| 1000/1000 [00:00<00:00, 609725.83it/s]


In [1360]:
#!g1.1
l = 0
r = 1000
sarcX_testt = {}
sarcX_testt["attention_mask"] = sarcX_test["attention_mask"][l:r]
sarcX_testt["input_ids"] = sarcX_test["input_ids"][l:r]
sarcX_testt["token_type_ids"] = sarcX_test["token_type_ids"][l:r]
sarcX_testt["meta_info"] = sarcX_test["meta_info"][l:r]
sarcX_testt["attention_mask"] = tf.cast(sarcX_testt["attention_mask"], dtype="float32")

In [1449]:
#!g1.1
l = 0
r = 100000
sarcX_train = {}
sarcX_train["attention_mask"] = sarcX_train_tmp["attention_mask"][l:r]
sarcX_train["input_ids"] = sarcX_train_tmp["input_ids"][l:r]
sarcX_train["token_type_ids"] = sarcX_train_tmp["token_type_ids"][l:r]
sarcX_train["meta_info"] = sarcX_train_tmp["meta_info"][l:r]
sarcX_train["attention_mask"] = tf.cast(sarcX_train["attention_mask"], dtype="float32")

In [1429]:
#!g1.1
model5 = BinaryClassificationTaskModel(model2.curModel)



In [1435]:
#!g1.1
model5.compile(tf.keras.optimizers.Adam(learning_rate=1e-5), loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])



In [1428]:
#!g1.1
model5.curModel.MoEBERT.bertLayers[0].trainable

True



In [1421]:
#!g1.1
for i in range(len(model5.curModel.MoEBERT.bertLayers)) :
    model5.curModel.MoEBERT.bertLayers[i].trainable = True



In [1451]:
#!g1.1
model5.fit(sarcX_train, sarcY_train, batch_size=8, epochs=1)



<keras.callbacks.History at 0x7f2d64014850>



In [1452]:
#!g1.1
model5.evaluate(sarcX_testt, sarcY_test, batch_size=8, steps=125)

NameError: name 'model5' is not defined

In [None]:
#!g1.1
l = 0
r = 1000
sarcX_testt = {}
sarcX_testt["attention_mask"] = sarcX_test["attention_mask"][l:r]
sarcX_testt["input_ids"] = sarcX_test["input_ids"][l:r]
sarcX_testt["token_type_ids"] = sarcX_test["token_type_ids"][l:r]
sarcX_testt["meta_info"] = sarcX_test["meta_info"][l:r]
sarcX_testt["attention_mask"] = tf.cast(sarcX_testt["attention_mask"], dtype="float32")

In [512]:
#!g1.1
preds = model5.predict(sarcX_testt)



In [1407]:
#!g1.1
model5.save_weights("model5_weights_v3_3.hdf5")



In [1414]:
#!g1.1
model5.load_weights("model5_weights_v3_3.hdf5")



In [None]:
#!g1.1
model5.evaluate(sarcX_train, sarcY_train, batch_size=8)

In [None]:
#!g1.1
