In [1]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer
     

In [6]:
from transformers import BertTokenizer,BertConfig, BertForSequenceClassification, AdamW

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Processing Labelled data

In [8]:
labeled_data = pd.read_csv('/content/drive/MyDrive/Q&A dataset labelled.csv')

In [9]:
labeled_data.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,subtopic extraction,Unnamed: 5,poc,Unnamed: 7,Unnamed: 8
0,keywords,label,Question,Answer,topic,subtopic,topic,subtopic,URL source
1,,ethics & reg,What is the goal of the Paris Agreement?,The goal of the Paris Agreement is to limit th...,decarbonization,emissions' 'carbon',decarbonization,Environmental Sustainability' 'Climate Policy',
2,,strategy,What is the role of AFOLU sector in achieving...,The Paris Agreement anchors the central and un...,decarbonization,emissions' 'carbon' 'renewables',decarbonization,Environmental Sustainability' 'Climate Policy'...,
3,,analysis,How much can AFOLU contribute to a 15C pathway...,Recent global assessments show that AFOLU can ...,decarbonization,emissions',decarbonization,Environmental Sustainability',
4,,Strategy,What are some potential tradeoffs when aiming ...,A contribution in the upper echelons of identi...,decarbonization,externalities' 'ecology' 'externalities' 'sust...,decarbonization,Behavioral Externalities' 'Eco-diversity' 'Sus...,


In [10]:
new_columns = {col: val for col, val in zip(labeled_data.columns, labeled_data.iloc[0])}

In [11]:
labeled_data = labeled_data.rename(columns=new_columns)

In [12]:
labeled_data.head()

Unnamed: 0,keywords,label,Question,Answer,topic,subtopic,topic.1,subtopic.1,URL source
0,keywords,label,Question,Answer,topic,subtopic,topic,subtopic,URL source
1,,ethics & reg,What is the goal of the Paris Agreement?,The goal of the Paris Agreement is to limit th...,decarbonization,emissions' 'carbon',decarbonization,Environmental Sustainability' 'Climate Policy',
2,,strategy,What is the role of AFOLU sector in achieving...,The Paris Agreement anchors the central and un...,decarbonization,emissions' 'carbon' 'renewables',decarbonization,Environmental Sustainability' 'Climate Policy'...,
3,,analysis,How much can AFOLU contribute to a 15C pathway...,Recent global assessments show that AFOLU can ...,decarbonization,emissions',decarbonization,Environmental Sustainability',
4,,Strategy,What are some potential tradeoffs when aiming ...,A contribution in the upper echelons of identi...,decarbonization,externalities' 'ecology' 'externalities' 'sust...,decarbonization,Behavioral Externalities' 'Eco-diversity' 'Sus...,


In [13]:
labeled_data = labeled_data[['Question', 'Answer', 'label']].copy()

In [14]:
labeled_data = labeled_data.drop(0)

In [15]:
labeled_data.head()

Unnamed: 0,Question,Answer,label
1,What is the goal of the Paris Agreement?,The goal of the Paris Agreement is to limit th...,ethics & reg
2,What is the role of AFOLU sector in achieving...,The Paris Agreement anchors the central and un...,strategy
3,How much can AFOLU contribute to a 15C pathway...,Recent global assessments show that AFOLU can ...,analysis
4,What are some potential tradeoffs when aiming ...,A contribution in the upper echelons of identi...,Strategy
5,What kinds of changes are needed from the AFOL...,In order for anthropogenic emissions versus re...,Strategy


In [16]:
print(labeled_data.isnull().sum())

Question     0
Answer       0
label       23
dtype: int64


In [17]:
labeled_data = labeled_data.dropna()

In [18]:
labeled_data['label'].unique()

array(['ethics & reg', 'strategy', 'analysis', 'Strategy', 'science&tech',
       'factual', 'management', 'taxonomies', 'strategic analysis', 'str',
       'Ethics & reg', 'ethics & reg, analysis', 'strategy, factual ',
       'ethic and reg', 'taxonomy', 'taxonomy, strategy',
       'strategy / ethics and reg', 'strategy/management', 'strategies',
       'factual, analysis ', 'factual, ', 'factual/science & tech',
       'analysis, strategy', 'factual / analysis', 'Analysis',
       'analysis | science & tech', 'Factual'], dtype=object)

Removing duplicate labels

In [19]:
labeled_data['label'] = labeled_data['label'].str.lower()

In [20]:
conditions = [    labeled_data['label'].str.contains('^ethic'),
    labeled_data['label'].str.contains('^taxonom'),
    labeled_data['label'].str.contains('^str'),
    labeled_data['label'].str.contains('^science'),
    labeled_data['label'].str.contains('^analy'),
    labeled_data['label'].str.contains('^fact')
]

In [21]:
outputs = [    'ethics and regularisation',    'taxonomy',    'strategy and management',    'science & technology',    'analysis',    'factual']

In [22]:
labeled_data['label'] = np.select(conditions, outputs, default=labeled_data['label'])

In [23]:
labeled_data['label'] = np.where(labeled_data['label'] == 'management', 'strategy and management', labeled_data['label'])

In [24]:
labeled_data['label'].unique()

array(['ethics and regularisation', 'strategy and management', 'analysis',
       'science & technology', 'factual', 'taxonomy'], dtype=object)

Processing Unlabelled data

In [25]:
unlabeled_data = pd.read_csv('/content/drive/MyDrive/Q&A dataset unlabelled.csv')

In [26]:
unlabeled_data = unlabeled_data[['Question', 'Answer']].copy()

In [27]:
unlabeled_data.head()

Unnamed: 0,Question,Answer
0,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...
1,How was the fertilization effect on phytoplank...,The fertilization effect on phytoplankton was ...
2,How do rising temperatures affect Alpine lakes?,Rising temperatures increase mineral weatherin...
3,How has an increase in phytoplankton biomass b...,Significant increase in phytoplankton biomass ...
4,How do higher metabolic rates of organisms and...,Higher metabolic rates of organisms and longer...


Data Preparation

In [28]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [29]:
tokenized = tokenizer.encode_plus(
    labeled_data['Question'].iloc[0], 
    labeled_data['Answer'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [30]:
tokenized.input_ids
     

<tf.Tensor: shape=(1, 256), dtype=int32, numpy=
array([[  101,  1327,  1110,  1103,  2273,  1104,  1103,  2123, 11225,
          136,   102,  1109,  2273,  1104,  1103,  2123, 11225,  1110,
         1106,  5310,  1103,  3606,  1104,  4265,  4143,  1106,  1218,
         2071,   123,  1658,  1105,  6799,  3268,  1106,  5310,  1122,
         1106,  1405,  1658,  1107,  1546,  1106,  3843,   178, 11604,
        24582, 12947,  2416,  1118, 22904,  8167,  4184, 17960,  4265,
        14110,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [31]:
X_input_ids = np.zeros((len(labeled_data), 256))
X_attn_masks = np.zeros((len(labeled_data), 256))

In [32]:
def generate_training_data(df, ids, masks, tokenizer):
    for i, (question, answer) in tqdm(enumerate(zip(labeled_data['Question'], labeled_data['Answer']))):
        tokenized_text = tokenizer.encode_plus(
            question,
            answer,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks


In [33]:
X_input_ids, X_attn_masks = generate_training_data(labeled_data, X_input_ids, X_attn_masks, tokenizer)
     

0it [00:00, ?it/s]

In [34]:
label_map = {
    'ethics and regularisation': 0,
    'strategy and management': 1,
    'analysis': 2,
    'science & technology': 3,
    'factual': 4,
    'taxonomy': 5
}

In [35]:
labels = np.zeros((len(labeled_data), 6))
labels[np.arange(len(labeled_data)), labeled_data['label'].map(label_map).values] = 1 # one-hot encoded labels

In [36]:
labels

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [37]:
# creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1)

<_TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(6,), dtype=tf.float64, name=None))>

In [38]:
def QADatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [39]:
dataset = dataset.map(QADatasetMapFunction) # converting to required format for tensorflow dataset 
     

In [40]:
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(6,), dtype=tf.float64, name=None))>

In [41]:
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor

In [42]:
from transformers import TFBertModel
     

In [43]:
model = TFBertModel.from_pretrained('bert-base-cased') # bert base model with pretrained weights

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [44]:
# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(6, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

qa_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
qa_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [45]:
optim = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [46]:
qa_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [47]:
hist = qa_model.fit(
    dataset, epochs=5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


unlabelled data

In [48]:
X_unlabeled_input_ids = np.zeros((len(unlabeled_data), 256))
X_unlabeled_attn_masks = np.zeros((len(unlabeled_data), 256))

In [49]:
X_unlabeled_input_ids, X_unlabeled_attn_masks = generate_training_data(unlabeled_data, X_unlabeled_input_ids, X_unlabeled_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [50]:
predicted_labels = qa_model.predict({'input_ids': X_unlabeled_input_ids, 'attention_mask': X_unlabeled_attn_masks})
predicted_labels = np.argmax(predicted_labels, axis=1)
one_hot_predicted_labels = np.zeros((len(unlabeled_data), 6))
one_hot_predicted_labels[np.arange(len(unlabeled_data)), predicted_labels] = 1



In [51]:
X_input_ids_combined = np.concatenate([X_input_ids, X_unlabeled_input_ids])
X_attn_masks_combined = np.concatenate([X_attn_masks, X_unlabeled_attn_masks])
labels_combined = np.concatenate([labels, one_hot_predicted_labels])

In [52]:
dataset_combined = tf.data.Dataset.from_tensor_slices((X_input_ids_combined, X_attn_masks_combined, labels_combined))
dataset_combined = dataset_combined.map(QADatasetMapFunction)
dataset_combined = dataset_combined.shuffle(10000).batch(16, drop_remainder=True)

In [53]:
hist = qa_model.fit(
    dataset_combined, epochs=5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# use the trained model to predict labels for combined data

In [70]:
data = {'Question': unlabeled_data['Question'],
        'Answer': unlabeled_data['Answer'],
        'Predicted Label': predicted_labels}

In [71]:
df = pd.DataFrame(data)

In [72]:
label_map_reverse = {v: k for k, v in label_map.items()}
df['Predicted Label'] = df['Predicted Label'].map(label_map_reverse)

In [73]:
df.head()

Unnamed: 0,Question,Answer,Predicted Label
0,What were the sources of atmospheric nutrients...,The primary sources of atmospheric nutrients t...,ethics and regularisation
1,How was the fertilization effect on phytoplank...,The fertilization effect on phytoplankton was ...,strategy and management
2,How do rising temperatures affect Alpine lakes?,Rising temperatures increase mineral weatherin...,analysis
3,How has an increase in phytoplankton biomass b...,Significant increase in phytoplankton biomass ...,strategy and management
4,How do higher metabolic rates of organisms and...,Higher metabolic rates of organisms and longer...,strategy and management


In [75]:
print(len(df))

3970


In [77]:
df.to_excel('predicted_labels.xlsx', index=False)