In [None]:
# Install the Hugging Face `transformers` package (BertTokenizer is imported from it below).
# NOTE(review): the version is not pinned — consider pinning the one used for the recorded run.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import glob
from nltk import tokenize
from transformers import BertTokenizer, TFBertModel, BertConfig
from transformers.utils.dummy_tf_objects import TFBertMainLayer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow import convert_to_tensor
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall
import numpy as np
import nltk
# Download NLTK's 'punkt' package (sentence-tokenizer data).
nltk.download('punkt')
# Read the input CSV into `df`; the following cells use its
# "SDG", "SDGs_All", "Abstract" and "DOI" columns.
import pandas as pd
df=pd.read_csv("/content/abstract_only_doi.csv")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Preview the first rows of the loaded CSV.
df.head()

Unnamed: 0.1,Unnamed: 0,Text,SDG,SDGs_All,Abstract,DOI,SDG-Predictions,Probabilities,Predictions,Primary SDG Predicted
0,10,[TITLE] The true costs of cesarean delivery fo...,1,"1, 3, 5, 10",Introduction While it is recognized that there...,10.1186/s12939-022-01664-x,"['1', '5', '10']","[0.44713575, 0.22672635, 0.1467481]","{'1': 0.44713575, '5': 0.22672635, '10': 0.146...",1
1,11,[TITLE] The Association of Race Ethnicity and ...,1,"1, 3, 16",OBJECTIVES To determine the role of raceethnic...,10.1542/peds.2021-053346,"['1', '10']","[0.7201732, 0.14859931]","{'1': 0.7201732, '10': 0.14859931}",1
2,12,[TITLE] Psychotic experiences among informal c...,1,"1, 3, 10",Purpose Informal caregivers may be at high ris...,10.1007/s00127-022-02312-z,"['5', '16']","[0.5642183, 0.20469989]","{'5': 0.5642183, '16': 0.20469989}",5
3,13,[TITLE] The application of GIS in homelessness...,1,"1, 10",GIS is increasingly popular in the study of co...,10.1016/j.healthplace.2022.102776,"['1', '10']","[0.47827235, 0.3172999]","{'1': 0.47827235, '10': 0.3172999}",1
4,14,[TITLE] To Punish Parent or Palliate Governing...,1,1,Studies of poverty governance typically emphas...,10.1177/00031224221116145,['1'],[0.8682397],{'1': 0.8682397},1


In [None]:
# Keep only the columns needed downstream.
# `.copy()` makes `data` an independent DataFrame rather than a slice of `df`,
# so adding prediction columns to it later does not raise
# SettingWithCopyWarning.
data = df[["SDG", "SDGs_All", "Abstract", "DOI"]].copy()
data.head()

Unnamed: 0,SDG,SDGs_All,Abstract,DOI
0,1,"1, 3, 5, 10",Introduction While it is recognized that there...,10.1186/s12939-022-01664-x
1,1,"1, 3, 16",OBJECTIVES To determine the role of raceethnic...,10.1542/peds.2021-053346
2,1,"1, 3, 10",Purpose Informal caregivers may be at high ris...,10.1007/s00127-022-02312-z
3,1,"1, 10",GIS is increasingly popular in the study of co...,10.1016/j.healthplace.2022.102776
4,1,1,Studies of poverty governance typically emphas...,10.1177/00031224221116145


In [None]:
# Collect the abstracts as a plain Python list.
text=data["Abstract"].tolist()

In [None]:
# Load the pretrained 'bert-base-multilingual-uncased' BERT tokenizer.
tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

In [None]:
# Map every SDG label "1" .. "17" to its integer value, in ascending order.
encoded_dict = {str(sdg_number): sdg_number for sdg_number in range(1, 18)}

# The pretrained model can be downloaded from https://zenodo.org/record/7095784#.Y3vtH3bMKz5
# It is the same model Aurora uses in their program and has made available on Zenodo.
model = tf.keras.models.load_model(
    "/content/drive/MyDrive/Colab Notebooks/sdgs_multiclass_16.h5"
)

In [None]:
# The functions defined here are taken from Aurora's GitHub predict.py file:
# https://github.com/Aurora-Network-Global/sdgs_many_berts/blob/main/predict.py

def tokenize_abstracts(abstracts):
    """Add BERT marker tokens to each abstract.

    Every abstract is split into sentences with ``tokenize.sent_tokenize``.
    The result starts with ``"[CLS] "`` and each sentence is followed by
    ``" [SEP] "``.

    Args:
        abstracts: iterable of abstract strings.

    Returns:
        A list with one marked-up string per abstract, in input order.
    """
    return [
        "[CLS] " + "".join(
            sentence + " [SEP] "
            for sentence in tokenize.sent_tokenize(abstract)
        )
        for abstract in abstracts
    ]

# Tokenizer used by b_tokenize_abstracts and convert_to_ids below.
# NOTE(review): the same pretrained tokenizer is already loaded in an earlier
# cell; this reload looks redundant — confirm before removing.
tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenize marked-up abstracts with the module-level ``tokenizer``.

    Args:
        t_abstracts: iterable of strings (e.g. the output of tokenize_abstracts).
        max_len: maximum number of tokens kept per abstract; the rest is dropped.

    Returns:
        A list of token lists, one per input string, each at most ``max_len`` long.
    """
    clipped_tokens = []
    for marked_text in t_abstracts:
        word_pieces = tokenizer.tokenize(marked_text)
        clipped_tokens.append(word_pieces[:max_len])
    return clipped_tokens

def convert_to_ids(b_t_abstracts):
    """Convert token lists to vocabulary ids with the module-level ``tokenizer``.

    Args:
        b_t_abstracts: iterable of token lists.

    Returns:
        A list holding, for each token list, the result of
        ``tokenizer.convert_tokens_to_ids``.
    """
    id_sequences = []
    for token_list in b_t_abstracts:
        id_sequences.append(tokenizer.convert_tokens_to_ids(token_list))
    return id_sequences


def abstracts_to_ids(abstracts):
    """Run the full text-to-ids pipeline on raw abstracts.

    Chains tokenize_abstracts -> b_tokenize_abstracts -> convert_to_ids.

    Args:
        abstracts: iterable of abstract strings.

    Returns:
        The value returned by convert_to_ids for the tokenized abstracts.
    """
    return convert_to_ids(b_tokenize_abstracts(tokenize_abstracts(abstracts)))

def pad_ids(input_ids, max_len=512):
    """Pad or truncate id sequences to a common length via ``pad_sequences``.

    Both padding and truncation are applied at the end ("post") of each
    sequence, and the result uses dtype "long".

    Args:
        input_ids: list of id sequences.
        max_len: target length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The array returned by ``pad_sequences``.
    """
    return pad_sequences(
        input_ids,
        maxlen=max_len,
        dtype="long",
        padding="post",
        truncating="post",
    )


def create_attention_masks(inputs):
    """Build a mask for each id sequence.

    A position gets 1.0 when its id is greater than 0 and 0.0 otherwise.

    Args:
        inputs: iterable of id sequences.

    Returns:
        A list of lists of floats with the same layout as ``inputs``.
    """
    return [
        [float(token_id > 0) for token_id in sequence]
        for sequence in inputs
    ]

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
# RUN: turn the raw abstracts into the two tensors used for prediction.

abstracts = text

# Token ids for every abstract, then padded to a common length.
ids = abstracts_to_ids(abstracts)
padded_ids = pad_ids(ids)

# Token-id tensor.
inputs = convert_to_tensor(padded_ids)

# Attention-mask tensor built from the padded ids.
masks = convert_to_tensor(create_attention_masks(padded_ids))

In [None]:
# Collect, for every abstract, all SDGs whose predicted probability is
# above the threshold.
PROBABILITY_THRESHOLD = 0.1
PREDICT_BATCH_SIZE = 16

# Score all abstracts with a single batched call. Keras' predict() is designed
# for batch processing and is not intended to be called once per sample inside
# a Python loop (slow, and it re-enters the predict machinery every iteration).
if len(inputs):
  all_probabilities = model.predict([inputs, masks], batch_size=PREDICT_BATCH_SIZE)
else:
  # Nothing to score; keep the result an empty list, as the per-sample loop did.
  all_probabilities = []

predictions = []
for class_probabilities in all_probabilities:
  # encoded_dict's keys are paired positionally with the model's output
  # columns; zip stops at the shorter of the two.
  sdg_dict = {
      key: value
      for key, value in zip(encoded_dict.keys(), class_probabilities)
      if value > PROBABILITY_THRESHOLD
  }
  predictions.append(sdg_dict)



In [None]:
# Split every prediction dict into two parallel lists:
# the SDG labels (keys) and their probabilities (values).
sdgs = [list(sdg_to_probability) for sdg_to_probability in predictions]
probabilities = [
    list(sdg_to_probability.values()) for sdg_to_probability in predictions
]

In [None]:
# `data` was created by selecting columns from `df`, so pandas treats it as a
# possible view and emits SettingWithCopyWarning on column assignment.
# Detach it with an explicit copy before adding the prediction columns.
data = data.copy()
data["SDG-Predictions"] = sdgs      # SDG labels above the threshold
data["Probabilities"] = probabilities  # matching probabilities, same order
data["Predictions"] = predictions   # label -> probability dict per abstract

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Preview the frame with the new prediction columns.
data.head()

Unnamed: 0,SDG,SDGs_All,Abstract,DOI,SDG-Predictions,Probabilities,Predictions
0,1,"1, 3, 5, 10",Introduction While it is recognized that there...,10.1186/s12939-022-01664-x,"[1, 5, 10]","[0.45260605, 0.1461877, 0.26832375]","{'1': 0.45260605, '5': 0.1461877, '10': 0.2683..."
1,1,"1, 3, 16",OBJECTIVES To determine the role of raceethnic...,10.1542/peds.2021-053346,"[1, 10]","[0.7201732, 0.14859931]","{'1': 0.7201732, '10': 0.14859931}"
2,1,"1, 3, 10",Purpose Informal caregivers may be at high ris...,10.1007/s00127-022-02312-z,"[5, 16]","[0.5642183, 0.20469989]","{'5': 0.5642183, '16': 0.20469989}"
3,1,"1, 10",GIS is increasingly popular in the study of co...,10.1016/j.healthplace.2022.102776,"[1, 10]","[0.47827235, 0.3172999]","{'1': 0.47827235, '10': 0.3172999}"
4,1,1,Studies of poverty governance typically emphas...,10.1177/00031224221116145,[1],[0.8682397],{'1': 0.8682397}


In [None]:
# Find, for every abstract, the SDG with the highest predicted probability.
PREDICT_BATCH_SIZE = 16

if len(inputs):
  # One batched predict() call instead of one call per abstract: Keras'
  # predict() is meant for batch processing, not for use inside a Python loop.
  class_probabilities = model.predict([inputs, masks], batch_size=PREDICT_BATCH_SIZE)
  # argmax yields a 0-based column index per row; SDG numbers start at 1.
  primary_sdgs = list(np.argmax(class_probabilities, axis=1) + 1)
else:
  # No abstracts: keep the result an empty list, as the per-sample loop did.
  primary_sdgs = []



In [None]:
# Detach `data` from the frame it was sliced from before assigning, so the
# column assignment does not raise SettingWithCopyWarning.
data = data.copy()
data["Primary SDG"] = primary_sdgs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Preview the frame including the "Primary SDG" column.
data.head()

Unnamed: 0,SDG,SDGs_All,Abstract,DOI,SDG-Predictions,Probabilities,Predictions,Primary SDG
0,1,"1, 3, 5, 10",Introduction While it is recognized that there...,10.1186/s12939-022-01664-x,"[1, 5, 10]","[0.45260605, 0.1461877, 0.26832375]","{'1': 0.45260605, '5': 0.1461877, '10': 0.2683...",1
1,1,"1, 3, 16",OBJECTIVES To determine the role of raceethnic...,10.1542/peds.2021-053346,"[1, 10]","[0.7201732, 0.14859931]","{'1': 0.7201732, '10': 0.14859931}",1
2,1,"1, 3, 10",Purpose Informal caregivers may be at high ris...,10.1007/s00127-022-02312-z,"[5, 16]","[0.5642183, 0.20469989]","{'5': 0.5642183, '16': 0.20469989}",5
3,1,"1, 10",GIS is increasingly popular in the study of co...,10.1016/j.healthplace.2022.102776,"[1, 10]","[0.47827235, 0.3172999]","{'1': 0.47827235, '10': 0.3172999}",1
4,1,1,Studies of poverty governance typically emphas...,10.1177/00031224221116145,[1],[0.8682397],{'1': 0.8682397},1


In [None]:
# Write the results without the row index. Saving the index adds an unnamed
# column that shows up as "Unnamed: 0" (then "Unnamed: 0.1", ...) each time the
# file is read back and saved again.
data.to_csv("/content/predictions_new.csv", index=False)