<h2> Introduction </h2>
This Jupyter Notebook last updated by Vijay Yevatkar (vjyvtkr@seas.upenn.edu) builds on work done by Shrivats Agrawal (shriv9@seas.upenn.edu) and Ssuying Chen (cssuying@umich.edu) and Emily Oxford (eoxford@umich.edu) to train a BERT model.

<h5> GDELT: </h5>
The GDELT Project (Global Database of Events, Language, and Tone) is the largest, most comprehensive, and highest resolution open database of human society as of now. It connects the world's people, locations, organizations, themes, counts, images, and emotions into a single holistic network over the globe. There are various analysis tools under the GDELT Analysis Service like Event Geographic Network, GKG Heatmap, GKG Word Cloud, and GKG Thematic Timeline.

<h2> Aim: </h2>
The goal of this script is to provide an effective solution to tag datasets fetched from the AWS GDelt database with the corresponding sectors. In that pursuit a Bidirectional Encoder Representations from Transformers (BERT) model is used for predictions.
 

## Importing Libraries

In [2]:
%%capture

!pip install pip --upgrade
!pip install torch
!pip install transformers
!pip install openpyxl
!pip install ipywidgets
!pip install smart_open

In [3]:
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification

import boto3
from datetime import datetime
from tqdm.notebook import tqdm
import gc, io, re, time, openpyxl

from io import BytesIO 
from smart_open import open as smart_open

## User Inputs

In [4]:
# YEAR = "2015"
# MONTH = "03"

# #----For Reading -----
# S3_LOCATION = "ATHENA_BERT_RESULTS_AFRICA_2015_2023/" + YEAR + "/" + MONTH
# S3_BUCKET = 'sector-classification'
# S3_CLIENT = boto3.client('s3')

# #---For Saving------
# file_location = "Bert_Results_Africa_2015_2023"
# op_file_format = "parquet"

## Functions

In [5]:
# # Function to read parquet files and return a dataframe
# def read_parquet_file(bucket, file_key):
#     obj = S3_CLIENT.get_object(Bucket = bucket, Key = file_key)
#     return pd.read_parquet(io.BytesIO(obj['Body'].read()))


# #Return a list of all parquet files in the folder
# def return_all_files(bucket, location):
#     my_bucket = boto3.Session().resource('s3').Bucket(bucket)
#     files_list = []
#     for file in my_bucket.objects.filter(Prefix = location):
#         file_dict = {"bucket": file.bucket_name, "file_key": file.key}
#         files_list.append(file_dict)
#     return files_list


#Labels Dict
def get_labels_dict():
    file_key = "Data/sasb_full_training.xlsx"
    obj = S3_CLIENT.get_object(Bucket = S3_BUCKET, Key=file_key)

    sasb_data = pd.read_excel(io.BytesIO(obj['Body'].read()), engine = 'openpyxl')
    sasb_data.drop(["Unnamed: 0"], axis=1, inplace=True)
    labels = sasb_data.Sector.unique()
    labels = np.delete(labels, 11)
    
    return {index: label for index, label in enumerate(sorted(labels))}


# #Function to clean and generate column for prediction
# def obtain_text(row):
#     replace_points=['.',"/","\n","https","http",":","www","  "]
#     source=str(row['sourcecommonname'])
#     doc_identifier=str(row['documentidentifier'])
#     themes=str(row['themes'])
#     for replace_symbol in replace_points:
#         source=source.replace(replace_symbol," ")
#         doc_identifier=doc_identifier.replace(replace_symbol," ")

#     try:
#         themes_text= " ".join(re.findall(".*?theme=(.*?),",themes))
#     except Exception as e:
#         themes_text = ""
  

#     pred_text=" ".join([source,doc_identifier,themes_text])
#     pred_text=pred_text.replace("_"," ")

#     text_list=pred_text.split(" ")
#     text_list_unique=list(dict.fromkeys(text_list))
#     pred_text= " ".join(text_list_unique[:200])
  
#     return pred_text[:1000]


#Assigning text labels to categorical numbers
# label_dict = get_labels_dict()
# def return_labels(label_number):
#     global label_dict
#     try:
#         return label_dict[label_number]
#     except Exception as e:
#         return ""


## Loading Pre_Trained Model

In [6]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
#                                                       num_labels = len(label_dict),
#                                                       output_attentions = False,
#                                                       output_hidden_states = False)
# model.to(device)
# model.load_state_dict(torch.load('Bert_Models/second_finetuned_BERT_epoch_5.model', map_location=torch.device('cpu')))


# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
#                                                       num_labels = len(label_dict),
#                                                       output_attentions = False,
#                                                       output_hidden_states = False)
# model.to(device)


# load_path = "s3://zsguo/sagemaker-automated-execution/Bert_Models/second_finetuned_BERT_epoch_5.model"
# with smart_open(load_path, 'rb') as f:
#     buffer = io.BytesIO(f.read())
#     # model.load_state_dict(torch.load(buffer))
#     model.load_state_dict(torch.load(buffer, map_location=torch.device('cpu')))

# Classification of Sectors

In [None]:
# def make_predictions(df_batch, batch_size):
    
#     max_count = df_batch.shape[0]
#     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
#     predictions_list=[]  

#     for i in tqdm(range(0, max_count, batch_size)):
#         test_data = list(df_batch[i : i + batch_size]['pred_text'])
#         predicted = [-1 for x in range(len(test_data))]
#         try:
#             with torch.cuda.amp.autocast():
#                 encoded_data_test = tokenizer.batch_encode_plus(
#                     test_data[:batch_size],
#                     add_special_tokens=True, 
#                     return_attention_mask=True, 
#                     padding=True, 
#                     max_length=384,
#                     return_tensors='pt'
#                 )
#             input_ids_moderna = encoded_data_test['input_ids']
#             attention_masks_moderna = encoded_data_test['attention_mask']
#             output_moderna = model(input_ids_moderna.to(device))
#             _, predicted = torch.max(output_moderna[0], 1)
#             predicted = predicted.tolist()
#         except Exception as e:
#             print(e)

#         predictions_list.extend(predicted)

#         if(i%(50*batch_size)==0):
#             torch.cuda.empty_cache()
        
#     return predictions_list


# #Saving df to S3
# def save_df_to_s3(df, file_location, year, month, counter, file_format):
    
#     assert file_format in ("csv", "parquet"), "File format must be in {csv, parquet}"
    
#     file_name = f"{month}{year[-2:]}_{counter}.{file_format}"
#     file_key = file_location + "/" + year + "/" + month + "/" + file_name
#     if file_format == "csv":
#         with io.StringIO() as buffer:
#             df.to_csv(buffer, index=False)
#             response = S3_CLIENT.put_object(Bucket = S3_BUCKET, Key = file_key, Body = buffer.getvalue())
#     else:
#         with io.BytesIO() as buffer:
#             df.to_parquet(buffer, index=False)
#             response = S3_CLIENT.put_object(Bucket = S3_BUCKET, Key = file_key, Body = buffer.getvalue())
            
#     status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
#     success = "Successful" if status == 200 else "Unsuccessful"
#     print(f"\nCSV | {success} S3 put_object response. Status - {status}\n")

In [10]:


#Return a list of all parquet files in the folder
def return_all_files(bucket, location):
    my_bucket = boto3.Session().resource('s3').Bucket(bucket)
    files_list = []
    for file in my_bucket.objects.filter(Prefix = location):
        file_dict = {"bucket": file.bucket_name, "file_key": file.key}
        files_list.append(file_dict)
    return files_list


# Main

In [12]:
# Loop through all the years and months of data
for yr in range(2015, 2023):
    for mo in range(1, 13):
        if yr == 2015 and mo < 3:
            continue
        if yr == 2022 and mo > 10:
            continue
        
        if yr < 2020:
            continue
        
        if yr == 2020 and mo < 12:
            continue
            
        YEAR = str(yr)
        MONTH = '{:02d}'.format(mo)
        
        #----For Reading -----
        S3_LOCATION = "ATHENA_BERT_RESULTS_AFRICA_2015_2023/" + YEAR + "/" + MONTH
        S3_BUCKET = 'sector-classification'
        S3_CLIENT = boto3.client('s3')

        #---For Saving------
        file_location = "Bert_Results_Africa_2015_2023"
        op_file_format = "parquet"
        
        files_list = return_all_files(S3_BUCKET, S3_LOCATION)
        files_list = [file for file in files_list if file['file_key'] != S3_LOCATION + "/"]     # avoid folder itself

        batch_size, df_batch = 32, None
        
        print(f"\n#######################################################################\n #################################{YEAR}-{MONTH}################################## \n#######################################################################\n")
   
        ###############################

        # Function to read parquet files and return a dataframe
        def read_parquet_file(bucket, file_key):
            obj = S3_CLIENT.get_object(Bucket = bucket, Key = file_key)
            return pd.read_parquet(io.BytesIO(obj['Body'].read()))


#         #Return a list of all parquet files in the folder
#         def return_all_files(bucket, location):
#             my_bucket = boto3.Session().resource('s3').Bucket(bucket)
#             files_list = []
#             for file in my_bucket.objects.filter(Prefix = location):
#                 file_dict = {"bucket": file.bucket_name, "file_key": file.key}
#                 files_list.append(file_dict)
#             return files_list


        #Labels Dict
        def get_labels_dict():
            file_key = "Data/sasb_full_training.xlsx"
            obj = S3_CLIENT.get_object(Bucket = S3_BUCKET, Key=file_key)

            sasb_data = pd.read_excel(io.BytesIO(obj['Body'].read()), engine = 'openpyxl')
            sasb_data.drop(["Unnamed: 0"], axis=1, inplace=True)
            labels = sasb_data.Sector.unique()
            labels = np.delete(labels, 11)

            return {index: label for index, label in enumerate(sorted(labels))}


        #Function to clean and generate column for prediction
        def obtain_text(row):
            replace_points=['.',"/","\n","https","http",":","www","  "]
            source=str(row['sourcecommonname'])
            doc_identifier=str(row['documentidentifier'])
            themes=str(row['themes'])
            for replace_symbol in replace_points:
                source=source.replace(replace_symbol," ")
                doc_identifier=doc_identifier.replace(replace_symbol," ")

            try:
                themes_text= " ".join(re.findall(".*?theme=(.*?),",themes))
            except Exception as e:
                themes_text = ""


            pred_text=" ".join([source,doc_identifier,themes_text])
            pred_text=pred_text.replace("_"," ")

            text_list=pred_text.split(" ")
            text_list_unique=list(dict.fromkeys(text_list))
            pred_text= " ".join(text_list_unique[:200])

            return pred_text[:1000]
        
        #Assigning text labels to categorical numbers
        label_dict = get_labels_dict()
        def return_labels(label_number):
            global label_dict
            try:
                return label_dict[label_number]
            except Exception as e:
                return ""
            
        def make_predictions(df_batch, batch_size):
    
            max_count = df_batch.shape[0]
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
            predictions_list=[]  

            for i in tqdm(range(0, max_count, batch_size)):
                test_data = list(df_batch[i : i + batch_size]['pred_text'])
                predicted = [-1 for x in range(len(test_data))]
                try:
                    with torch.cuda.amp.autocast():
                        encoded_data_test = tokenizer.batch_encode_plus(
                            test_data[:batch_size],
                            add_special_tokens=True, 
                            return_attention_mask=True, 
                            padding=True, 
                            max_length=384,
                            return_tensors='pt'
                        )
                    input_ids_moderna = encoded_data_test['input_ids']
                    attention_masks_moderna = encoded_data_test['attention_mask']
                    output_moderna = model(input_ids_moderna.to(device))
                    _, predicted = torch.max(output_moderna[0], 1)
                    predicted = predicted.tolist()
                except Exception as e:
                    print(e)

                predictions_list.extend(predicted)

                if(i%(50*batch_size)==0):
                    torch.cuda.empty_cache()

            return predictions_list


        #Saving df to S3
        def save_df_to_s3(df, file_location, year, month, counter, file_format):

            assert file_format in ("csv", "parquet"), "File format must be in {csv, parquet}"

            file_name = f"{month}{year[-2:]}_{counter}.{file_format}"
            file_key = file_location + "/" + year + "/" + month + "/" + file_name
            if file_format == "csv":
                with io.StringIO() as buffer:
                    df.to_csv(buffer, index=False)
                    response = S3_CLIENT.put_object(Bucket = S3_BUCKET, Key = file_key, Body = buffer.getvalue())
            else:
                with io.BytesIO() as buffer:
                    df.to_parquet(buffer, index=False)
                    response = S3_CLIENT.put_object(Bucket = S3_BUCKET, Key = file_key, Body = buffer.getvalue())

            status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
            success = "Successful" if status == 200 else "Unsuccessful"
            print(f"\nCSV | {success} S3 put_object response. Status - {status}\n")
        
        ###############################
        
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels = len(label_dict),
                                                      output_attentions = False,
                                                      output_hidden_states = False)
        model.to(device)


        load_path = "s3://zsguo/sagemaker-automated-execution/Bert_Models/second_finetuned_BERT_epoch_5.model"
        with smart_open(load_path, 'rb') as f:
            buffer = io.BytesIO(f.read())
        # model.load_state_dict(torch.load(buffer))
        model.load_state_dict(torch.load(buffer, map_location=torch.device('cpu')))
    
    
    
        ################################################
        for j in range(len(files_list)):

            for i in range(2):
                print("_______________________________________________________________________________")

            file_key = files_list[j]['file_key']
            curr_bucket = files_list[j]['bucket']


            # avoid folder itself
            if file_key == S3_LOCATION + "/":
                continue


            print(f"File: {j+1} | {file_key}")

            #Memory clear step
            del df_batch
            for _ in range(3):
                gc.collect()
                time.sleep(1)
                torch.cuda.empty_cache()

            df_batch = read_parquet_file(curr_bucket, file_key)
            print("df Shape:",df_batch.shape)
            print("-> Pre-proccesing file")
            tstart = datetime.now()
            df_batch['pred_text']=df_batch.apply(obtain_text,axis=1)
            print("Preprocessing time:", datetime.now() - tstart)
            print("-> Ready for prediction!")

            #Making predictions
            predictions_list = make_predictions(df_batch, batch_size)
            print("-> Prediction Complete. \n->Preparing to save file.")
            tstart = datetime.now()
            df_batch['SASB_Tag'] = predictions_list
            df_batch['Predicted_Sector'] = df_batch['SASB_Tag'].apply(return_labels)
            df_batch = df_batch[df_batch['SASB_Tag']!=-1].reset_index(drop=True)
            save_df_to_s3(df_batch, file_location, YEAR, MONTH, j+1, op_file_format)
            print("-> File saved! Time for saving file:", datetime.now() - tstart)
            time.sleep(5)
        


#######################################################################
 #################################2015-03################################## 
#######################################################################



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

_______________________________________________________________________________
_______________________________________________________________________________
File: 1 | ATHENA_BERT_RESULTS_AFRICA_2015_2023/2015/03/20230221_051009_00095_rxfkq_01c33171-29bb-40da-a807-449ad82462f1
df Shape: (32645, 16)
-> Pre-proccesing file
Preprocessing time: 0:01:02.907452
-> Ready for prediction!


  0%|          | 0/1021 [00:00<?, ?it/s]

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


-> Prediction Complete. 
->Preparing to save file.

CSV | Successful S3 put_object response. Status - 200

-> File saved! Time for saving file: 0:00:04.615143
_______________________________________________________________________________
_______________________________________________________________________________
File: 2 | ATHENA_BERT_RESULTS_AFRICA_2015_2023/2015/03/20230221_051009_00095_rxfkq_05c204c0-6c61-4d83-a464-32e2cf619519
df Shape: (33444, 16)
-> Pre-proccesing file
Preprocessing time: 0:01:02.859211
-> Ready for prediction!


  0%|          | 0/1046 [00:00<?, ?it/s]

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


-> Prediction Complete. 
->Preparing to save file.

CSV | Successful S3 put_object response. Status - 200

-> File saved! Time for saving file: 0:00:04.656259
_______________________________________________________________________________
_______________________________________________________________________________
File: 3 | ATHENA_BERT_RESULTS_AFRICA_2015_2023/2015/03/20230221_051009_00095_rxfkq_0d3c3d4b-5292-428a-86e7-7bee360a7b2b
df Shape: (31350, 16)
-> Pre-proccesing file
Preprocessing time: 0:01:00.606607
-> Ready for prediction!


  0%|          | 0/980 [00:00<?, ?it/s]

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


KeyboardInterrupt: 

In [None]:
# YEAR = "2015"
# MONTH = "03"

# #----For Reading -----
# S3_LOCATION = "ATHENA_BERT_RESULTS_AFRICA_2015_2023/" + YEAR + "/" + MONTH
# S3_BUCKET = 'sector-classification'
# S3_CLIENT = boto3.client('s3')

# #---For Saving------
# file_location = "Bert_Results_Africa_2015_2023"
# op_file_format = "parquet"

In [None]:
# S3_LOCATION

In [None]:
# files_list = return_all_files(S3_BUCKET, S3_LOCATION)

# files_list = [file for file in files_list if file['file_key'] != S3_LOCATION + "/"]     # avoid folder itself
# lenfiles_list

In [None]:
# batch_size, df_batch = 32, None

# for j in range(len(files_list)):
    
#     for i in range(2):
#         print("_______________________________________________________________________________")

#     file_key = files_list[j]['file_key']
#     curr_bucket = files_list[j]['bucket']

        
#     # avoid folder itself
#     if file_key == S3_LOCATION + "/":
#         continue
    
    
#     print(f"File: {j+1} | {file_key}")

#     #Memory clear step
#     del df_batch
#     for _ in range(3):
#         gc.collect()
#         time.sleep(1)
#         torch.cuda.empty_cache()

#     df_batch = read_parquet_file(curr_bucket, file_key)
#     print("df Shape:",df_batch.shape)
#     print("-> Pre-proccesing file")
#     tstart = datetime.now()
#     df_batch['pred_text']=df_batch.apply(obtain_text,axis=1)
#     print("Preprocessing time:", datetime.now() - tstart)
#     print("-> Ready for prediction!")
    
#     #Making predictions
#     predictions_list = make_predictions(df_batch, batch_size)
#     print("-> Prediction Complete. \n->Preparing to save file.")
#     tstart = datetime.now()
#     df_batch['SASB_Tag'] = predictions_list
#     df_batch['Predicted_Sector'] = df_batch['SASB_Tag'].apply(return_labels)
#     df_batch = df_batch[df_batch['SASB_Tag']!=-1].reset_index(drop=True)
#     save_df_to_s3(df_batch, file_location, YEAR, MONTH, j+1, op_file_format)
#     print("-> File saved! Time for saving file:", datetime.now() - tstart)
#     time.sleep(5)