### **Loading Libraries and the Model**

In [7]:
import torch
from transformers import BertTokenizer, BertModel
import logging
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import ast
import os
from tqdm import tqdm
# Configure the root logger so that INFO-level (and higher) records are shown.
logging.basicConfig(level=logging.INFO)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Load the pretrained BERT tokenizer and encoder, then move the encoder's
# weights onto the selected device.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)

# Switch the encoder to evaluation mode for inference. Being the last
# expression of the cell, this also displays the model architecture.
model.eval()

cuda


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

### **Looping Mechanism**

In [8]:
# Location of the sample CSV that the rest of the notebook reads and updates.
csv_file = 'multi_x_science/sample_test.csv'

# Load the comma-separated file and preview its first five rows.
df = pd.read_csv(csv_file, sep=',')
df.head(5)

Unnamed: 0,abstracts,summary,num_abstracts,cluster_c2
0,['This paper aims at leveraging microblogs to ...,Recognizing Musical Entities is important for ...,6,0.0
1,['The accuracy of optical flow estimation algo...,Current algorithmic approaches for piecewise a...,17,0.0
2,['Cloud computing providers are now offering t...,Abstract Maximizing resource utilization by pe...,5,0.0
3,['The alignment of code in the flash memory of...,This article examines dynamic energy consumpti...,6,0.0
4,['Convolutional networks are at the core of mo...,Semantic segmentation is a challenging vision ...,4,1.0


In [9]:
csv_file = 'multi_x_science/sample_test.csv'
cluster_name = 'cluster_c3'


def first_pending_position(frame, column):
    """Return the positional index of the first row whose `column` is null.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the per-row results.
    column : str
        Name of the result column to inspect.

    Returns
    -------
    int
        Position (suitable for ``frame.iloc[pos:]``) of the first row that
        still has no result. When no row is pending -- including an empty
        frame -- ``len(frame)`` is returned so that ``frame.iloc[pos:]`` is
        empty and already finished work is not recomputed.
    """
    pending = frame[column].isnull().to_numpy()
    # argmax on a boolean array gives the position of the first True.
    return int(pending.argmax()) if pending.any() else len(frame)


# Check whether the CSV file already has the result column; if not, create it
# filled with NaN so that every row counts as "not processed yet".
if os.path.exists(csv_file):
    df = pd.read_csv(csv_file)
    if cluster_name not in df.columns:
        df[cluster_name] = np.nan
else:
    # No file yet: start from an empty table with the columns the loop uses.
    df = pd.DataFrame(columns=['abstracts', cluster_name])

# Position where clustering starts or resumes. It is positional (not an index
# label) because the processing loop slices the frame with `iloc`.
start_index = first_pending_position(df, cluster_name)

In [10]:
def check_eval_list(eval_list, cluster_indices):
    """Tell whether two groupings of indices describe the same partition.

    Each argument is a list of groups, and each group is an iterable of
    hashable items (row positions in this notebook). Order inside a group is
    ignored, and so is the order of the groups themselves: the ids a
    clustering algorithm assigns to its clusters are arbitrary, so a correct
    grouping must not be rejected only because its labels are permuted.

    Parameters
    ----------
    eval_list : list of iterables
        The expected groups.
    cluster_indices : list of iterables
        The groups produced by the clustering.

    Returns
    -------
    int
        1 if both arguments contain exactly the same groups (compared as a
        multiset of sets), 0 otherwise -- including when the number of groups
        differs.
    """
    expected_groups = [frozenset(group) for group in eval_list]
    remaining = [frozenset(group) for group in cluster_indices]
    if len(expected_groups) != len(remaining):
        return 0
    for group in expected_groups:
        if group not in remaining:
            return 0
        # Consume the match so that repeated groups are counted correctly.
        remaining.remove(group)
    return 1

In [11]:
# Number of clusters requested from KMeans for every row. The expected
# partition built below is only defined for 2 or 3 clusters.
num_clusters = 3
# Fixed seed so that re-running the notebook reproduces the same clustering.
kmeans_seed = 0

# Iterate through each row in the CSV, starting or resuming from 'start_index'.
# `initial` makes the progress bar start at the resume point; tqdm's
# `position` argument is the on-screen line offset of the bar, not a counter.
for index, row in tqdm(df.iloc[start_index:].iterrows(), total=len(df), initial=start_index):
    # The 'abstracts' cell stores a Python list literal of strings.
    text_list = ast.literal_eval(row['abstracts'])
    text_list_len = len(text_list)

    if text_list_len < num_clusters:
        # KMeans cannot be fitted with fewer samples than clusters, so the
        # expected partition cannot be recovered: record a miss and move on
        # instead of aborting the whole run.
        df.at[index, cluster_name] = 0
        df.to_csv(csv_file, index=False)
        continue

    # Expected partition used for scoring: the trailing texts are set apart
    # from all the earlier ones (the last two together for 2 clusters, each
    # of the last two on its own for 3 clusters).
    eval_list = [[] for _ in range(num_clusters)]
    if num_clusters == 2:
        eval_list[1] = [text_list_len - 2, text_list_len - 1]
        eval_list[0] = list(range(text_list_len - 2))
    elif num_clusters == 3:
        eval_list[2] = [text_list_len - 1]
        eval_list[1] = [text_list_len - 2]
        eval_list[0] = list(range(text_list_len - 2))

    embeddings = []
    for text in text_list:
        # Each text is encoded on its own, so no padding is requested:
        # padding to a fixed length would add [PAD] positions that the plain
        # mean below would average into the embedding.
        tokenized_text = tokenizer(text, truncation=True, max_length=500, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**tokenized_text)
        # Mean-pool the token vectors of the single sequence in the batch.
        embeddings.append(outputs.last_hidden_state[0].mean(dim=0).cpu().numpy())

    # Apply clustering using KMeans.
    kmeans = KMeans(n_clusters=num_clusters, n_init='auto', random_state=kmeans_seed)
    cluster_labels = kmeans.fit_predict(embeddings)

    # Group the text positions by the cluster label they were assigned.
    cluster_indices = [[] for _ in range(num_clusters)]
    for text_pos, label in enumerate(cluster_labels):
        cluster_indices[label].append(text_pos)

    # Store 1 when the clustering matches the expected partition, else 0.
    df.at[index, cluster_name] = 1 if check_eval_list(eval_list, cluster_indices) else 0
    # Save after every row so that an interrupted run can resume later.
    df.to_csv(csv_file, index=False)


  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values = values.astype(str)
  values =

In [12]:
# Re-read the CSV from disk so the preview reflects what the loop saved.
csv_file = 'multi_x_science/sample_test.csv'

# Load the comma-separated file and show the first five rows, which now
# include the newly written result column.
df = pd.read_csv(csv_file, sep=',')
df.head(5)

Unnamed: 0,abstracts,summary,num_abstracts,cluster_c2,cluster_c3
0,['This paper aims at leveraging microblogs to ...,Recognizing Musical Entities is important for ...,6,0.0,0.0
1,['The accuracy of optical flow estimation algo...,Current algorithmic approaches for piecewise a...,17,0.0,0.0
2,['Cloud computing providers are now offering t...,Abstract Maximizing resource utilization by pe...,5,0.0,0.0
3,['The alignment of code in the flash memory of...,This article examines dynamic energy consumpti...,6,0.0,0.0
4,['Convolutional networks are at the core of mo...,Semantic segmentation is a challenging vision ...,4,1.0,0.0
