In [1]:
!pip install transformers datasets torch scikit-learn



In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [4]:
# Load the CSV file into a DataFrame with explicit encoding
df = pd.read_csv('/Users/cherishkohli/new_BERT_training_dataset.csv', encoding='ISO-8859-1')

# Display the first few rows to verify the data
df.head()


Unnamed: 0,\nProject Title,Project Description,Major
0,Dynamics of Human Trust of AI,The purpose of this research is to investigate...,Data Science
1,AI-Powered Avatar Generation,Research Problem : Creating lifelike avatars t...,"Data Science, Computer Science"
2,Explainable Hospital Readmission Prediction wi...,The Australian Commission on Safety and Qualit...,"Data Science, Computer Science"
3,Bushfire Path prediction,"Enhance the FirePath platform, built by OreFox...",Data Science
4,Identification of Porphyry Deposits from Magne...,Research Problem \nPorphyry deposits are signi...,Data Science


In [5]:
# Replace newline characters with spaces in the 'Project Description' column
df['Project Description'] = df['Project Description'].replace('\n', ' ', regex=True)

# Verify the data after cleaning
df.head()


Unnamed: 0,\nProject Title,Project Description,Major
0,Dynamics of Human Trust of AI,The purpose of this research is to investigate...,Data Science
1,AI-Powered Avatar Generation,Research Problem : Creating lifelike avatars t...,"Data Science, Computer Science"
2,Explainable Hospital Readmission Prediction wi...,The Australian Commission on Safety and Qualit...,"Data Science, Computer Science"
3,Bushfire Path prediction,"Enhance the FirePath platform, built by OreFox...",Data Science
4,Identification of Porphyry Deposits from Magne...,Research Problem Porphyry deposits are signif...,Data Science


In [6]:
# Strip any leading/trailing whitespace or newlines from the column headers
df.columns = df.columns.str.strip()

# Verify the column headers to ensure they are cleaned
print(df.columns)


Index(['Project Title', 'Project Description', 'Major'], dtype='object')


In [7]:
df.head()

Unnamed: 0,Project Title,Project Description,Major
0,Dynamics of Human Trust of AI,The purpose of this research is to investigate...,Data Science
1,AI-Powered Avatar Generation,Research Problem : Creating lifelike avatars t...,"Data Science, Computer Science"
2,Explainable Hospital Readmission Prediction wi...,The Australian Commission on Safety and Qualit...,"Data Science, Computer Science"
3,Bushfire Path prediction,"Enhance the FirePath platform, built by OreFox...",Data Science
4,Identification of Porphyry Deposits from Magne...,Research Problem Porphyry deposits are signif...,Data Science


In [8]:
# Separate the features (project descriptions) and labels (majors)
project_descriptions = df['Project Description'].values
majors = df['Major'].values

# Display a sample to verify
print(f"Sample Project Description: {project_descriptions[0]}")
print(f"Corresponding Major(s): {majors[0]}")


Sample Project Description: The purpose of this research is to investigate the development of trust when interacting with an artificial intelligence (AI) system, to better understand  cognitive activity of humans while interacting with AI systems.     Findings of this study will improve our understanding of the various  dimensions of trust, and the ways in which trust dynamically develops in  interactions, such as human-AI systems. This will provide insights to improve  human-AI performance and efficiency in joint analytic tasks, such as  recommender systems and human-AI teaming.  As your project will contribute to an ongoing study into dynamic cognitive  trust in AI, using quantum-like models, you are expected to focus on the  following issues:  -investigate the literature on human trust in AI and its dimensions  (such as reliability and benevolence);  -work with the project supervisor to design an experiment and online  questionnaire that collects data about an aspect of human trust 

In [9]:
# Import the BERT tokenizer from Hugging Face
from transformers import BertTokenizer

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the project descriptions
tokenized_descriptions = [tokenizer.encode(desc, add_special_tokens=True, max_length=512, truncation=True) for desc in project_descriptions]

# Check the first tokenized project description
print(f"First tokenized description: {tokenized_descriptions[0]}")


First tokenized description: [101, 1996, 3800, 1997, 2023, 2470, 2003, 2000, 8556, 1996, 2458, 1997, 3404, 2043, 21935, 2007, 2019, 7976, 4454, 1006, 9932, 1007, 2291, 1010, 2000, 2488, 3305, 10699, 4023, 1997, 4286, 2096, 21935, 2007, 9932, 3001, 1012, 9556, 1997, 2023, 2817, 2097, 5335, 2256, 4824, 1997, 1996, 2536, 9646, 1997, 3404, 1010, 1998, 1996, 3971, 1999, 2029, 3404, 8790, 3973, 11791, 1999, 10266, 1010, 2107, 2004, 2529, 1011, 9932, 3001, 1012, 2023, 2097, 3073, 20062, 2000, 5335, 2529, 1011, 9932, 2836, 1998, 8122, 1999, 4101, 23521, 8518, 1010, 2107, 2004, 16755, 2121, 3001, 1998, 2529, 1011, 9932, 27025, 1012, 2004, 2115, 2622, 2097, 9002, 2000, 2019, 7552, 2817, 2046, 8790, 10699, 3404, 1999, 9932, 1010, 2478, 8559, 1011, 2066, 4275, 1010, 2017, 2024, 3517, 2000, 3579, 2006, 1996, 2206, 3314, 1024, 1011, 8556, 1996, 3906, 2006, 2529, 3404, 1999, 9932, 1998, 2049, 9646, 1006, 2107, 2004, 15258, 1998, 3841, 6777, 9890, 5897, 1007, 1025, 1011, 2147, 2007, 1996, 2622, 12366,



In [11]:
import torch

# Pad all tokenized sequences to the same length (BERT's maximum length is 512 tokens)
MAX_LEN = 512
padded_descriptions = torch.nn.utils.rnn.pad_sequence([torch.tensor(t) for t in tokenized_descriptions], 
                                                      batch_first=True, 
                                                      padding_value=0)

# Check the first padded sequence
print(f"First padded description: {padded_descriptions[0]}")


First padded description: tensor([  101,  1996,  3800,  1997,  2023,  2470,  2003,  2000,  8556,  1996,
         2458,  1997,  3404,  2043, 21935,  2007,  2019,  7976,  4454,  1006,
         9932,  1007,  2291,  1010,  2000,  2488,  3305, 10699,  4023,  1997,
         4286,  2096, 21935,  2007,  9932,  3001,  1012,  9556,  1997,  2023,
         2817,  2097,  5335,  2256,  4824,  1997,  1996,  2536,  9646,  1997,
         3404,  1010,  1998,  1996,  3971,  1999,  2029,  3404,  8790,  3973,
        11791,  1999, 10266,  1010,  2107,  2004,  2529,  1011,  9932,  3001,
         1012,  2023,  2097,  3073, 20062,  2000,  5335,  2529,  1011,  9932,
         2836,  1998,  8122,  1999,  4101, 23521,  8518,  1010,  2107,  2004,
        16755,  2121,  3001,  1998,  2529,  1011,  9932, 27025,  1012,  2004,
         2115,  2622,  2097,  9002,  2000,  2019,  7552,  2817,  2046,  8790,
        10699,  3404,  1999,  9932,  1010,  2478,  8559,  1011,  2066,  4275,
         1010,  2017,  2024,  3517,  2

In [12]:
from sklearn.preprocessing import MultiLabelBinarizer

# Convert the "Major" column into a list of lists (since some projects have multiple majors)
majors = df['Major'].apply(lambda x: x.split(', '))

# Use MultiLabelBinarizer to encode the majors into binary labels
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(majors)

# Print the binary labels for the first project
print(f"First project binary labels: {binary_labels[0]}")
print(f"Classes: {mlb.classes_}")


First project binary labels: [0 0 1 0]
Classes: ['Computer Science' 'Cyber Security' 'Data Science' 'Software Development']


In [16]:
# Define input_ids from padded_descriptions (it's already a tensor)
input_ids = padded_descriptions

# Ensure the input_ids have been properly set
print(f"First input id set: {input_ids[0]}")


First input id set: tensor([  101,  1996,  3800,  1997,  2023,  2470,  2003,  2000,  8556,  1996,
         2458,  1997,  3404,  2043, 21935,  2007,  2019,  7976,  4454,  1006,
         9932,  1007,  2291,  1010,  2000,  2488,  3305, 10699,  4023,  1997,
         4286,  2096, 21935,  2007,  9932,  3001,  1012,  9556,  1997,  2023,
         2817,  2097,  5335,  2256,  4824,  1997,  1996,  2536,  9646,  1997,
         3404,  1010,  1998,  1996,  3971,  1999,  2029,  3404,  8790,  3973,
        11791,  1999, 10266,  1010,  2107,  2004,  2529,  1011,  9932,  3001,
         1012,  2023,  2097,  3073, 20062,  2000,  5335,  2529,  1011,  9932,
         2836,  1998,  8122,  1999,  4101, 23521,  8518,  1010,  2107,  2004,
        16755,  2121,  3001,  1998,  2529,  1011,  9932, 27025,  1012,  2004,
         2115,  2622,  2097,  9002,  2000,  2019,  7552,  2817,  2046,  8790,
        10699,  3404,  1999,  9932,  1010,  2478,  8559,  1011,  2066,  4275,
         1010,  2017,  2024,  3517,  2000,  

In [17]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets (80% training, 20% validation)
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids, binary_labels, test_size=0.2, random_state=42
)

# Print the size of the training and validation sets
print(f"Training set size: {len(train_inputs)}")
print(f"Validation set size: {len(val_inputs)}")


Training set size: 25
Validation set size: 7


In [18]:
# Create attention masks: 1 for real tokens, 0 for padding tokens
attention_masks = [[int(token_id > 0) for token_id in input_id] for input_id in input_ids]

# Split the attention masks as well into training and validation sets
train_masks, val_masks = train_test_split(attention_masks, test_size=0.2, random_state=42)

# Print the first training mask to verify
print(f"First training mask: {train_masks[0]}")


First training mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [19]:
# Convert all inputs, labels, and masks into torch tensors
train_inputs = torch.tensor(train_inputs)
val_inputs = torch.tensor(val_inputs)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)

# Check the tensor sizes
print(f"Training input tensor shape: {train_inputs.shape}")
print(f"Validation input tensor shape: {val_inputs.shape}")


Training input tensor shape: torch.Size([25, 512])
Validation input tensor shape: torch.Size([7, 512])


  train_inputs = torch.tensor(train_inputs)
  val_inputs = torch.tensor(val_inputs)


In [20]:
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler

# Combine the training inputs, masks, and labels into a TensorDataset
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Use DataLoader to handle batches of data during training
batch_size = 8

# Create DataLoaders for the training and validation datasets
train_dataloader = DataLoader(
    train_data,  # Training data
    sampler=RandomSampler(train_data),  # Random sampling for training
    batch_size=batch_size  # Batch size
)

val_dataloader = DataLoader(
    val_data,  # Validation data
    sampler=SequentialSampler(val_data),  # Sequential sampling for validation
    batch_size=batch_size  # Batch size
)

# Print the first batch of training data to verify
for batch in train_dataloader:
    print(batch)
    break  # Just print the first batch


[tensor([[ 101, 2195, 2330,  ...,    0,    0,    0],
        [ 101, 2023, 2470,  ...,    0,    0,    0],
        [ 101, 2470, 3291,  ...,    0,    0,    0],
        ...,
        [ 101, 3151, 3274,  ...,    0,    0,    0],
        [ 101, 2470, 3291,  ...,    0,    0,    0],
        [ 101, 2715, 4773,  ...,    0,    0,    0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([[0, 1, 0, 0],
        [0, 0, 0, 1],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0]])]


In [22]:
# Function to convert inputs and labels into a format Hugging Face Trainer expects
def convert_to_dataset(input_ids, attention_masks, labels):
    return [{'input_ids': input_id, 'attention_mask': attention_mask, 'labels': label}
            for input_id, attention_mask, label in zip(input_ids, attention_masks, labels)]

# Convert the training and validation data to the expected format
train_data = convert_to_dataset(train_inputs, train_masks, train_labels)
val_data = convert_to_dataset(val_inputs, val_masks, val_labels)

# Verify the format of the training data
print(f"First training example: {train_data[0]}")


First training example: {'input_ids': tensor([  101,  2195,  2330,  1011,  3120,  3934,  3298,  2715,  1011,  2154,
         2009,  5097,  1012,  2174,  1010,  2070,  2330,  1011,  3120,  3934,
         2131, 20419,  2011, 24391, 17857,  1010,  2040,  2421, 15451,  8059,
         2000,  1996,  3642,  2000, 12014,  1996,  3036,  1997,  1996,  4646,
         5198,  1012,  2023,  2622,  2097,  8556,  8107,  2005, 12329,  1996,
         2330,  1011,  3120,  4007,  1012,  2057,  2031,  2019,  1999,  1011,
         2160,  6994,  2000, 20302, 23274, 21025,  2705, 12083,  2330,  1011,
         3120,  4007,  1012,  2017,  2024,  3517,  2000,  3579,  2006,  1996,
         2206,  3314,  1024,  1011,  8556,  1996,  3906,  2006,  2330,  1011,
         3120,  4007,  1998,  2049,  3036,  1011,  2147,  2007,  1996,  2622,
        12366,  2000,  4503,  1037,  2640,  2005, 12329,  1996,  2330,  1011,
         3120,  4007,  1012,  2057,  2933,  2000,  4339,  1037,  2470,  3720,
         7851,  1996,  955

In [24]:
# Ensure the model is set for multi-label classification
model.config.problem_type = "multi_label_classification"


In [25]:
# Multi-label classification using BCEWithLogitsLoss
from torch.nn import BCEWithLogitsLoss

def compute_loss(model, inputs):
    labels = inputs.pop("labels")
    outputs = model(**inputs)
    logits = outputs.logits
    loss_fct = BCEWithLogitsLoss()
    loss = loss_fct(logits, labels.float())
    return loss


In [27]:
from transformers import Trainer
from torch.nn import BCEWithLogitsLoss

# Custom Trainer with overridden compute_loss for multi-label classification
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = BCEWithLogitsLoss()
        loss = loss_fct(logits, labels.float())  # Make sure labels are float for BCEWithLogitsLoss
        return (loss, outputs) if return_outputs else loss

# Initialize the CustomTrainer with the correct parameters
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,  # Add the data collator
)

# Start the training process
trainer.train()


Step,Training Loss,Validation Loss


TrainOutput(global_step=12, training_loss=0.6879624327023824, metrics={'train_runtime': 18.6871, 'train_samples_per_second': 4.013, 'train_steps_per_second': 0.642, 'total_flos': 19733683507200.0, 'train_loss': 0.6879624327023824, 'epoch': 3.0})

In [31]:
import torch

# Check if MPS (Apple Silicon) is available, otherwise use CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Move the model to the appropriate device
model.to(device)

# Example project description with triple quotes for multi-line string
project_description = """
Research Problem
Porphyry deposits are significant sources of copper, gold, and other metals, 
and their identification is crucial for mineral exploration. Traditional methods
of detecting these deposits from magnetic data are often labor-intensive and
subject to human error. This research aims to leverage object detection
techniques to automate and improve the accuracy of identifying porphyry
deposits from magnetic property images.

Aims
1. Develop a Methodology: Establish a systematic approach for
analyzing magnetic property images to identify porphyry deposits.
2. Implement Object Detection Algorithms: Apply and optimize object
detection models to recognize geological features indicative of
porphyry deposits.
3. Evaluate Model Performance: Assess the accuracy and efficiency of
different object detection algorithms in identifying porphyry deposits.

Method
1. Understanding Magnetic Properties: Identify magnetic
characteristics associated with porphyry deposits and study their
regional variations.
2. Data Acquisition and Preprocessing: Source high-quality magnetic
property images and perform necessary preprocessing to clean and
enhance these images for analysis.
3. Feature Extraction: Extract and quantify key features or patterns in
the magnetic property images that indicate the presence of porphyry
deposits.
4. Object Detection Algorithms: Test and optimize object detection
algorithms (e.g., YOLO, Faster R-CNN) for identifying geological
features.
5. Model Training and Validation: Split data into training, validation,
and test sets, and ensure robust model training and validation.
6. Model Performance Evaluation: Use metrics such as precision, recall,
F1 score, and Intersection over Union to evaluate model
performance.
7. Challenges and Limitations: Identify main challenges and address
them through methodological or technological improvements.
8. Practical Applications: Integrate the developed methodology into
existing geological exploration workflows and highlight potential
benefits.

Expected Outputs
1. A robust methodology for analyzing magnetic property images to
identify porphyry deposits.
2. Optimized object detection algorithms tailored for geological feature
identification.
3. Comprehensive performance evaluation of various object detection
models.
4. Practical guidelines for integrating the methodology into geological
exploration workflows.
5. Identification of challenges and proposed solutions for future
research directions.
"""

# Tokenize and prepare input
inputs = tokenizer(project_description, return_tensors="pt", padding="max_length", truncation=True, max_length=512)

# Move inputs to the correct device
inputs = {key: val.to(device) for key, val in inputs.items()}

# Model prediction
outputs = model(**inputs)
logits = outputs.logits

# Apply sigmoid to convert logits to probabilities
probabilities = torch.sigmoid(logits)

# Define threshold (0.5 is common)
predicted_labels = (probabilities > 0.5).int()

# Print predicted majors
predicted_majors = mlb.inverse_transform(predicted_labels.detach().cpu().numpy())  # Move to CPU if necessary
print(f"Predicted majors: {predicted_majors}")


Predicted majors: [('Data Science', 'Software Development')]
