In [3]:
import os
import random
import warnings

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def predict_extremophiles(sentences, *, threshold=0.5):
    """Score each sentence and flag those classified as extremophile-related.

    Uses the module-level ``tokenizer`` and ``model``. The whole batch is
    tokenized together (padded, and truncated to at most 512 tokens) and run
    through the model in a single forward pass with gradient tracking
    disabled.

    Args:
        sentences: Sized sequence of sentence strings to classify.
        threshold: A sentence is flagged ``True`` when its class-1
            probability is strictly greater than this value. Defaults to
            0.5, the cut-off that was previously hard-coded.

    Returns:
        A list with one ``(sentence, probability, flag)`` tuple per input
        sentence, in input order. ``probability`` is the softmax probability
        of class index 1 as a Python float; ``flag`` is
        ``probability > threshold``. An empty input yields an empty list.
    """
    # Nothing to classify: return early so the tokenizer and the model are
    # never invoked with a zero-length batch. ``len`` (rather than plain
    # truthiness) keeps this safe for any sized sequence.
    if len(sentences) == 0:
        return []

    # Tokenize and encode the sentences as one padded batch of tensors.
    inputs = tokenizer(
        sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax across the class dimension, then keep column 1 (the class this
    # function treats as "extremophile") as plain Python floats.
    positive_probabilities = torch.softmax(outputs.logits, dim=1)[:, 1].tolist()

    return [
        (sentence, probability, probability > threshold)
        for sentence, probability in zip(sentences, positive_probabilities)
    ]

# Fix the random seeds for reproducibility (Python's `random` and PyTorch).
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)

# Ask cuDNN for deterministic behaviour and turn off its benchmark mode, so
# GPU runs do not vary because of algorithm selection.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# NOTE: hash randomisation is fixed when the interpreter starts, so setting
# PYTHONHASHSEED here cannot change the running process; it is only inherited
# by child processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(random_seed)

# Suppress warning messages.
# PYTHONWARNINGS is likewise read only at interpreter start-up, so on its own
# it silences nothing in this process. Keep it for child processes and install
# the equivalent filter in-process so the suppression actually takes effect.
os.environ['PYTHONWARNINGS'] = "ignore"
warnings.filterwarnings("ignore")

# Load the pre-trained BioBERT tokenizer and sequence-classification model
# from the same checkpoint name (tokenizer first, then model).
# NOTE(review): the output recorded for this cell reports that
# 'classifier.bias' and 'classifier.weight' are not in the checkpoint and are
# newly initialized, i.e. the classification head is untrained. The
# probabilities it produces should not be relied on until the model has been
# fine-tuned on a labelled downstream task.
model_name = "monologg/biobert_v1.1_pubmed"
tokenizer, model = (
    auto_class.from_pretrained(model_name)
    for auto_class in (AutoTokenizer, AutoModelForSequenceClassification)
)

# Input CSV and the number of rows to read (and classify) per batch.
csv_path = '/kaggle/input/biobert/Task4_Secondary_Metabolites_Bacteria_Genes_Pubmed.csv'
chunk_size = 100

# Accumulates (sentence, probability, flag) tuples across all chunks.
all_predictions = []

# Stream the file in chunks so only `chunk_size` rows are held and classified
# at a time. Only the 'Sentences' column is used, so only that one is parsed.
for df_chunk in pd.read_csv(csv_path, usecols=['Sentences'], chunksize=chunk_size):
    # Blank cells are read as NaN (a float), which the tokenizer cannot
    # handle. Drop those rows and coerce the rest to str so one bad cell does
    # not abort the whole run. Rows with a missing sentence therefore do not
    # appear in the results file.
    sentences_list = df_chunk['Sentences'].dropna().astype(str).tolist()

    # A chunk can end up empty after dropping blanks; nothing to classify.
    if not sentences_list:
        continue

    # Predict extremophiles for this batch and keep the results.
    all_predictions.extend(predict_extremophiles(sentences_list))

# Convert predictions to a DataFrame with one row per classified sentence.
result_df = pd.DataFrame(all_predictions, columns=['Sentences', 'Probability', 'Verify'])

# Save the results to a new CSV file (without the DataFrame index column).
result_csv_path = '/kaggle/working/Genes_Bacteria_results.csv'
result_df.to_csv(result_csv_path, index=False)

print("Results saved to:", result_csv_path)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/biobert_v1.1_pubmed and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results saved to: /kaggle/working/Genes_Bacteria_results.csv
