In [21]:
import os
import pandas as pd
from docx import Document
import time
from langchain_community.chat_models import ChatOllama
from langchain.schema import SystemMessage, HumanMessage
import json
import re
import xml.etree.ElementTree as ET
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import classification_report, confusion_matrix
from concurrent.futures import ThreadPoolExecutor, as_completed

# Dataset Creation

In [2]:
def extract_data_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    id_match = re.search(r'<ID:(\d+)>', text)
    title_match = re.search(r'Title:\s*(.+)', text)
    abstract_match = re.search(r'Abstract:\s*(.+)', text, re.DOTALL)

    paper_id = id_match.group(1).strip() if id_match else None
    title = title_match.group(1).strip() if title_match else None
    abstract = abstract_match.group(1).strip().replace('\n', ' ') if abstract_match else None

    return paper_id, title, abstract

In [3]:
def build_dataset(cancer_dir, non_cancer_dir, output_csv="cancer_dataset.csv"):
    data = []

    for folder, label in [(cancer_dir, "Cancer"), (non_cancer_dir, "Non-cancer")]:
        for file_name in os.listdir(folder):
            if file_name.endswith(".txt"):
                file_path = os.path.join(folder, file_name)
                paper_id, title, abstract = extract_data_from_file(file_path)

                if paper_id and title and abstract:
                    data.append({
                        "id": paper_id,
                        "title": title,
                        "abstract": abstract,
                        "class": label
                    })
                else:
                    print(f"Skipped: {file_name} due to missing fields")

    df = pd.DataFrame(data)
    df.to_csv(output_csv, index=False)
    print(f"CSV created: {output_csv} with {len(df)} records.")
    return df

In [4]:
cancer_folder = "Cancer/"
non_cancer_folder = "Non-cancer"
output_csv_path = "combined_articles.csv"

In [5]:
data = build_dataset(cancer_folder, non_cancer_folder, output_csv_path)

CSV created: combined_articles.csv with 1000 records.


In [6]:
data

Unnamed: 0,id,title,abstract,class
0,30872385,Comparison of methodologies for the detection ...,AIMS: BRAF V600E detection assists in the diag...,Cancer
1,30873683,Tumour biomarkers-Tracing the molecular functi...,"In recent years, with the increase in cancer m...",Cancer
2,30874851,"Pomalidomide, cyclophosphamide, and dexamethas...",Pomalidomide dexamethasone is a standard of ca...,Cancer
3,30875581,Aggressive variants of prostate cancer - Are w...,"Recently, adoption of novel drugs for systemic...",Cancer
4,30875950,"Circulating Tumour Cells (CTC), Head and Neck ...",Head and neck cancer is the seventh most commo...,Cancer
...,...,...,...,...
995,38623902,[Not Available].,Effective longitudinal biomarkers that track d...,Non-cancer
996,38640937,Mechanisms and management of loss of response ...,We sought to report the effectiveness of infli...,Non-cancer
997,38642556,Modification of coronary artery disease clinic...,The extent to which the relationships between ...,Non-cancer
998,38650020,Meta-analysis of the global distribution of cl...,CYP2C8 is responsible for the metabolism of 5%...,Non-cancer


# Data cleaning

In [1]:

# from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("combined_articles.csv")  # Update with your path
df = df.rename(columns={"class": "label"})  # Ensure column is named 'label'

In [3]:
df['text'] = df['title'].fillna('') + "\n\n" + df['abstract'].fillna('')

In [4]:
sampled_cancer = df[df['label'] == 'Cancer'].sample(n=5, random_state=42)
sampled_non_cancer = df[df['label'] == 'Non-cancer'].sample(n=5, random_state=42)

In [5]:
balanced_sample = pd.concat([sampled_cancer, sampled_non_cancer]).sample(frac=1, random_state=42).reset_index(drop=True)

In [6]:
print(balanced_sample['label'].value_counts())

label
Non-cancer    5
Cancer        5
Name: count, dtype: int64


# LLM Setup

In [17]:
def call_llm(text):

    """
    Uses LLM to modify the associated chunk based on the instruction.
    """
    n_ctx = 16384
    llm = ChatOllama(model="llama3.1:8b-instruct-q8_0", temperature=0.7, num_ctx=n_ctx)
    
    system_prompt = SystemMessage(content="""
    You are an expert in disease identification based on your training data. 
    Read given content and take action as per given query and apply all instructions given by user.
    If you don't know the answer, just say that you don't know.
    Strictly do not add any commentary, explanation, or additional words from your side.
    """)
    user_message=HumanMessage(content=f'''I have only two classes Cancer and Non-cancer, So for given text to you ,classify them in either of these 2 classes only, Make sure there is no variation of these 2 classes.
        Text : {text}     
        Answer : 
        ''')

    response = llm([system_prompt, user_message])

    # res_resp = restructure_llm(reference,ref_style)
    result = response.content.strip()
    # import ipdb;ipdb.set_trace()
    # print(f"LLm Response{result}")
    return result




In [124]:
def parallel_call_llm(texts, max_threads=10):
    results = [None] * len(texts)

    def task(index, text):
        result = call_llm(text)
        results[index] = result

    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = [executor.submit(task, i, text) for i, text in enumerate(texts)]
        for _ in as_completed(futures):
            pass  # we don't need to do anything here, just wait

    return results

In [125]:
def llm_handler(df, max_threads=10):
    texts = df['text'].tolist()
    df['predicted_label'] = parallel_call_llm(texts, max_threads=max_threads)
    return df

In [126]:
def normalize_prediction(label):
    label = label.strip().lower()
    if "non-cancer" in label:
        return "Non-cancer"
    elif "cancer" in label:
        return "Cancer"
    else:
        return "Unknown"

In [None]:
# tqdm.pandas(desc="Classifying")
response_data = llm_handler(df)

In [123]:
response_data['predicted_label'] = response_data['predicted_label'].apply(normalize_prediction)


NameError: name 'response_data' is not defined

In [36]:
data['predicted_label'].value_counts()

predicted_label
Cancer        769
Non-cancer    231
Name: count, dtype: int64

# Classification Reports

In [37]:
report = classification_report(data['label'], data['predicted_label'])

In [38]:
print("\nClassification Report:\n")
print(report)


Classification Report:

              precision    recall  f1-score   support

      Cancer       0.61      0.94      0.74       500
  Non-cancer       0.87      0.40      0.55       500

    accuracy                           0.67      1000
   macro avg       0.74      0.67      0.65      1000
weighted avg       0.74      0.67      0.65      1000



In [39]:
print("Confusion Matrix:")
print(confusion_matrix(data['label'], data['predicted_label']))

Confusion Matrix:
[[471  29]
 [298 202]]


In [41]:
data.to_csv("baseline_predictions.csv", index=False)
print("\n Results saved as 'baseline_predictions.csv'")


 Results saved as 'baseline_predictions.csv'


# Explaination
### Why Llama

#### 1. Choosing LLama as its decent in identification bit, I can reason and can maintain Chain of thought to figure out best result based on context         provide.
#### 2. Low hallucinations makes it perfect for this task.

### Use of Ollama and Langchain

#### 1. Useing Ollama is absolute no-brainer for me as it can be hosted locally quite easily.

#### 2. While Huggingface provides number of benifits, due to lack of time here, exploring those and hosting on spaces was not possible here.

### Results Matrix

#### 1. From results and with simplest prompt LLM was able to detect and resport back decent results with 67% overall accuracy.

#### 2. This is done with assumption that Abstrct given and labels for them given in Problem Statement are in sync and thus are ground truth for me.

### Self-Inputs

#### 1. With little more time and some heurastical approach acuracy of LLM can be increased. 

# Fine Tuning

# Intial Thoughts
#### 1. Doing fine tuning of any llm is time consuming and needs decent infra(GPU,memory), both not available with me

#### 2. Choosing a sentence classifier might do more good than choosing llm fine tuning.(May be i am wrong here, but due to lack of time i am choosing Bert Model for fine tuning)

#### 3. Bert base uncased model is really fast in fine tuning and inference is super quick. For desease identification task this will be perfect.

In [62]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from datasets import Dataset, DatasetDict
import numpy as np
from transformers import BertForSequenceClassification

In [63]:
df = pd.read_csv("combined_articles.csv")  # Update with your path
df['text'] = df['title'].fillna('') + "\n\n" + df['abstract'].fillna('')

In [64]:
df['class'].value_counts()

class
Cancer        500
Non-cancer    500
Name: count, dtype: int64

In [65]:
df['class'] = df['class'].replace({
    'Cancer': 1,
    'Non-cancer': 0
})

  df['class'] = df['class'].replace({


In [66]:
df = df[df['class'].isin([0, 1])]

In [67]:
train_df, test_df = train_test_split(df, test_size=0.3, stratify=df['class'], random_state=42)

In [81]:
train_df = train_df.rename(columns={'class': 'labels'})
test_df = test_df.rename(columns={'class': 'labels'})

In [82]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [83]:
def tokenize(example):
    return tokenizer(example['text'], truncation=True, padding="max_length", max_length=128)

In [84]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [85]:
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
}).map(tokenize, batched=True)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 700/700 [00:02<00:00, 305.29 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 309.41 examples/s]


In [86]:
dataset = dataset.remove_columns(['text', '__index_level_0__'])

In [87]:
dataset.set_format('torch')

In [88]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [89]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    report = classification_report(labels, preds, output_dict=True)
    return {
        'accuracy': report['accuracy'],
        'precision_cancer': report['1']['precision'],
        'recall_cancer': report['1']['recall'],
        'f1_cancer': report['1']['f1-score']
    }

In [90]:
training_args = TrainingArguments(
    output_dir="./bert-results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

In [91]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
)

In [92]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision Cancer,Recall Cancer,F1 Cancer
1,No log,0.201244,0.94,0.958333,0.92,0.938776
2,No log,0.162605,0.95,0.972028,0.926667,0.948805
3,No log,0.142985,0.966667,0.972973,0.96,0.966443


TrainOutput(global_step=132, training_loss=0.2220440199880889, metrics={'train_runtime': 37.8345, 'train_samples_per_second': 55.505, 'train_steps_per_second': 3.489, 'total_flos': 138133304064000.0, 'train_loss': 0.2220440199880889, 'epoch': 3.0})

In [93]:
predictions = trainer.predict(dataset['test'])
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

In [94]:
print("\nClassification Report:\n", classification_report(y_true, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.97       150
           1       0.97      0.96      0.97       150

    accuracy                           0.97       300
   macro avg       0.97      0.97      0.97       300
weighted avg       0.97      0.97      0.97       300


Confusion Matrix:
 [[146   4]
 [  6 144]]


# Save Model

In [95]:
trainer.save_model("bert-cancer-classifier")

In [112]:
torch.save(model.state_dict(), "bert-cancer-classifier.pth")

# Inference Pipeline

In [97]:
from transformers import pipeline

In [108]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [98]:
# classifier = pipeline("text-classification", model="bert-cancer-classifier", tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'LABEL_1', 'score': 0.9974588751792908}]


In [116]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [117]:
model.load_state_dict(torch.load("bert-cancer-classifier.pth"))

  model.load_state_dict(torch.load("bert-cancer-classifier.pth"))


<All keys matched successfully>

In [118]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [119]:
def predict(text, tokenizer, model):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    
    # Tokenize and move inputs to the same device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    return "Cancer" if predicted_class == 1 else "Non-cancer"

In [120]:
text = "Pathogenic heterozygous mutations in the progranulin gene (GRN) are a key cause of frontotemporal dementia (FTD), leading to significantly reduced biofluid concentrations of the progranulin protein (PGRN). This has led to a number of ongoing therapeutic trials aiming to treat this form of FTD by increasing PGRN levels in mutation carriers. However, we currently lack a complete understanding of factors that affect PGRN levels and potential variation in measurement methods. Here, we aimed to address this gap in knowledge by systematically reviewing published literature on biofluid PGRN concentrations. Published data including biofluid PGRN concentration, age, sex, diagnosis and GRN mutation were collected for 7071 individuals from 75 publications. The majority of analyses (72%) had focused on plasma PGRN concentrations, with many of these (56%) measured with a single assay type (Adipogen) and so the influence of mutation type, age at onset, sex, and diagnosis were investigated in this subset of the data. We established a plasma PGRN concentration cut-off between pathogenic mutation carriers and non-carriers of 74.8Â ng/mL using the Adipogen assay based on 3301 individuals, with a CSF concentration cut-off of 3.43Â ng/mL. Plasma PGRN concentration varied by GRN mutation type as well as by clinical diagnosis in those without a GRN mutation. Plasma PGRN concentration was significantly higher in women than men in GRN mutation carriers (pâ€‰=â€‰0.007) with a trend in non-carriers (pâ€‰=â€‰0.062), and there was a significant but weak positive correlation with age in both GRN mutation carriers and non-carriers. No significant association was seen with weight or with TMEM106B rs1990622 genotype. However, higher plasma PGRN levels were seen in those with the GRN rs5848 CC genotype in both GRN mutation carriers and non-carriers. These results further support the usefulness of PGRN concentration for the identification of the large majority of pathogenic mutations in the GRN gene. Furthermore, these results highlight the importance of considering additional factors, such as mutation type, sex and age when interpreting PGRN concentrations. This will be particularly important as we enter the era of trials for progranulin-associated FTD."

In [121]:
print(predict(text, tokenizer, model))

Non-cancer


# Final Explainations

#### 1. I have treained on 70% dataset and kept 30% for creaying inferences.

#### 2. Looking into classification of Cancer vs Non-cancer, model aced it with 97% accuracy on limited 300 data points.