### Log in to Hugging Face to Access the Fine-Tuned Model

In [None]:
import os

from huggingface_hub import login

# Read the access token from the HF_TOKEN environment variable so the secret
# does not have to be stored in the notebook itself. The literal below is only
# a placeholder fallback: set HF_TOKEN (or replace the placeholder) before
# running this cell.
login(os.environ.get("HF_TOKEN", "hugging_face_token_holder"))

### Importing Necessary Tools

In [None]:
# For text extraction
import requests 
import numpy as np
import pandas as pd
import re

# For loading and running the LLM
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

### Text Extraction Using the ClinicalTrials.gov API

In [None]:
# Helper functions!

def get_api_url(NCT_Number):
    """Return the ClinicalTrials.gov v2 study endpoint for ``NCT_Number``.

    ``NCT_Number`` must be a ``str``; it is appended to the base URL as-is.
    """
    studies_endpoint = "https://clinicaltrials.gov/api/v2/studies/"
    return studies_endpoint + NCT_Number

def get_brief_title(data):
    """Return the study's brief title prefixed with ``"Brief Title: "``.

    Returns ``None`` when any key on the path
    ``protocolSection -> identificationModule -> briefTitle`` is missing.
    """
    try:
        identification = data['protocolSection']['identificationModule']
        title = identification['briefTitle']
    except KeyError:
        return None
    return "Brief Title: " + title
        
def get_official_title(data):
    """Return the study's official title prefixed with ``"Official Title: "``.

    Returns ``None`` when any key on the path
    ``protocolSection -> identificationModule -> officialTitle`` is missing.
    """
    try:
        identification = data['protocolSection']['identificationModule']
        title = identification['officialTitle']
    except KeyError:
        return None
    return "Official Title: " + title

def get_brief_summary(data):
    """Return the study's brief summary prefixed with ``"Brief Summary: "``.

    Returns ``None`` when any key on the path
    ``protocolSection -> descriptionModule -> briefSummary`` is missing.
    """
    try:
        description = data['protocolSection']['descriptionModule']
        summary = description['briefSummary']
    except KeyError:
        return None
    return "Brief Summary: " + summary

def get_detailed_description(data):
    """Return the detailed description prefixed with ``"Detailed Description: "``.

    Returns ``None`` when any key on the path
    ``protocolSection -> descriptionModule -> detailedDescription`` is missing.
    """
    try:
        description = data['protocolSection']['descriptionModule']
        details = description['detailedDescription']
    except KeyError:
        return None
    return "Detailed Description: " + details

def get_eligibility_criteria(data):
    """Return the eligibility criteria prefixed with ``"Eligibility Criteria: "``.

    Returns ``None`` when any key on the path
    ``protocolSection -> eligibilityModule -> eligibilityCriteria`` is missing.
    """
    try:
        eligibility = data['protocolSection']['eligibilityModule']
        criteria = eligibility['eligibilityCriteria']
    except KeyError:
        return None
    return "Eligibility Criteria: " + criteria

def get_conditions(data):
    """Return the study's conditions, space-joined, prefixed with ``"Conditions: "``.

    Returns ``None`` when any key on the path
    ``protocolSection -> conditionsModule -> conditions`` is missing.
    """
    try:
        condition_items = data['protocolSection']['conditionsModule']['conditions']
        joined = " ".join(condition_items)
    except KeyError:
        return None
    return "Conditions: " + joined

# Section extractors, in the order get_text applies them to a study record.
get_func_list = [
    get_brief_title,
    get_official_title,
    get_brief_summary,
    get_detailed_description,
    get_eligibility_criteria,
    get_conditions,
]


# Main text function
def get_text(NCT_Number, timeout=30):
    """Fetch a study from ClinicalTrials.gov and flatten it into labelled text.

    Args:
        NCT_Number: Study identifier as a ``str``; ``get_api_url`` appends it
            to the API base URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The non-empty result of every function in ``get_func_list``, each
        followed by a newline, concatenated in list order. Returns ``None``
        when the request fails, the response body is not valid JSON, or no
        extractor produced any text.
    """
    api_url = get_api_url(NCT_Number)

    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError on 4xx/5xx responses
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

    try:
        data = response.json()
    except ValueError:
        print(f"Invalid JSON for NCT {NCT_Number}")
        return None

    sections = []
    for get_function in get_func_list:
        # Best effort: one failing extractor must not discard the other sections.
        try:
            get_res = get_function(data)
            if get_res:
                sections.append(get_res + "\n")
        except Exception as e:
            print(f"Error in {get_function.__name__}: {e}")

    full_text = "".join(sections)
    return full_text if full_text.strip() else None

### Importing Model from Hugging Face

In [None]:
# Hugging Face Hub repository id of the fine-tuned classification checkpoint.
model_name = "xpsloan/bct-dcis-prototype-longformer-model"
# Load the tokenizer and the sequence-classification model published under that id.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Switch the model to evaluation mode before it is used for prediction.
model.eval()

### Function for prediction **FROM** NCT Number

In [None]:
def predict_dcis_from_NCT(NCTNumber: str):
    """Classify the study identified by ``NCTNumber`` as DCIS or not.

    The study text is retrieved with ``get_text`` and classified by
    ``predict_dcis``, so both entry points share one inference path.

    Args:
        NCTNumber: Study identifier passed to ``get_text``.

    Returns:
        ``"DCIS"`` or ``"Not DCIS"``.

    Raises:
        ValueError: If ``get_text`` returned ``None`` (request failed, invalid
            JSON, or no extractable text), since there is nothing to classify.
    """
    text = get_text(NCTNumber)
    if text is None:
        # Fail with a clear message instead of passing None to the tokenizer.
        raise ValueError(f"No text could be retrieved for NCT number {NCTNumber!r}")
    return predict_dcis(text)

In [None]:
def predict_dcis(text: str):
    """Classify ``text`` with the loaded model and return its label.

    The text is tokenized with truncation at 4096 tokens, run through the
    module-level ``model`` with gradients disabled, and the highest-scoring
    class id is mapped to ``"Not DCIS"`` (0) or ``"DCIS"`` (1).
    """
    label_names = {0: "Not DCIS", 1: "DCIS"}

    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=4096,
    )

    # Inference only: no gradient bookkeeping is needed.
    with torch.no_grad():
        logits = model(**encoded).logits
        class_probs = torch.softmax(logits, dim=-1)
        best_class = torch.argmax(class_probs, dim=-1).item()

    return label_names[best_class]

### *DEMO*

In [None]:
# Load the demo CSV from the Kaggle input directory into a DataFrame.
demo_df = pd.read_csv("/kaggle/input/stagepredicteddata/Data With Stage Prediction.csv")

def get_formatted_prediction(prediction):
    """Extract every ``"Stage <I/V numeral>"`` mention and join them with ``"; "``.

    Args:
        prediction: Raw prediction text. Non-string values (for example the
            ``None``/NaN placeholders pandas uses for missing cells) are
            treated as containing no stage mentions.

    Returns:
        The matches in order of appearance separated by ``"; "``, or an empty
        string when there are none.
    """
    # re.findall raises TypeError on non-strings, which would abort a
    # column-wide .apply() on the first missing value.
    if not isinstance(prediction, str):
        return ""
    return "; ".join(re.findall(r"Stage [IV]+", prediction))

# Reduce each stage prediction to its "; "-separated "Stage ..." mentions.
demo_df["Predicted Patient Stage"] = demo_df["Predicted Patient Stage"].apply(get_formatted_prediction)
# Rename the text column; the prediction cell below reads it as "Raw Text".
demo_df = demo_df.rename(columns={"raw_text":"Raw Text"})

In [None]:
# Classify every row's raw text; predict_dcis is called once per row.
# NOTE(review): assumes every "Raw Text" value is a string — confirm there are no missing values.
demo_df["Predicted DCIS"] = demo_df["Raw Text"].apply(predict_dcis)

In [None]:
# Write the augmented table to the Kaggle working directory, omitting the index column.
demo_df.to_csv("/kaggle/working/my_modified_dataset_dcis.csv", index=False)