In [2]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import re 

In [None]:
def remove_non_relevant_columns(csv_file):
    """Load a studies CSV and drop the columns not used by later steps.

    Args:
        csv_file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A new DataFrame without the columns listed below. Listed columns
        that are absent from the file are skipped instead of raising
        ``KeyError``, so partial exports can still be loaded.
    """
    df = pd.read_csv(csv_file)

    # NOTE(review): 'Brief Summary' is removed here, so any later step that
    # only runs when 'Brief Summary' is present (e.g. the BioBERT
    # classification) will find nothing to do — confirm this is intended.
    columns_to_drop = [
        'NCT Number',
        'Study Title',
        'Study URL',
        'Brief Summary',  # Assuming we are classifying this column with BioBERT
        'Sponsor',
        'Collaborators',
        'Sex',
        'Age',
        'Locations',
    ]

    # errors='ignore' drops only the labels that actually exist in the frame.
    return df.drop(columns=columns_to_drop, errors='ignore')

def calculate_study_duration(df):
    """Add a 'Study Duration (days)' column computed from the study dates.

    'Start Date' and 'Completion Date' are converted to datetimes on the
    given frame (values that cannot be parsed become NaT), and the number of
    whole days between them is stored in 'Study Duration (days)'.

    Args:
        df: DataFrame containing 'Start Date' and 'Completion Date' columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    # Parse both date columns; errors='coerce' turns bad values into NaT.
    for date_column in ('Start Date', 'Completion Date'):
        df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

    # Timedelta between completion and start, reduced to whole days.
    elapsed = df['Completion Date'] - df['Start Date']
    df['Study Duration (days)'] = elapsed.dt.days

    return df

def standardize_text(text):
    """Normalize a text value for matching.

    Strings are lowercased, stripped of every character that is not a
    lowercase letter, digit or whitespace, and then have each run of
    whitespace collapsed to a single space. Non-string values (e.g. NaN,
    numbers, None) are returned unchanged.

    Args:
        text: Any value; only ``str`` instances are transformed.

    Returns:
        The normalized string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    text = text.lower()
    # Strip special characters BEFORE collapsing whitespace: removing a
    # character that sits between two spaces ("a - b") would otherwise
    # leave a double space in the result.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return re.sub(r'\s+', ' ', text)

def standardize_text_columns(df):
    """Run ``standardize_text`` over every object-dtype column of ``df``.

    Each object column is replaced on the given frame with its
    element-wise standardized version; other columns are left alone.

    Args:
        df: DataFrame whose object-dtype columns should be normalized.

    Returns:
        The same DataFrame object, modified in place.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    for name in object_columns:
        normalized = df[name].apply(standardize_text)
        df[name] = normalized
    return df

# Category -> substrings that identify it. Categories are tested in the order
# written here and the first category with a matching substring wins.
_CONDITION_KEYWORDS = {
    'squamous cell': ['cell lung', 'head and neck', 'squamous cell', 'small cell', 'lung', 'keratosis', 'nsclc'],
    'myeloma': ['myeloma'],
    'sarcoma': ['sarcoma'],
    'lymphoma': ['lymphoma', 'lymphoid'],
    'brain': ['brain cancer', 'glioblastoma'],
    'melanoma': ['melanoma'],
    'adeno': ['adenocarcinoma', 'prostate', 'pancreatic', 'breast cancer', 'colon'],
    'leukemia': ['leukemia'],
    'kidney': ['renal', 'kidney cancer', 'ccrcc'],
    'other': ['other'],
}


def map_conditions(condition):
    """Map a free-text condition to a coarse condition category.

    Matching is a case-sensitive substring test against the keyword table,
    so the input is expected to be lowercased already.

    Args:
        condition: Condition text. Non-string values (NaN, None, numbers)
            carry no text to match and are mapped to 'other' instead of
            raising ``TypeError``.

    Returns:
        The first category whose keyword occurs in ``condition``, or
        'other' when nothing matches.
    """
    if not isinstance(condition, str):
        return 'other'

    for category, keywords in _CONDITION_KEYWORDS.items():
        if any(keyword in condition for keyword in keywords):
            return category
    return 'other'

def apply_condition_mapping(df):
    """Derive 'Condition Category' from the 'Condition' column.

    When ``df`` has no 'Condition' column it is returned untouched;
    otherwise each value is passed through ``map_conditions`` and the
    result is stored in a 'Condition Category' column on the same frame.

    Args:
        df: DataFrame that may contain a 'Condition' column.

    Returns:
        The same DataFrame object.
    """
    if 'Condition' not in df.columns:
        return df

    categories = df['Condition'].apply(map_conditions)
    df['Condition Category'] = categories
    return df

def add_condition_details(df):
    """Attach per-category reference values to each row.

    For every row, the value of 'Condition Category' is looked up in a
    fixed table and three columns are written on the given frame:
    'Survival 5yr Relative', 'Max Treatment Duration' and
    'Min Treatment Duration'.

    Args:
        df: DataFrame, normally containing a 'Condition Category' column.

    Returns:
        The same DataFrame object. When 'Condition Category' is absent the
        frame is returned unchanged, matching the other pipeline steps that
        skip their work if the column they need is missing.

    Raises:
        KeyError: If a row holds a category that is not in the table.
    """
    conditions_map = {
        'squamous cell': {
            'survival_5yr_relative': 0.23,
            'max_treatment_duration': 4380,
            'min_treatment_duration': 30
        },
        'myeloma': {
            'survival_5yr_relative': 0.54,
            'max_treatment_duration': 3650,
            'min_treatment_duration': 30
        },
        'sarcoma': {
            'survival_5yr_relative': 0.65,
            'max_treatment_duration': 4380,
            'min_treatment_duration': 30
        },
        'lymphoma': {
            'survival_5yr_relative': 0.88,
            'max_treatment_duration': 3650,
            'min_treatment_duration': 30
        },
        'brain': {
            'survival_5yr_relative': 0.36,
            'max_treatment_duration': 2920,
            'min_treatment_duration': 14
        },
        'melanoma': {
            'survival_5yr_relative': 0.93,
            'max_treatment_duration': 3650,
            'min_treatment_duration': 14
        },
        'adeno': {
            'survival_5yr_relative': 0.89,
            'max_treatment_duration': 3650,
            'min_treatment_duration': 14
        },
        'leukemia': {
            'survival_5yr_relative': 0.63,
            'max_treatment_duration': 4380,
            'min_treatment_duration': 30
        },
        'kidney': {
            'survival_5yr_relative': 0.74,
            'max_treatment_duration': 3650,
            'min_treatment_duration': 14
        },
        'other': {
            'survival_5yr_relative': 0.68,
            'max_treatment_duration': 4320,
            'min_treatment_duration': 14
        }
    }

    # Without the category column there is nothing to look up.
    if 'Condition Category' not in df.columns:
        return df

    # Output column -> field of the per-category record (insertion order is
    # the order in which the columns are added to the frame).
    output_columns = {
        'Survival 5yr Relative': 'survival_5yr_relative',
        'Max Treatment Duration': 'max_treatment_duration',
        'Min Treatment Duration': 'min_treatment_duration',
    }

    categories = df['Condition Category']
    for column, field in output_columns.items():
        # field=field binds the current value; a plain closure would see
        # only the last loop value.
        df[column] = categories.apply(
            lambda category, field=field: conditions_map[category][field]
        )

    return df

def fill_missing_other_outcome_measures(df):
    """Replace missing 'Other Outcome Measures' values with a placeholder.

    Missing entries in the 'Other Outcome Measures' column are filled with
    the text 'no other measure'. A frame without that column is returned
    as is.

    Args:
        df: DataFrame that may contain an 'Other Outcome Measures' column.

    Returns:
        The same DataFrame object.
    """
    target = 'Other Outcome Measures'
    if target not in df.columns:
        return df

    filled = df[target].fillna('no other measure')
    df[target] = filled
    return df

# BioBERT checkpoint used for both the tokenizer and the classifier, so the
# two always come from the same pretrained weights.
BIOBERT_CHECKPOINT = "dmis-lab/biobert-base-cased-v1.1"

# NOTE(review): this loads a sequence-classification model from what looks
# like a base checkpoint — confirm the classification head is actually
# fine-tuned before trusting the predicted labels.
tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(BIOBERT_CHECKPOINT)

def classify_text_with_biobert(text):
    """Return the index of the highest-scoring class for ``text``.

    The text is tokenized with the module-level ``tokenizer`` and scored by
    the module-level ``model``; the argmax over the logits is returned.

    Args:
        text: The text to classify.

    Returns:
        The predicted class index as a Python ``int``.
    """
    # Truncate to the number of positions the model supports; without this
    # a long summary produces more tokens than there are position
    # embeddings and the forward pass fails.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    return torch.argmax(outputs.logits, dim=1).item()

def classify_brief_summaries(df):
    """Classify each 'Brief Summary' entry with BioBERT.

    When ``df`` has a 'Brief Summary' column, every value is passed to
    ``classify_text_with_biobert`` and the results are stored in
    'Brief Summary Classification' on the same frame. Without that column
    the frame is returned unchanged.

    Args:
        df: DataFrame that may contain a 'Brief Summary' column.

    Returns:
        The same DataFrame object.
    """
    if 'Brief Summary' not in df.columns:
        return df

    predictions = df['Brief Summary'].apply(classify_text_with_biobert)
    df['Brief Summary Classification'] = predictions
    return df

# End-to-end preprocessing run: each step below takes the frame produced by
# the previous step and hands its result to the next one.

# Path to the input CSV file
# NOTE(review): hard-coded, machine-specific Windows path — adjust before
# running on another machine.
csv_file_path = 'C:\\Users\\Spenc\\OneDrive\\Documents\\Masters Thesis\\ctg-studies.csv'

# Load the CSV and remove the non-relevant columns
df_cleaned = remove_non_relevant_columns(csv_file_path)

# Parse the date columns and compute 'Study Duration (days)'
df_with_duration = calculate_study_duration(df_cleaned)

# Drop rows where Study Duration (days) is NaN
df_with_duration = df_with_duration.dropna(subset=['Study Duration (days)'])

# Fill missing values in 'Other Outcome Measures' column
df_filled = fill_missing_other_outcome_measures(df_with_duration)

# Standardize text columns (must precede the condition mapping, which
# matches lowercase keywords)
df_standardized = standardize_text_columns(df_filled)

# Apply condition mapping (adds 'Condition Category' when a 'Condition'
# column exists)
df_mapped = apply_condition_mapping(df_standardized)

# Add condition details (reads 'Condition Category')
df_final = add_condition_details(df_mapped)

# Classify Brief Summaries with BioBERT
# NOTE(review): remove_non_relevant_columns drops 'Brief Summary' and
# classify_brief_summaries only acts when that column is present, so this
# call currently leaves df_final unchanged — confirm whether the column
# should be kept for classification.
df_final = classify_brief_summaries(df_final)

# Save the cleaned dataframe with duration, standardized text, and condition details to a new CSV file
# NOTE(review): the output path is POSIX-style ('/mnt/data') while the input
# path is a Windows path — confirm the target directory exists where this runs.
df_final.to_csv('/mnt/data/cleaned_ctg_studies_with_duration_standardized_mapped_with_details.csv', index=False)

# Display the first rows of the final DataFrame (notebook cell output)
df_final.head()
