In [334]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer # Tokenisation
from transformers import AutoModelForSequenceClassification # Classification
from transformers import TrainingArguments 
from transformers import Trainer
from sklearn.preprocessing import LabelEncoder

In [250]:
# Load the trial catalogue and the per-user ratings; both paths are relative
# to the notebook's working directory.
clinical_trial = pd.read_csv('clinical_trial.csv')
user_ratings = pd.read_csv('user_ratings.csv')
# Show the rating columns (the later merge joins the two frames on 'Condition').
user_ratings.columns


Index(['Condition', 'ID', 'Age', 'Sex', 'rating', 'location'], dtype='object')

In [251]:
clinical_trial.shape

(1028, 11)

In [252]:
# Join each user rating to the trials that share its 'Condition'.
# NOTE(review): this appears to be a many-to-many join -- the recorded result
# has 1,722,622 rows although clinical_trial has only 1,028 rows. Confirm that
# one rating per (user, condition) fanned out to every trial is intended.
ratings_with_clinical = pd.merge(user_ratings, clinical_trial, on='Condition')

In [311]:
ratings_with_clinical.columns

Index(['Condition', 'ID', 'Age', 'Sex', 'rating', 'location', 'Rank', 'NCTId',
       'BriefTitle', 'EligibilityCriteria', 'DetailedDescription', 'Keyword',
       'OverallStatus', 'LocationCity', 'StartDate', 'SeeAlsoLinkURL'],
      dtype='object')

In [257]:
ratings_with_clinical.shape

(1722622, 16)

In [258]:
# Trial x user rating matrix: one row per BriefTitle, one column per user ID.
# Duplicate (BriefTitle, ID) pairs are combined with pivot_table's default
# aggregation (mean).
pt = pd.pivot_table(
    ratings_with_clinical,
    index='BriefTitle',
    columns='ID',
    values='rating',
)

In [259]:
# A missing (trial, user) rating is treated as 0 so cosine similarity can be computed.
pt = pt.fillna(0)

In [260]:
pt

ID,1,2,3,4,5,6,7,8,9,10,...,4991,4992,4993,4994,4995,4996,4997,4998,4999,5000
BriefTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Manual Dexterity and Oculomotor Control in Schizophrenia""",0.0,0.0,3.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15 Month Study for Adults Who Have Been Diagnosed With Schizophrenia and Incarcerated,0.0,0.0,3.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20101299: Study to Evaluate the Effect of AMG 747 on Schizophrenia Negative Symptoms,0.0,0.0,3.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20110165: Study to Evaluate the Effect of AMG 747 on Schizophrenia Negative Symptoms (Study 165),0.0,0.0,3.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"24-Week Open Label Extension to a Randomized, 6-Week Double Blind, Placebo Controlled Study, to Evaluate the Safety and Tolerability of Flexible Doses of Extended Release OROS® Paliperidone in the Treatment of Geriatric Subjects With Schizophrenia",0.0,0.0,3.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rTMS Treatment for Positive and Negative Symptoms of Schizophrenia,0.0,0.0,3.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rTMS for Adults With Autistic Spectrum Disorder,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,5.0,3.0
rTMS on Appetite and Cognitive Function in Schizophrenia,0.0,0.0,3.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the Activating Consciousness Technique (Hypnosis) in General Practice,0.0,2.0,0.0,0.0,2.0,0.0,3.0,0.0,4.0,1.0,...,1.0,0.0,0.0,0.0,2.0,0.0,4.0,3.0,0.0,0.0


In [335]:
# Pairwise cosine similarity between the rows (trials) of pt.
similarity_scores = cosine_similarity(pt)

# Encode each condition name as an integer so it can serve as the class label
# in the CSV used to train the transformer classifier.
labels = clinical_trial['Condition'].tolist()
label_encoder = LabelEncoder()
clinical_trial['numeric_condition'] = label_encoder.fit_transform(labels)

# Print the column index to confirm 'numeric_condition' was added.
print(clinical_trial.columns)

# Build one free-text field per trial by joining title, description and
# keywords with ', ' (every value is cast to str first).
text_columns = ['BriefTitle', 'DetailedDescription', 'Keyword']
clinical_trial['concatenated'] = clinical_trial[text_columns].apply(
    lambda row: ', '.join(row.astype(str)),
    axis=1,
)


Index(['Rank', 'NCTId', 'BriefTitle', 'Condition', 'EligibilityCriteria',
       'DetailedDescription', 'Keyword', 'OverallStatus', 'LocationCity',
       'StartDate', 'SeeAlsoLinkURL', 'numeric_condition', 'concatenated'],
      dtype='object')


In [330]:



# Numeric class id predicted by the classifier -> condition name.
# NOTE(review): this must match the label encoding used at training time;
# verify against label_encoder.classes_.
_CONDITION_BY_LABEL = {0: "Anxiety", 1: "Autism", 2: "Schizophrenia"}

# (tokenizer, model) pairs keyed by model path, so repeated recommend() calls
# do not reload the weights from disk. Re-running this cell clears the cache.
_CLASSIFIER_CACHE = {}


def _load_classifier(model_path):
    """Return the (tokenizer, model) pair for ``model_path``, loading it on first use."""
    if model_path not in _CLASSIFIER_CACHE:
        # The tokenizer comes from the base checkpoint; the fine-tuned weights
        # (trained in code_bert.ipynb) come from model_path.
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        _CLASSIFIER_CACHE[model_path] = (tokenizer, model)
    return _CLASSIFIER_CACHE[model_path]


def recommend(input, model_path="/Users/ankit/Downloads/Ideathon/trial_Recommender", top_n=5):
    """Recommend clinical trials for a free-text description of a condition.

    The text is classified into a condition by the fine-tuned model, the first
    trial of that condition in ``ratings_with_clinical`` is used as the seed,
    and the trials whose rating vectors are most cosine-similar to the seed
    are returned.

    Parameters
    ----------
    input : str
        Free-text query. (The name shadows the builtin but is kept so existing
        callers keep working.)
    model_path : str, optional
        Directory of the fine-tuned sequence-classification model. Defaults to
        the path originally hard-coded in this function.
    top_n : int, optional
        Maximum number of trials to return (default 5).

    Returns
    -------
    list[list]
        One ``[BriefTitle, DetailedDescription, Keyword, StartDate]`` list per
        recommended trial, most similar first. Empty when the predicted label
        has no known condition or no trial matches that condition.
    """
    import torch  # local import: only needed to disable gradient tracking

    tokenizer, model = _load_classifier(model_path)

    # Tokenize the query text.
    tokenized_text = tokenizer(input,
                               truncation=True,
                               is_split_into_words=False,
                               return_tensors='pt')

    # Run the classifier; the predicted class is the index of the largest
    # logit (0, 1 or 2 for the three conditions mapped above).
    with torch.no_grad():
        outputs = model(**tokenized_text)
    predicted_label_value = outputs.logits.argmax(-1).item()

    condition_name = _CONDITION_BY_LABEL.get(predicted_label_value, "Unknown")

    # Seed trial: first trial title recorded for the predicted condition.
    seed_titles = ratings_with_clinical.loc[
        ratings_with_clinical['Condition'] == condition_name, 'BriefTitle'
    ]
    if seed_titles.empty:
        # Unknown label or no trials for this condition: nothing to recommend
        # (previously this raised IndexError).
        return []
    trial_value = seed_titles.values[0]

    # Row position of the seed trial in the pivot table / similarity matrix.
    seed_positions = np.where(pt.index == trial_value)[0]
    if len(seed_positions) == 0:
        return []
    index = seed_positions[0]

    # Rank every other trial by similarity to the seed. The seed is excluded
    # by position rather than by dropping the first sorted element, because
    # trials with identical rating vectors tie and the seed is then not
    # guaranteed to sort first.
    similar_items = sorted(
        ((position, score)
         for position, score in enumerate(similarity_scores[index])
         if position != index),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    data = []
    for position, _score in similar_items:
        rows = ratings_with_clinical[ratings_with_clinical['BriefTitle'] == pt.index[position]]
        if rows.empty:
            continue
        # All rows share the same BriefTitle; the first one supplies the
        # trial metadata.
        first = rows.iloc[0]
        data.append([
            first['BriefTitle'],
            first['DetailedDescription'],
            first['Keyword'],
            first['StartDate'],
        ])

    return data

In [332]:
# Smoke-test the recommend function with a free-text query.
# NOTE(review): the recorded output for this schizophrenia query lists trials
# whose titles and keywords mention Anxiety -- confirm that the label-to-condition
# mapping in recommend() matches the encoding used to train the model.

recommend('I am having Schizophrenia issues')

[['A North-American Eight-week Study to Evaluate the Efficacy and Safety of Saredutant in Patients With Generalized Anxiety Disorder',
  '0',
  'Anxiety|clinical trials',
  'October 2006'],
 ['A Pilot RCT on the Efficacy of TranS-C Intervention on Anxiety Symptoms',
  '0',
  'Anxiety|Insomnia|Transdiagnostic Sleep and Circadian Treatment|randomised controlled trial|TranS-C|Transdiagnostic Sleep intervention',
  'September 1, 2022'],
 ['A Pilot Study to Determine a Candidate Protocol for Transcranial Electrical Stimulation in the Treatment of Anxiety',
  'The investigators examine the efficacy of two different transcranial direct current stimulation (tDCS) stimulation protocols in the treatment of anxiety in an open-label pilot study in patients with chronic pain. The investigators also seek to detect autonomic nervous system changes induced by the tDCS, and develop new methods for the measurement of autonomic nervous system functions.',
  '0',
  'December 2014'],
 ['A Pragmatic Trial o

In [77]:
import pickle

In [336]:
# Serialize the artifacts to .pkl files so the Flask front-end can load them.
# NOTE(review): the recorded run of this cell failed with
# "OSError: [Errno 28] No space left on device"; ratings_with_clinical has
# 1,722,622 rows and is presumably the largest of these files.
artifacts = {
    'pt.pkl': pt,
    'ratings_with_clinical.pkl': ratings_with_clinical,
    'similarity_scores.pkl': similarity_scores,
    'clinical_trial.pkl': clinical_trial,
}

for filename, artifact in artifacts.items():
    # The context manager closes (and flushes) each file even if dump() raises.
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)

OSError: [Errno 28] No space left on device

Index(['Rank', 'NCTId', 'BriefTitle', 'Condition', 'EligibilityCriteria',
       'DetailedDescription', 'Keyword', 'OverallStatus', 'LocationCity',
       'StartDate', 'SeeAlsoLinkURL', 'numeric_condition'],
      dtype='object')


In [268]:
clinical_trial.shape

(1028, 13)

In [269]:
# Keep only the model input text and its numeric class label, then write
# them to CSV without the index column.
training_columns = ['concatenated', 'numeric_condition']
selected_columns = clinical_trial[training_columns]
selected_columns.to_csv('selected_columns.csv', index=False)

In [271]:
# 'selected_columns.csv' is the training file for the BERT-based model.
selected_columns.columns

Index(['concatenated', 'numeric_condition'], dtype='object')