In [None]:
# Imports
import pandas as pd
import nltk
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download necessary NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on newer
# NLTK releases (3.9+); 'punkt' is still requested so older releases keep
# working. An id unknown to the installed NLTK index is reported by
# nltk.download and does not abort the cell.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)


In [None]:
# Read the clinical-trials CSV into a DataFrame.
df = pd.read_csv('clinicaltrials_dataset.csv')

# Data-quality report: per-column null counts, number of fully duplicated
# rows, and the (rows, columns) shape of the loaded frame.
null_counts = df.isnull().sum()
duplicate_count = df.duplicated().sum()
print('is null ? : \n', null_counts)
print('\n is duplicated :  ', duplicate_count)
print('\ndata shape :', df.shape)

In [None]:
# Remove exact duplicate rows, drop the 'condition' column, then discard any
# row that still contains a missing value.
# Written as one non-mutating chain instead of inplace=True calls;
# errors='ignore' lets the cell be re-run after 'condition' has already been
# dropped instead of raising KeyError on the second execution.
df = (
    df.drop_duplicates()
      .drop(columns=['condition'], errors='ignore')
      .dropna()
)

In [None]:
# Shared NLP helpers used by preprocess_text:
#   stop_words - English stop words held in a set for fast membership tests
#   stemmer    - Porter stemmer instance
#   lemmatizer - WordNet lemmatizer instance
stop_words = {*stopwords.words('english')}
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [None]:
# Preprocessing functions
def preprocess_text(text, apply_remove_p=True):
    """Normalize one raw text field into a space-joined string of stemmed tokens.

    Steps, in order: strip URLs and bare domain names, collapse every run of a
    repeated character to a single character, drop every character that is
    not an ASCII letter, digit or whitespace, squeeze whitespace, remove
    digits, lowercase, tokenize, remove English stop words, lemmatize, stem.

    Relies on the module-level ``stop_words``, ``lemmatizer`` and ``stemmer``
    objects and on ``word_tokenize`` being importable.

    Args:
        text: Raw text. ``None`` and NaN yield ``''``; any other non-string
            value is converted with ``str()`` before processing.
        apply_remove_p: When true, each run of non-word characters that sits
            between two word characters is replaced by a single space.

    Returns:
        The processed tokens joined by single spaces (``''`` if none remain).
    """
    # Guard against missing / non-string cells so re.sub does not raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Replace full URLs (scheme-prefixed or tinyurl) and bare domains with a space.
    text = re.sub(
        r'(https?://|ftp://|tinyurl\.com/)\S+(\s|$)|(([a-zA-Z0-9-]+\.)+(com|org|net|gov|edu|co\.uk|co\.in|co\.[a-z]+))',
        ' ', text)
    # Second pass: delete any domain-like remainder (optionally scheme-prefixed).
    text = re.sub(r'(https?:\/\/)?([a-zA-Z0-9-]+\.)+(com|org|net|gov|edu|co\.uk|co\.in|co\.[a-z]+)', '', text)

    # Collapse every run of an identical character (any character except a
    # newline) down to one occurrence. The former extra pass with r'(\w)\1+'
    # was fully covered by this pattern and has been removed; output is unchanged.
    # NOTE(review): this also shortens legitimate double letters
    # ("blood" -> "blod") before stop-word removal and lemmatization see the
    # word. Kept as-is so output stays compatible with data already produced
    # by this function - confirm whether that is intended.
    text = re.sub(r'(.)\1+', r'\1', text)

    # Keep only ASCII letters, digits and whitespace; everything else is
    # deleted (not replaced by a space), so "heart-attack" becomes "heartattack".
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d+', '', text)

    if apply_remove_p:
        # Replace a run of non-word characters between two word characters
        # with a single space and lowercase the character that follows it.
        text = re.sub(r'(\w)(\W+)(\w)', lambda match: match.group(1) + ' ' + match.group(3).lower(), text)
    text = text.lower()

    # Punctuation -> two spaces. Raw string literal: the original non-raw
    # literal contained invalid escape sequences (\], \), \( ...) that trigger
    # warnings on current Python versions; the characters are identical.
    # At this point the step is effectively a no-op because all punctuation
    # was already stripped above.
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~|\•|\)|\("""), '  ', text)

    words = word_tokenize(text)
    words = [word for word in words if word.lower() not in stop_words]
    # Lemmatize first, then stem the lemma.
    words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in words]
    return ' '.join(words)

In [None]:
# Run preprocess_text over each free-text column and write the cleaned text
# back into the same column.
for text_column in ('title', 'summary', 'detailed_description', 'eligibility'):
    df.loc[:, text_column] = df[text_column].apply(preprocess_text)

In [None]:
# Persist the preprocessed frame to CSV, omitting the index column.
cleaned_csv_path = 'clinicaltrials_dataset_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)

In [None]:
# TF-IDF

# Load the preprocessed data. This must be the file written by the
# "Save the preprocessed dataset" cell; the previous path pointed at
# 'argsme_dataset_cleaned.csv', which this notebook never produces.
csv_file_path = 'clinicaltrials_dataset_cleaned.csv'
df = pd.read_csv(csv_file_path)

# Combine the text from the 'title', 'summary', 'detailed_description' and
# 'eligibility' columns into one document per row.
# A field that was an empty string after preprocessing is read back from the
# CSV as NaN; fill it with '' first so astype(str) does not inject a literal
# "nan" token into the TF-IDF vocabulary.
df['combined_text'] = (
    df['title'].fillna('').astype(str) + ' '
    + df['summary'].fillna('').astype(str) + ' '
    + df['detailed_description'].fillna('').astype(str) + ' '
    + df['eligibility'].fillna('').astype(str)
)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the combined text
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# Save the fitted vectorizer and the document-term matrix
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')

In [None]:
# Clustering

