<a href="https://colab.research.google.com/github/SanthoshiRavi/Healthcare-Text-Processing-/blob/main/Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
from getpass import getpass

import nltk
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
# 'punkt_tab' is included alongside 'punkt' because newer NLTK releases make
# word_tokenize load its tokenizer tables from 'punkt_tab'; without it a fresh
# kernel can fail with a LookupError the first time text is tokenized.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Function to fetch articles from PubMed
def fetch_pubmed_articles(api_key, query, max_results=100, verbose=True):
    """Search PubMed and return ``(article_id, title, abstract)`` tuples.

    Runs one ``esearch`` request for ``query`` and then, for every returned
    ID, one ``esummary`` request (title) and one ``efetch`` request (abstract
    XML).

    Args:
        api_key: NCBI E-utilities API key sent with every request.
        query: Search term; passed as a request parameter so that spaces and
            special characters are URL-encoded by ``requests``.
        max_results: Value sent as ``retmax`` to ``esearch``.
        verbose: When true (the default), print the first 500 characters of
            each abstract XML and the parsed article details.

    Returns:
        A list of ``(article_id, title, abstract)`` tuples. ``abstract`` is the
        text of all ``<AbstractText>`` sections joined by single spaces, or
        ``'No abstract available'`` when none is found. Returns ``[]`` (after
        printing an error) if the search response lacks ``'esearchresult'``.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    # Without a timeout a stalled connection would hang the cell indefinitely.
    request_timeout = 30

    search_response = requests.get(
        f"{base_url}/esearch.fcgi",
        params={
            "db": "pubmed",
            "term": query,
            "retmax": max_results,
            "retmode": "json",
            "api_key": api_key,
        },
        timeout=request_timeout,
    )
    # Parse the body once and reuse it for both the check and the lookup.
    search_data = search_response.json()

    if 'esearchresult' not in search_data:
        print("Error: 'esearchresult' not found in the API response.")
        return []

    articles = []
    for article_id in search_data['esearchresult']['idlist']:
        details_response = requests.get(
            f"{base_url}/esummary.fcgi",
            params={"db": "pubmed", "id": article_id, "retmode": "json", "api_key": api_key},
            timeout=request_timeout,
        )
        article_details = details_response.json()['result'][article_id]
        title = article_details.get('title', '')

        # Fetch the abstract separately
        abstract_response = requests.get(
            f"{base_url}/efetch.fcgi",
            params={"db": "pubmed", "id": article_id, "retmode": "xml", "api_key": api_key},
            timeout=request_timeout,
        )
        abstract_xml = abstract_response.text
        if verbose:
            print(f"Abstract XML for Article ID {article_id}: {abstract_xml[:500]}...")  # Print the first 500 chars of the abstract XML

        # Structured abstracts use tags with attributes
        # (<AbstractText Label="..." ...>) and have several sections, so the
        # pattern allows attributes and every section is collected.
        raw_sections = re.findall(
            r'<AbstractText(?:\s[^>]*)?>(.*?)</AbstractText>', abstract_xml, re.DOTALL
        )
        sections = [section.strip() for section in raw_sections if section.strip()]
        abstract = ' '.join(sections) if sections else 'No abstract available'

        if verbose:
            # Debug: Print each article's details
            print(f"Article ID: {article_id}, Title: {title}, Abstract: {abstract}")
        articles.append((article_id, title, abstract))
    return articles

# Example usage
# The NCBI API key is taken from the NCBI_API_KEY environment variable, or
# prompted for without echo, rather than being stored in the notebook where it
# would be shared with every copy of the file.
api_key = os.environ.get("NCBI_API_KEY") or getpass("NCBI API key: ")
query = "diabetes"
articles = fetch_pubmed_articles(api_key, query, max_results=100)

# Display the first 5 articles
print("\nFirst 5 articles:")
for idx, (article_id, title, abstract) in enumerate(articles[:5]):
    print(f"ID: {idx}, Article ID: {article_id}, Title: {title}, Abstract: {abstract[:200]}...")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Abstract XML for Article ID 38762728: <?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2024//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_240101.dtd">
<PubmedArticleSet>
<PubmedArticle><MedlineCitation Status="MEDLINE" Owner="NLM" IndexingMethod="Automated"><PMID Version="1">38762728</PMID><DateCompleted><Year>2024</Year><Month>05</Month><Day>18</Day></DateCompleted><DateRevised><Year>2024</Year><Month>05</Month><Day>18</Day></DateRevised><Article PubModel="Electronic"><Journal>...
Article ID: 38762728, Title: Drug release profile of a novel exenatide long-term drug delivery system (OKV-119) administered to cats., Abstract: Beneficial weight-loss properties of glucagon-like peptide-1 receptor agonists (GLP-1RA) in obese people, with corresponding improvements in cardiometabolic risk factors, are well established. OKV-119 is an investigational drug delivery system that is being developed for the long-term delivery of the GLP-1RA exe

In [None]:
# Function to preprocess text
def preprocess_text(text):
    """Clean ``text`` and return its lemmatized, stopword-free tokens as a string.

    Steps, in order: remove anything that looks like a markup tag, drop every
    character that is not an ASCII letter or whitespace, tokenize, discard
    English stopwords (compared case-insensitively; surviving tokens keep
    their original case), lemmatize each token, and join with single spaces.

    Args:
        text: Raw string, possibly containing markup tags.

    Returns:
        The processed tokens joined by spaces ('' when nothing survives).
    """
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)

    # Load the stopword list once per call and keep it as a set; fetching it
    # inside the filter would reload and linearly scan it for every token.
    english_stopwords = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in english_stopwords]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Preprocess articles
# Each record is (article_id, title, abstract); the ID is skipped and the two
# text fields are cleaned in that order.
preprocessed_articles = [
    tuple(preprocess_text(field) for field in record[1:])
    for record in articles
]
# One document per article: cleaned title, a space, then the cleaned abstract.
combined_texts = [" ".join(cleaned_pair) for cleaned_pair in preprocessed_articles]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer (vocabulary capped via max_features=1000) on the
# combined title + abstract documents and vectorize them in the same step.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_features = tfidf_vectorizer.fit_transform(combined_texts)

# Show the matrix in dense form
dense_tfidf = tfidf_features.toarray()
print("TF-IDF Matrix:")
print(dense_tfidf)

# Same values as a DataFrame whose columns are the vocabulary terms
tfidf_df = pd.DataFrame(
    data=dense_tfidf,
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print(tfidf_df.head())


TF-IDF Matrix:
[[0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 0.        0.0838619 0.       ]
 [0.        0.        0.        ... 0.0885012 0.        0.       ]
 [0.        0.        0.        ... 0.        0.0515424 0.       ]]
   abdominal  abiesmarina  ability  abnormality  absence  abstinence  \
0        0.0          0.0      0.0          0.0      0.0         0.0   
1        0.0          0.0      0.0          0.0      0.0         0.0   
2        0.0          0.0      0.0          0.0      0.0         0.0   
3        0.0          0.0      0.0          0.0      0.0         0.0   
4        0.0          0.0      0.0          0.0      0.0         0.0   

   abstract  accumulation  accuracy  achieved  ...  woman  work  wound  \
0  0.000000           0.0       0.0       0.0  ...    0.0   0.0    0.0

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Ensure the labels match the number of samples in tfidf_features
num_samples = tfidf_features.shape[0]

# Example labels (replace with actual labels if available)
# NOTE: these labels are assigned by cycling through three class names in
# article order; they are unrelated to article content, so the metrics printed
# below only exercise the pipeline and say nothing about real performance.
labels = ['treatment', 'diagnosis', 'prevention'] * ((num_samples // 3) + 1)
labels = labels[:num_samples]  # Ensure labels list matches the length of the data

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the Random Forest model performance
# The full list of encoded classes is passed explicitly so the report and the
# confusion matrix keep one row per class even when a small test split happens
# to contain no example of some class (otherwise target_names would not line
# up with the labels found in the data). zero_division=0 reports 0 for classes
# that were never predicted.
class_indices = list(range(len(label_encoder.classes_)))
print("Random Forest Classification Report:")
print(classification_report(
    y_test,
    y_pred_rf,
    labels=class_indices,
    target_names=label_encoder.classes_,
    zero_division=0,
))
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf, labels=class_indices))
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.2f}")


Random Forest Classification Report:
              precision    recall  f1-score   support

   diagnosis       0.38      0.43      0.40         7
  prevention       0.00      0.00      0.00         5
   treatment       0.56      0.62      0.59         8

    accuracy                           0.40        20
   macro avg       0.31      0.35      0.33        20
weighted avg       0.35      0.40      0.38        20

Random Forest Confusion Matrix:
[[3 2 2]
 [3 0 2]
 [2 1 5]]
Random Forest Accuracy: 0.40


In [None]:
# Display the first 5 articles
print("\nFirst 5 articles:")
for idx, (article_id, title, abstract) in enumerate(articles[:5]):
    print(f"ID: {idx}, Article ID: {article_id}, Title: {title}, Abstract: {abstract[:200]}...")

# Prompt user to input the article ID for prediction
if not articles:
    raise ValueError("No articles are available, so there is nothing to predict on.")

# The prompt names the last index actually listed above (4 when five or more
# articles exist) instead of always promising 0-4.
last_listed_index = min(len(articles), 5) - 1
raw_choice = input(f"Enter the article number between 0 and {last_listed_index} for which the label needs to be predicted: ")
try:
    article_id_input = int(raw_choice)
except ValueError:
    raise ValueError(f"Expected a whole number, got {raw_choice!r}.") from None

# Reject negative and out-of-range values explicitly: a negative index would
# otherwise silently pick an article counted from the end of the list.
if not 0 <= article_id_input < len(articles):
    raise IndexError(
        f"Article number {article_id_input} is out of range; valid values are 0 to {len(articles) - 1}."
    )
selected_article = articles[article_id_input]

# Function to predict article class
def predict_article_class(title, abstract, model, vectorizer, label_encoder):
    """Return the decoded class label predicted for one article.

    The title and abstract are each passed through ``preprocess_text``, joined
    with a single space, vectorized with ``vectorizer.transform``, classified
    with ``model.predict``, and the result is mapped back to a label via
    ``label_encoder.inverse_transform``.

    Args:
        title: Raw article title.
        abstract: Raw article abstract.
        model: Object exposing ``predict``.
        vectorizer: Object exposing ``transform``.
        label_encoder: Object exposing ``inverse_transform``.

    Returns:
        The first (and only) decoded label.
    """
    cleaned_parts = [preprocess_text(title), preprocess_text(abstract)]
    document = " ".join(cleaned_parts)
    encoded_prediction = model.predict(vectorizer.transform([document]))
    return label_encoder.inverse_transform(encoded_prediction)[0]

# Classify the chosen article and report the result together with its text.
# Positions 1 and 2 of the record hold the title and the abstract.
selected_title, selected_abstract = selected_article[1:3]
predicted_class = predict_article_class(
    title=selected_title,
    abstract=selected_abstract,
    model=rf_model,
    vectorizer=tfidf_vectorizer,
    label_encoder=label_encoder,
)
print("\n".join([
    f"The predicted class for the article (ID: {article_id_input}) is: {predicted_class}",
    f"Title: {selected_title}",
    f"Abstract: {selected_abstract}",
]))



First 5 articles:
ID: 0, Article ID: 38762728, Title: Drug release profile of a novel exenatide long-term drug delivery system (OKV-119) administered to cats., Abstract: Beneficial weight-loss properties of glucagon-like peptide-1 receptor agonists (GLP-1RA) in obese people, with corresponding improvements in cardiometabolic risk factors, are well established. OKV-119...
ID: 1, Article ID: 38762634, Title: The complexity of glucose time series is associated with short- and long-term mortality in critically ill adults: a multi-center, prospective, observational study., Abstract: No abstract available...
ID: 2, Article ID: 38762619, Title: An "out of the box" approach for prevention of ketoacidosis in youth with poorly controlled type 1 diabetes: combined use of insulin pump and long-acting insulin., Abstract: No abstract available...
ID: 3, Article ID: 38762618, Title: Relationship between immune cells and diabetic nephropathy: a Mendelian randomization study., Abstract: No abstract av