<a href="https://colab.research.google.com/github/SanthoshiRavi/Healthcare-Text-Processing-/blob/main/Gradient_Boost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install biopython


Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [None]:
import os
import re

import nltk
import pandas as pd
from Bio import Entrez
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK resources used below by word_tokenize, stopwords and WordNetLemmatizer
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Function to fetch articles from PubMed using Biopython
def fetch_pubmed_articles(api_key, query, max_results=100):
    """Search PubMed for ``query`` and return the matching articles.

    Runs one ``esearch`` call to collect article IDs, then one ``efetch``
    call per ID to read its title and abstract.

    Args:
        api_key: NCBI API key to send with the requests. When falsy, the
            value already stored in ``Entrez.api_key`` is left untouched.
        query: PubMed search term.
        max_results: Maximum number of article IDs requested from ``esearch``.

    Returns:
        A list of ``(article_id, title, abstract)`` tuples, one per ID
        returned by the search. Missing fields are replaced with
        ``'No title available'`` / ``'No abstract available'``.

    Side effects:
        Sets ``Entrez.email`` (and ``Entrez.api_key`` when a key is given)
        and prints a message for every record whose fields cannot be read.
    """
    Entrez.email = "santhohope@gmail.com"  # Replace with your actual email address
    if api_key:
        # Use the caller's key; the parameter was previously ignored in
        # favour of a credential hard-coded in this function.
        Entrez.api_key = api_key

    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing fails.
        handle.close()

    articles = []

    for article_id in record['IdList']:
        handle = Entrez.efetch(db="pubmed", id=article_id, rettype="abstract", retmode="xml")
        try:
            records = Entrez.read(handle)
        finally:
            handle.close()

        # Start from the placeholders so a failure part-way through keeps
        # whatever was read successfully (e.g. a valid title).
        title = 'No title available'
        abstract = 'No abstract available'
        try:
            article = records['PubmedArticle'][0]['MedlineCitation']['Article']
            title = article['ArticleTitle']
            if 'Abstract' in article:
                sections = article['Abstract']['AbstractText']
                if sections:
                    # AbstractText is a list of sections; keep all of them
                    # rather than only the first one.
                    abstract = ' '.join(str(section) for section in sections)
        except (KeyError, IndexError) as e:
            print(f"Error: {e} for Article ID {article_id}")

        articles.append((article_id, title, abstract))

    return articles

# Example usage
# Read the NCBI API key from the NCBI_API_KEY environment variable instead of
# embedding a credential in the source; the value is None when it is unset.
api_key = os.environ.get("NCBI_API_KEY")
query = "diabetes"
articles = fetch_pubmed_articles(api_key, query, max_results=100)

# Display the first 5 articles
print("\nFirst 5 articles:")
for idx, (article_id, title, abstract) in enumerate(articles[:5]):
    print(f"ID: {idx}, Article ID: {article_id}, Title: {title}, Abstract: {abstract[:200]}...")

# Preprocess the text
def preprocess_text(text):
    """Return ``text`` as a space-separated string of cleaned, lemmatized tokens.

    Steps, in order: drop ``<...>`` markup, drop every character that is not
    an ASCII letter or whitespace, lowercase, tokenize, remove English stop
    words, and lemmatize each remaining token.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()

    # Load the stop-word list once per call and keep it in a set; the list
    # used to be re-read and scanned linearly for every single token.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Tokens are already lowercase here, so no per-token lower() is needed.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in english_stopwords
    )

# Clean every article's title and abstract, then merge each pair into one document
preprocessed_articles = [
    tuple(preprocess_text(field) for field in (title, abstract))
    for _, title, abstract in articles
]
combined_texts = [" ".join(cleaned_pair) for cleaned_pair in preprocessed_articles]

print("\nSample Preprocessed Texts:")
for idx, text in enumerate(combined_texts[:5]):
    print(f"ID: {idx}, Text: {text[:200]}...")

# TF-IDF over unigrams and bigrams, capped at 2000 features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=2000)
tfidf_features = tfidf_vectorizer.fit_transform(combined_texts)

# Show the matrix in dense form
print("\nTF-IDF Matrix:")
print(tfidf_features.toarray())

# Same matrix as a DataFrame whose columns are the feature names
tfidf_df = pd.DataFrame(
    data=tfidf_features.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print(tfidf_df.head())

# One label is needed for every row of the TF-IDF matrix
num_samples = tfidf_features.shape[0]

# Example labels (replace with actual labels if available): the three class
# names are repeated in order and the list is trimmed to num_samples entries.
labels = ['treatment', 'diagnosis', 'prevention'] * ((num_samples // 3) + 1)
labels = labels[:num_samples]

# Encode labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split BEFORE oversampling. Running SMOTE on the full dataset first lets
# synthetic points interpolated from test rows end up in the training set and
# puts synthetic rows into the test set, which inflates the reported scores.
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, y, test_size=0.2, random_state=42)

# Oversample the training portion only; the test set stays untouched.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

# Build the Gradient Boosting classifier and fit it on the training split
gb_model = GradientBoostingClassifier(random_state=42, n_estimators=200).fit(X_train, y_train)

# Class predictions for the held-out split
y_pred_gb = gb_model.predict(X_test)

# Per-class precision / recall / F1
print("\nGradient Boosting Classification Report:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred_gb,
        target_names=label_encoder.classes_,
    )
)

# F1 averaged over classes with equal weight
macro_f1 = f1_score(y_true=y_test, y_pred=y_pred_gb, average='macro')
print(f"\nMacro-Averaged F1-Score: {macro_f1:.2f}")

# Counts of true class versus predicted class
print("\nGradient Boosting Confusion Matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred_gb))

# Fraction of test rows predicted correctly
accuracy_gb = accuracy_score(y_true=y_test, y_pred=y_pred_gb)
print(f"\nGradient Boosting Accuracy: {accuracy_gb:.2f}")

# Function to predict article class
def predict_article_class(title, abstract, model, vectorizer, label_encoder):
    """Predict the class label for one article.

    The title and abstract are each passed through ``preprocess_text``,
    joined with a space, vectorized with ``vectorizer.transform`` and
    classified with ``model.predict``.

    Args:
        title: Raw article title.
        abstract: Raw article abstract.
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.

    Returns:
        The decoded label of the single prediction.
    """
    cleaned_parts = (preprocess_text(title), preprocess_text(abstract))
    document = " ".join(cleaned_parts)
    encoded_prediction = model.predict(vectorizer.transform([document]))
    return label_encoder.inverse_transform(encoded_prediction)[0]

# Continuous prediction loop: keeps prompting until the user types 'exit'
while True:
    # Display the first 5 articles
    print("\nFirst 5 articles:")
    for idx, (article_id, title, abstract) in enumerate(articles[:5]):
        print(f"ID: {idx}, Article ID: {article_id}, Title: {title}, Abstract: {abstract[:200]}...")

    # Prompt user to input the article ID for prediction; surrounding
    # whitespace is ignored so ' exit ' still quits.
    article_id_input = input("\nEnter the article number between 0 and 4 for which the label needs to be predicted (or 'exit' to quit): ").strip()
    if article_id_input.lower() == 'exit':
        break

    # Only the parsing is guarded, so a ValueError/IndexError raised inside
    # the prediction itself is no longer misreported as bad user input.
    try:
        article_id_input = int(article_id_input)
    except ValueError:
        print("Invalid input. Please enter a valid article number or 'exit' to quit.")
        continue

    # Reject negatives explicitly: a negative index would otherwise silently
    # select an article counted from the end of the list.
    if not 0 <= article_id_input < len(articles):
        print("Invalid input. Please enter a valid article number or 'exit' to quit.")
        continue

    selected_article = articles[article_id_input]
    selected_title, selected_abstract = selected_article[1], selected_article[2]
    predicted_class = predict_article_class(selected_title, selected_abstract, gb_model, tfidf_vectorizer, label_encoder)
    print(f"\nThe predicted class for the article (ID: {article_id_input}) is: {predicted_class}")
    print(f"Title: {selected_title}")
    print(f"Abstract: {selected_abstract}")
    print(f"Current Model Accuracy: {accuracy_gb:.2f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



First 5 articles:
ID: 0, Article ID: 38762728, Title: Drug release profile of a novel exenatide long-term drug delivery system (OKV-119) administered to cats., Abstract: Beneficial weight-loss properties of glucagon-like peptide-1 receptor agonists (GLP-1RA) in obese people, with corresponding improvements in cardiometabolic risk factors, are well established. OKV-119...
ID: 1, Article ID: 38762634, Title: The complexity of glucose time series is associated with short- and long-term mortality in critically ill adults: a multi-center, prospective, observational study., Abstract: The wealth of data taken from continuous glucose monitoring (CGM) remains to be fully used. We aimed to evaluate the relationship between a promising new CGM metric, complexity of glucose time series ...
ID: 2, Article ID: 38762619, Title: An "out of the box" approach for prevention of ketoacidosis in youth with poorly controlled type 1 diabetes: combined use of insulin pump and long-acting insulin., Abstract: 