<a href="https://colab.research.google.com/github/SanthoshiRavi/Healthcare-Text-Processing-/blob/main/Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re

import nltk
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Download necessary NLTK data
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on newer NLTK
# releases; older releases only need 'punkt'. Requesting both keeps the
# notebook working on either (an unknown package name is reported by
# nltk.download, not raised).
for nltk_package in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Function to fetch articles from PubMed
def fetch_pubmed_articles(api_key, query, max_results=100):
    """Search PubMed for *query* and return summary data for the matching articles.

    Args:
        api_key: NCBI E-utilities API key, sent with every request.
        query: PubMed search term. It is passed through ``params`` so that
            spaces and reserved characters are URL-encoded.
        max_results: Maximum number of article IDs to request (``retmax``).

    Returns:
        A list of ``(article_id, title, abstract)`` tuples in the order the
        search returned the IDs. An empty list is returned when the search
        response has no ``'esearchresult'`` entry or lists no IDs.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    request_timeout = 30  # seconds; without it a stalled connection blocks forever
    batch_size = 200  # keeps the comma-separated ID list short enough for a GET

    response = requests.get(
        f"{base_url}/esearch.fcgi",
        params={
            "db": "pubmed",
            "term": query,
            "retmax": max_results,
            "retmode": "json",
            "api_key": api_key,
        },
        timeout=request_timeout,
    )

    # Debug: Print the raw response
    print("API Response:", response.text)

    search_data = response.json()  # parse once and reuse
    if 'esearchresult' not in search_data:
        print("Error: 'esearchresult' not found in the API response.")
        return []

    article_ids = search_data['esearchresult'].get('idlist', [])
    articles = []
    # Fetch summaries in batches instead of issuing one request per article.
    for start in range(0, len(article_ids), batch_size):
        batch_ids = article_ids[start:start + batch_size]
        details_response = requests.get(
            f"{base_url}/esummary.fcgi",
            params={
                "db": "pubmed",
                "id": ",".join(batch_ids),
                "retmode": "json",
                "api_key": api_key,
            },
            timeout=request_timeout,
        )
        summaries = details_response.json()['result']
        for article_id in batch_ids:
            article_details = summaries.get(article_id, {})
            title = article_details.get('title', '')
            # NOTE(review): esummary records do not appear to include a
            # 'summary' field, so this is probably always ''. Real abstracts
            # likely need efetch -- confirm against the E-utilities docs.
            abstract = article_details.get('summary', '')
            articles.append((article_id, title, abstract))
    return articles

# Function to preprocess text
def preprocess_text(text):
    """Clean *text* and return it as a space-joined string of lemmatized tokens.

    Steps, in order: drop ``<...>`` tag-like spans, drop every character that
    is neither an ASCII letter nor whitespace, tokenize with ``word_tokenize``,
    remove English stopwords (compared case-insensitively), and lemmatize each
    remaining token with ``WordNetLemmatizer``.

    Args:
        text: Raw text such as a title or abstract. ``None`` or an empty
            string yields ``''``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if not text:
        return ''

    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)

    # Build the stopword set once per call; evaluating stopwords.words() inside
    # the filter re-read the list for every token and searched it linearly.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.lower() not in stop_words
    )

# Function to predict article class
def predict_article_class(title, abstract, model, vectorizer, label_encoder):
    """Return the decoded class label predicted for a single article.

    The title and abstract are each run through ``preprocess_text``, joined
    with a space, transformed by *vectorizer*, classified by *model*, and the
    predicted value is mapped back with ``label_encoder.inverse_transform``.

    Args:
        title: Raw article title.
        abstract: Raw article abstract.
        model: Fitted estimator exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.

    Returns:
        The first (and only) decoded label.
    """
    cleaned_title = preprocess_text(title)
    cleaned_abstract = preprocess_text(abstract)
    document = f"{cleaned_title} {cleaned_abstract}"

    features = vectorizer.transform([document])
    encoded_prediction = model.predict(features)
    return label_encoder.inverse_transform(encoded_prediction)[0]

# Example usage
# SECURITY: an API key is committed here in plain text. Supply your own key
# through the NCBI_API_KEY environment variable; the literal is kept only as a
# fallback so existing runs keep working, and it should be rotated and removed.
api_key = os.environ.get("NCBI_API_KEY", "14b3330cab5a4aabaefc418267dd19492909")
query = "diabetes"
articles = fetch_pubmed_articles(api_key, query, max_results=100)

# Check if articles were fetched successfully
if articles:
    # Display fetched articles
    print("Fetched articles:")
    for idx, (article_id, title, abstract) in enumerate(articles):
        print(f"ID: {idx}, Article ID: {article_id}, Title: {title}, Abstract: {abstract[:200]}")  # Display first 200 chars of abstract

    # Prompt until the user picks a valid article. Both the list position
    # ("ID" in the listing above) and the PubMed "Article ID" are accepted,
    # because the listing shows both and either is a natural thing to type.
    pubmed_id_to_index = {pmid: position for position, (pmid, _, _) in enumerate(articles)}
    while True:
        raw_choice = input(
            f"Enter the list ID (0-{len(articles) - 1}) or the PubMed Article ID "
            "for which you want to predict the label: "
        ).strip()
        if raw_choice.isdecimal() and int(raw_choice) < len(articles):
            article_id_input = int(raw_choice)
            break
        if raw_choice in pubmed_id_to_index:
            article_id_input = pubmed_id_to_index[raw_choice]
            break
        print(f"'{raw_choice}' is not a valid list ID or fetched Article ID. Please try again.")
    selected_article = articles[article_id_input]

    # Preprocess articles
    preprocessed_articles = [(preprocess_text(title), preprocess_text(abstract)) for _, title, abstract in articles]
    combined_texts = [f"{title} {abstract}" for title, abstract in preprocessed_articles]

    # Create TF-IDF features
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)
    tfidf_features = tfidf_vectorizer.fit_transform(combined_texts)

    # Densify once and reuse it for both the raw print and the DataFrame
    tfidf_dense = tfidf_features.toarray()

    # Print the TF-IDF matrix
    print("TF-IDF Matrix:")
    print(tfidf_dense)

    # Optionally, convert to DataFrame for better readability
    tfidf_df = pd.DataFrame(tfidf_dense, columns=tfidf_vectorizer.get_feature_names_out())
    print(tfidf_df.head())

    # Ensure the labels match the number of samples in tfidf_features
    num_samples = tfidf_features.shape[0]

    # Example labels (replace with actual labels if available)
    # NOTE: these are assigned cyclically and carry no information about the
    # articles, so the metrics and predictions below are only a pipeline demo.
    labels = ['treatment', 'diagnosis', 'prevention'] * ((num_samples // 3) + 1)
    labels = labels[:num_samples]  # Ensure labels list matches the length of the data

    # Encode labels
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(labels)

    # Verify lengths before encoding
    print(f"Number of samples in tfidf_features: {num_samples}")
    print(f"Number of labels: {len(labels)}")
    print(f"First 5 labels: {labels[:5]}")  # First 5 labels

    # Ensure consistent lengths before encoding (explicit raise: asserts are
    # stripped when Python runs with -O)
    if num_samples != len(labels):
        raise ValueError("Mismatch between the number of samples and labels.")

    # Verify lengths again after encoding
    print(f"Number of encoded labels: {len(y)}")
    print(f"Encoded Labels: {y[:5]}")  # First 5 encoded labels

    # Ensure consistent lengths after encoding
    if num_samples != len(y):
        raise ValueError("Mismatch between the number of samples and encoded labels.")

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(tfidf_features, y, test_size=0.2, random_state=42)

    # Train the model
    log_reg_model = LogisticRegression(max_iter=200)
    log_reg_model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = log_reg_model.predict(X_test)

    # Evaluate the model. Passing the full set of encoded class ids keeps
    # target_names aligned even when a class is absent from the test split.
    class_ids = list(range(len(label_encoder.classes_)))
    print("Classification Report:")
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=label_encoder.classes_))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred, labels=class_ids))
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    # Predict the class for the selected article
    selected_title, selected_abstract = selected_article[1], selected_article[2]
    predicted_class = predict_article_class(selected_title, selected_abstract, log_reg_model, tfidf_vectorizer, label_encoder)
    print(f"The predicted class for the article (ID: {article_id_input}) is: {predicted_class}")
    print(f"Title: {selected_title}")
    print(f"Abstract: {selected_abstract}")
else:
    print("No articles fetched. Please check your API key and query.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


API Response: {"header":{"type":"esearch","version":"0.3"},"esearchresult":{"count":"986827","retmax":"100","retstart":"0","idlist":["38760860","38760852","38760808","38760732","38760705","38760704","38760678","38760661","38760656","38760653","38760651","38760632","38760619","38760615","38760595","38760590","38760578","38760569","38760568","38760566","38760482","38760456","38760452","38760446","38760435","38760427","38760424","38760383","38760348","38760299","38760264","38760159","38760126","38760125","38760071","38760053","38760037","38760035","38760033","38760001","38759989","38759983","38759910","38759874","38759873","38759850","38759836","38759835","38759826","38759794","38759779","38759763","38759761","38759722","38759707","38759658","38759587","38759539","38759515","38759500","38759477","38759475","38759430","38759424","38759418","38759318","38759315","38759306","38759234","38759226","38759219","38759136","38759133","38759110","38759099","38759055","38758998","38758937","38758936