In [1]:
pip install biopython Bio

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Bio
  Downloading bio-1.6.2-py3-none-any.whl (278 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.6/278.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl (9.3 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.3.1-py2.py3-none-any.whl (29 kB)
Installing collected packages: biopython, gprofiler-official, biothings-client, mygene, Bio
Successfully installed Bio-1.6.2 biopython-1.83 biothings-client-0.3.1 gprofiler-official-1.0.0 mygene-3.2.2


In [14]:
from Bio import Entrez
import datetime

def search_pubmed(keyword, start_year, end_year, batch_size, max_records):
    Entrez.email = 'YourEmail@example.com'  # Replace with your email address
    query = f'{keyword}[Abstract] AND ("{start_year}"[Date - Publication] : "{end_year}"[Date - Publication])'
    search_handle = Entrez.esearch(db='pubmed', term=query, retmax=batch_size, usehistory='y', sort='relevance')
    search_results = Entrez.read(search_handle)
    search_handle.close()

    count = int(search_results["Count"])
    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]

    ids = []
    for start in range(0, min(max_records, count), batch_size):
        fetch_handle = Entrez.efetch(db='pubmed',
                                     retmode='xml',
                                     retstart=start,
                                     retmax=batch_size,
                                     webenv=webenv,
                                     query_key=query_key)
        data = Entrez.read(fetch_handle)
        fetch_handle.close()
        ids.extend(data['PubmedArticle'])
    return ids

def fetch_details(pubmed_article):
    article = pubmed_article['MedlineCitation']['Article']
    abstract = article.get('Abstract', {}).get('AbstractText', [''])[0]
    title = article.get('ArticleTitle', '')
    authors = [a.get('LastName', '') + ' ' + a.get('ForeName', '') for a in article.get('AuthorList', [])]
    pub_date = article.get('Journal', {}).get('JournalIssue', {}).get('PubDate', {})
    year = pub_date.get('Year', '')
    return title, abstract, authors, year

# Define the search parameters
keyword = 'intelligence'
start_year = 2013
end_year = 2023
batch_size = 100
max_records = 1000

# Search PubMed
articles = search_pubmed(keyword, start_year, end_year, batch_size, max_records)

with open('pubmed_data.txt', 'w') as file:
    for article in articles:
        title, abstract, authors, year = fetch_details(article)
        file.write(f'Title: {title}\n')
        file.write(f'Year: {year}\n')
        file.write(f'Authors: {", ".join(authors)}\n')
        file.write(f'Abstract: {abstract}\n')
        file.write('\n' + '-'*80 + '\n\n')

print("Data written to pubmed_data.txt")


Data written to pubmed_data.txt


In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Function to parse the PubMed data file
def parse_pubmed_data(file_path):
    with open(file_path, 'r') as file:
        data = file.readlines()

    titles, years, authors, abstracts = [], [], [], []
    temp_title, temp_year, temp_authors, temp_abstract = None, None, None, None

    for line in data:
        if line.startswith('Title:'):
            temp_title = line[7:].strip()
        elif line.startswith('Year:'):
            temp_year = line[6:].strip()
        elif line.startswith('Authors:'):
            temp_authors = line[9:].strip()
        elif line.startswith('Abstract:'):
            temp_abstract = line[10:].strip()
        elif line == '\n':
            if temp_title and temp_year and temp_authors and temp_abstract:
                titles.append(temp_title)
                years.append(temp_year)
                authors.append(temp_authors)
                abstracts.append(temp_abstract)
            temp_title, temp_year, temp_authors, temp_abstract = None, None, None, None

    return pd.DataFrame({'Title': titles, 'Year': years, 'Authors': authors, 'Abstract': abstracts})

# Function to vectorize the abstracts using TF-IDF
def vectorize_abstracts(abstracts):
    tfidf_vectorizer = TfidfVectorizer(max_features=500)
    tfidf_vectors = tfidf_vectorizer.fit_transform(abstracts)
    return pd.DataFrame(tfidf_vectors.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Main function to process and save the vectorized data
def process_and_save_vectorized_data(input_file_path, output_file_path):
    df = parse_pubmed_data(input_file_path)
    vector_df = vectorize_abstracts(df['Abstract'])
    vector_df.to_csv(output_file_path, index=False)
    print(f"Vectorized dataset saved to {output_file_path}")

# Usage
input_file_path = '/content/pubmed_data.txt'
output_file_path = 'vectorized_pubmed_data.csv'
process_and_save_vectorized_data(input_file_path, output_file_path)


Vectorized dataset saved to vectorized_pubmed_data.csv


In [32]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Loading the vectorized dataset
def load_vectorized_data(file_path):
    return pd.read_csv(file_path)

def create_query_vector(query, vectorized_data_columns):
    # Tokenizing the query - simplistic tokenization based on spaces
    query_tokens = set(query.lower().split())

    # Creating an empty query vector
    query_vector = np.zeros(len(vectorized_data_columns))

    # Populating the query vector: 1 if the term in vectorized data columns exists in the query, else 0
    for i, term in enumerate(vectorized_data_columns):
        if term in query_tokens:
            query_vector[i] = 1

    return query_vector

# Function to find the most relevant documents for a query
def find_relevant_documents(query_vector, vectorized_data, top_n=5):
    cosine_similarities = cosine_similarity([query_vector], vectorized_data).flatten()
    relevant_doc_indices = np.argsort(cosine_similarities)[-top_n:][::-1]
    return relevant_doc_indices

# Main function for IR system
def information_retrieval_system(query, vector_file_path, top_n=5):
    vectorized_data = load_vectorized_data(vector_file_path)
    query_vector = create_query_vector(query, vectorized_data.columns)
    relevant_docs = find_relevant_documents(query_vector, vectorized_data, top_n)
    print(f"Top {top_n} relevant document indices for the query '{query}': {relevant_docs}")

# Usage
query = "medical"
vector_file_path = '/content/vectorized_pubmed_data.csv'
information_retrieval_system(query, vector_file_path)


Top 5 relevant document indices for the query 'medical': [442  10 388 460 472]


In [33]:
import pandas as pd

# Reading the dataset
file_path = '/content/pubmed_data.txt'

# Initializing lists to store the different parts of the articles
titles = []
years = []
authors = []
abstracts = []

# Parsing the file
with open(file_path, 'r') as file:
    article_lines = []
    for line in file:
        # Check if the line is a separator between articles
        if line.strip() == "":
            if article_lines:
                title = next((line.split(": ")[1].strip() for line in article_lines if line.startswith("Title: ")), None)
                year = next((line.split(": ")[1].strip() for line in article_lines if line.startswith("Year: ")), None)
                author = next((line.split(": ")[1].strip() for line in article_lines if line.startswith("Authors: ")), None)
                abstract = next((line.split(": ")[1].strip() for line in article_lines if line.startswith("Abstract: ")), None)

                titles.append(title)
                years.append(year)
                authors.append(author)
                abstracts.append(abstract)

                article_lines = []
            continue
        article_lines.append(line)

# Creating a DataFrame
df = pd.DataFrame({
    'Title': titles,
    'Year': years,
    'Authors': authors,
    'Abstract': abstracts
})

print(df.head())


                                               Title  Year  \
0  The relation between intelligence and religiosity  2013   
1                                               None  None   
2     Neuroscience-Inspired Artificial Intelligence.  2017   
3                                               None  None   
4  Role of emotional intelligence in effective nu...  2021   

                                             Authors  \
0   Zuckerman Miron, Silberman Jordan, Hall Judith A   
1                                               None   
2  Hassabis Demis, Kumaran Dharshan, Summerfield ...   
3                                               None   
4                                      Lambert Steve   

                                            Abstract  
0  A meta-analysis of 63 studies showed a signifi...  
1                                               None  
2  The fields of neuroscience and artificial inte...  
3                                               None  
4  Emotionally 

In [39]:
df.head()

Unnamed: 0,Title,Year,Authors,Abstract
0,The relation between intelligence and religiosity,2013.0,"Zuckerman Miron, Silberman Jordan, Hall Judith A",A meta-analysis of 63 studies showed a signifi...
1,,,,
2,Neuroscience-Inspired Artificial Intelligence.,2017.0,"Hassabis Demis, Kumaran Dharshan, Summerfield ...",The fields of neuroscience and artificial inte...
3,,,,
4,Role of emotional intelligence in effective nu...,2021.0,Lambert Steve,Emotionally intelligent leaders demonstrate a ...


In [48]:
# Import necessary libraries
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch

# Loading DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')

# Defining the function for answering questions using DistilBERT
def answer_question(question, context):
    inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    outputs = model(**inputs)
    answer_start_scores, answer_end_scores = outputs.start_logits, outputs.end_logits

    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer

# Question type classification function (basic heuristic)
def classify_question_type(question):
    if question.lower().startswith("is") or question.lower().startswith("are"):
        return "Confirmation"
    elif "why" in question.lower():
        return "Causal"
    elif "what" in question.lower():
        return "Factoid"
    elif "list" in question.lower():
        return "List"
    elif "would" in question.lower() or "could" in question.lower():
        return "Hypothetical"
    else:
        return "Complex"

# User input
user_question = input("Please enter your question: ")

question_type = classify_question_type(user_question)

context = df['Abstract'].iloc[0]

# Finding the answer
answer = answer_question(user_question, context)
print(f"Question Type: {question_type}\nAnswer: {answer}\n")


Please enter your question: Why is emotional intelligence required in nurse leadership?
Question Type: Causal
Answer: intelligent people may therefore have less need for religious beliefs and practices

