## Task 1: Extract the 'TEXT' column from the CSV files and store it in a single .txt file

In [2]:
import pandas as pd

# List of CSV files whose 'TEXT' column is merged into one corpus file
csv_files = ['CSV1.csv', 'CSV2.csv', 'CSV3.csv', 'CSV4.csv']

# Open a new text file to store extracted texts (one record per line).
# NOTE(review): the file is written with the platform default encoding, which is
# also what the later cells use to read it back; if an explicit encoding is
# added here, add the same one to every open('all_texts.txt', ...) call.
with open('all_texts.txt', 'w') as outfile:
    for file in csv_files:
        # Load each CSV file
        df = pd.read_csv(file)

        # The 'TEXT' column holds the large text we need. Empty cells are read
        # by pandas as NaN (a float), and `NaN + '\n'` raises TypeError, so
        # drop missing values and coerce everything that remains to str.
        texts = df['TEXT'].dropna().astype(str).tolist()

        # Append every record of this CSV to the combined file
        outfile.writelines(f"{text}\n" for text in texts)


## Task 2: Install and Set Up NLP Libraries

In [None]:
# Install required libraries
!pip install spacy scispacy transformers torch
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_sm-0.3.0.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_ner_bc5cdr_md-0.3.0.tar.gz

# Download BioBERT model from Hugging Face
!pip install transformers


## Task 3: Count Word Occurrences and Unique Tokens

### 3.1: Count Occurrences of Words and Store Top 30 Words in CSV

In [1]:
import pandas as pd

In [2]:
from collections import Counter

# Read the merged corpus back from disk
with open('all_texts.txt', mode='r') as file:
    text = file.read()

# Whitespace tokenization, then tally how often each distinct word appears
words = text.split()
word_counts = Counter()
word_counts.update(words)

# Keep only the 30 most frequent (word, count) pairs
top_30_words = word_counts.most_common(30)

# Persist the ranking as a two-column CSV without the index column
top_words_df = pd.DataFrame(data=top_30_words, columns=['Word', 'Count'])
top_words_df.to_csv('top_30_words.csv', index=False)


### 3.2: Use AutoTokenizer to Count Unique Tokens

In [None]:
from transformers import AutoTokenizer
from collections import Counter

# Read the combined corpus from 'all_texts.txt'
with open('all_texts.txt', mode='r') as file:
    text = file.read()

# Instantiate the BioBERT tokenizer by its Hugging Face model id
tokenizer = AutoTokenizer.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1"
)

def count_unique_tokens(text, top_n=30, tok=None):
    """Return the most frequent tokenizer tokens found in ``text``.

    Args:
        text: Raw string to tokenize.
        top_n: How many of the most common tokens to return. Defaults to 30,
            the value that was previously hard-coded.
        tok: Object exposing ``tokenize(str)`` that returns a list of tokens.
            When omitted, the notebook-level ``tokenizer`` is used, which
            keeps existing ``count_unique_tokens(text)`` calls working.

    Returns:
        A list of ``(token, count)`` tuples ordered from most to least
        frequent, containing at most ``top_n`` entries.
    """
    # Only fall back to the global when no tokenizer was passed in, so the
    # function can also be used (and tested) without the notebook state.
    active_tokenizer = tokenizer if tok is None else tok

    # Tokenize the text and count the occurrences of each token
    tokens = active_tokenizer.tokenize(text)
    token_counts = Counter(tokens)

    return token_counts.most_common(top_n)

# Rank the tokens of the corpus by frequency
top_30_tokens = count_unique_tokens(text)

# Report each (token, count) pair on its own line, most frequent first
for pair in top_30_tokens:
    token, count = pair
    print(f"Token: {token}, Count: {count}")


## Task 4: Named-Entity Recognition (NER) Using SpaCy, SciSpaCy, and BioBERT

### Using SpaCy and SciSpaCy

In [None]:
import spacy

# Load the two model packages installed in Task 2, each by its package name
nlp_sci_sm = spacy.load(name="en_core_sci_sm")
nlp_bc5cdr_md = spacy.load(name="en_ner_bc5cdr_md")

def _chunk_text(text, limit):
    """Yield consecutive pieces of ``text``, each at most ``limit`` characters, cut at line ends where possible."""
    buffer = []
    size = 0
    for line in text.splitlines(keepends=True):
        # A single line longer than the limit has to be cut mid-line.
        while len(line) > limit:
            if buffer:
                yield "".join(buffer)
                buffer, size = [], 0
            yield line[:limit]
            line = line[limit:]
        # Flush the accumulated lines before they would overflow the limit.
        if size + len(line) > limit:
            yield "".join(buffer)
            buffer, size = [], 0
        buffer.append(line)
        size += len(line)
    if buffer:
        yield "".join(buffer)


def extract_entities(text, nlp_model):
    """Run ``nlp_model`` over ``text`` and collect disease and drug mentions.

    Args:
        text: The string to analyse.
        nlp_model: A callable pipeline returning an object with ``.ents``;
            each entity must expose ``.text`` and ``.label_``.

    Returns:
        A ``(diseases, drugs)`` tuple of lists holding the entity texts in
        the order they were found (duplicates are kept).

    Notes:
        * Drugs are matched on both ``"DRUG"`` and ``"CHEMICAL"``. Per the
          scispaCy model documentation, ``en_ner_bc5cdr_md`` tags drug
          mentions as ``CHEMICAL``, so matching ``"DRUG"`` alone always
          produced an empty list for it.
        * NOTE(review): ``en_core_sci_sm`` is documented as emitting only a
          generic ``ENTITY`` label, so both lists are expected to be empty
          for that model -- confirm against the installed model.
        * When the model exposes ``max_length`` and ``text`` is longer, the
          text is processed in line-aligned chunks instead of one call, since
          spaCy refuses inputs longer than ``max_length``.
    """
    disease_labels = {"DISEASE"}
    drug_labels = {"DRUG", "CHEMICAL"}

    limit = getattr(nlp_model, "max_length", None)
    if limit and len(text) > limit:
        pieces = _chunk_text(text, limit)
    else:
        pieces = [text]

    diseases = []
    drugs = []
    for piece in pieces:
        doc = nlp_model(piece)
        for ent in doc.ents:
            if ent.label_ in disease_labels:
                diseases.append(ent.text)
            elif ent.label_ in drug_labels:
                drugs.append(ent.text)
    return diseases, drugs

# Pull the combined corpus back in from 'all_texts.txt'
with open('all_texts.txt', mode='r') as file:
    text = file.read()

# First pipeline: en_core_sci_sm
sci_sm_result = extract_entities(text, nlp_sci_sm)
diseases_sci_sm, drugs_sci_sm = sci_sm_result

# Second pipeline: en_ner_bc5cdr_md
bc5cdr_result = extract_entities(text, nlp_bc5cdr_md)
diseases_bc5cdr, drugs_bc5cdr = bc5cdr_result

# Report how many mentions each pipeline produced
print(f"Total diseases detected by en_core_sci_sm: {len(diseases_sci_sm)}")
print(f"Total drugs detected by en_core_sci_sm: {len(drugs_sci_sm)}")
print(f"Total diseases detected by en_ner_bc5cdr_md: {len(diseases_bc5cdr)}")
print(f"Total drugs detected by en_ner_bc5cdr_md: {len(drugs_bc5cdr)}")


### Using BioBERT with Hugging Face

In [None]:
from transformers import pipeline

# Build a Hugging Face "ner" pipeline that uses the same BioBERT id for model and tokenizer.
# NOTE(review): this looks like the base BioBERT checkpoint rather than one
# fine-tuned for NER -- confirm it provides labels containing "disease"/"drug";
# otherwise both filters below will match nothing.
biobert_checkpoint = "dmis-lab/biobert-base-cased-v1.1"
nlp_biobert = pipeline("ner", model=biobert_checkpoint, tokenizer=biobert_checkpoint)

# `text` is the corpus string read from all_texts.txt in an earlier cell.
# NOTE(review): the whole corpus is passed as a single input -- verify that
# this fits within the model's maximum sequence length.
ner_results = nlp_biobert(text)


def _words_with_label(results, keyword):
    """Return the 'word' of every prediction whose lower-cased 'entity' label contains ``keyword``."""
    return [hit['word'] for hit in results if keyword in hit['entity'].lower()]


# Split the predictions into disease and drug mentions by label substring
diseases_biobert = _words_with_label(ner_results, 'disease')
drugs_biobert = _words_with_label(ner_results, 'drug')

print(f"Total diseases detected by BioBERT: {len(diseases_biobert)}")
print(f"Total drugs detected by BioBERT: {len(drugs_biobert)}")


### Comparison of Models

To compare the models, we can look at the total number of entities each one detected, check for entities that all of them have in common, and note any differences in performance or entity types:

In [None]:
# Per-model totals: diseases first ...
print(f"Diseases - SpaCy en_core_sci_sm: {len(diseases_sci_sm)}")
print(f"Diseases - SciSpaCy en_ner_bc5cdr_md: {len(diseases_bc5cdr)}")
print(f"Diseases - BioBERT: {len(diseases_biobert)}")

# ... then drugs
print(f"Drugs - SpaCy en_core_sci_sm: {len(drugs_sci_sm)}")
print(f"Drugs - SciSpaCy en_ner_bc5cdr_md: {len(drugs_bc5cdr)}")
print(f"Drugs - BioBERT: {len(drugs_biobert)}")

# Entity strings reported by all three models (exact string match, duplicates collapsed)
common_diseases = set(diseases_sci_sm) & set(diseases_bc5cdr) & set(diseases_biobert)
common_drugs = set(drugs_sci_sm) & set(drugs_bc5cdr) & set(drugs_biobert)

print(f"Common diseases across models: {common_diseases}")
print(f"Common drugs across models: {common_drugs}")
