In [1]:
import nltk
import json
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import spacy
from spacy.training import Example
import random
import pickle

# Download necessary NLTK data packages
nltk.download('punkt')

nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\syadav18\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\syadav18\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\syadav18\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\syadav18\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
import logging
logging.basicConfig(filename = 'nltk_log', level=logging.DEBUG, 
                    format='%(asctime)s - %(levelname)s - %(message)s')

In [4]:
try:
    # Load JSON data
    with open('Corona2.json', 'r', encoding='utf-8') as f:
        df = json.load(f)
    logging.info("JSON data loaded successfully.")
except Exception as e:
    logging.error(f"Error loading JSON data: {e}")
    raise

In [5]:
training_data = []

try:
    for example in df['examples']:
        temp_dict = {}
        temp_dict['content'] = example['content']
        temp_dict['entities'] = []
        
        for annotation in example['annotations']:
            start = annotation['start']
            end = annotation['end']
            label = annotation['tag_name'].upper()
            temp_dict['entities'].append((start, end, label))
        
        training_data.append(temp_dict)

    logging.info(f"Training data created with {len(training_data)} examples.")
except Exception as e:
    logging.error(f"Error processing examples: {e}")
    raise

In [6]:
logging.debug(f"Training data sample: {training_data[:1]}")

In [7]:
try:
    nltk.download('punkt')
except Exception as e:
    logging.error(f"Error downloading 'punkt' tokenizer: {e}")

for item in training_data:
    try:
        item['tokens'] = word_tokenize(item['content'])
    except Exception as e:
        logging.error(f"Error tokenizing content: {item['content'][:30]}... - {e}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\syadav18\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
if training_data:
    logging.debug(f"Tokens for the first item: {training_data[0]['tokens']}")

try:
    nltk.download('averaged_perceptron_tagger')
except Exception as e:
    logging.error(f"Error downloading 'averaged_perceptron_tagger': {e}")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\syadav18\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [9]:
for item in training_data:
    try:
        item['pos_tags'] = nltk.pos_tag(item['tokens'])
    except Exception as e:
        logging.error(f"Error tagging POS for tokens: {item['tokens'][:10]}... - {e}")

if training_data:
    logging.debug(f"POS tags for the first item: {training_data[0]['pos_tags']}")

In [10]:
from nltk.chunk import ne_chunk

try:
    for item in training_data:
        item['chunks'] = ne_chunk(item['pos_tags'])
    logging.info("Named entity chunks created successfully.")
except Exception as e:
    logging.error(f"Error during named entity chunking: {e}")
    raise

# Show chunks for the first item
if training_data:
    logging.debug(f"Chunks for the first item: {training_data[0]['chunks']}")

In [11]:
def prepare_data_for_ner(training_data):
    formatted_data = []
    for item in training_data:
        content = item['content']
        tokens = item['tokens']
        pos_tags = item['pos_tags']
        entities = item['entities']

        ner_tags = ['O'] * len(tokens)
        try:
            for start, end, label in entities:
                for i in range(len(tokens)):
                    token_start = content.find(tokens[i])
                    token_end = token_start + len(tokens[i])
                    if token_start == start:
                        ner_tags[i] = 'B-' + label
                    elif start < token_start < end:
                        ner_tags[i] = 'I-' + label

            formatted_data.append(list(zip(tokens, pos_tags, ner_tags)))
        except Exception as e:
            logging.error(f"Error processing item {content[:30]}...: {e}")
    
    logging.info("Formatted data for NER created successfully.")
    return formatted_data

try:
    formatted_data = prepare_data_for_ner(training_data)
    logging.info("NER data prepared.")
except Exception as e:
    logging.error(f"Error preparing NER data: {e}")
    raise

if formatted_data:
    logging.debug(f"Formatted data for the first item: {formatted_data[0]}")

In [12]:
def filter_overlapping_entities(entities):
    sorted_entities = sorted(entities, key=lambda x: x[0])
    filtered_entities = []

    for current in sorted_entities:
        if not filtered_entities:
            filtered_entities.append(current)
        else:
            last = filtered_entities[-1]
            if current[0] < last[1]:  
                if current[1] <= last[1]:  
                    continue
                else:  #
                    new_entity = (last[0], current[1], last[2])
                    filtered_entities[-1] = new_entity
            else:
                filtered_entities.append(current)
    
    return filtered_entities

try:
    for item in training_data:
        item['entities'] = filter_overlapping_entities(item['entities'])
    logging.info("Overlapping entities filtered successfully.")
except Exception as e:
    logging.error(f"Error filtering overlapping entities: {e}")
    raise

In [13]:
train_data = []
try:
    for item in training_data:
        entities = [(start, end, label) for start, end, label in item['entities']]
        train_data.append((item['content'], {"entities": entities}))
    logging.info("Training data prepared successfully.")
except Exception as e:
    logging.error(f"Error preparing training data: {e}")
    raise


if train_data:
    logging.debug(f"Sample training data: {train_data[0]}")

In [14]:
try:
    nlp = spacy.blank('en')
    logging.info("SpaCy model initialized successfully.")
except Exception as e:
    logging.error(f"Error initializing SpaCy model: {e}")
    raise

In [15]:
try:
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner')
        logging.info("NER pipe added to the model.")
    else:
        ner = nlp.get_pipe('ner')
        logging.info("Using existing NER pipe.")
except Exception as e:
    logging.error(f"Error adding/getting NER pipe: {e}")
    raise

In [16]:
try:
    for item in training_data:
        for start, end, label in item['entities']:
            ner.add_label(label)
    logging.info("Labels added to NER pipe.")
except Exception as e:
    logging.error(f"Error adding labels to NER pipe: {e}")
    raise

In [17]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    try:
        optimizer = nlp.begin_training()
        logging.info("Training started.")
        for itn in range(100):
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                example = Example.from_dict(nlp.make_doc(text), annotations)
                nlp.update([example], losses=losses, drop=0.5, sgd=optimizer)
                print(f"Iteration {itn}, Losses: {losses}")
            logging.info(f"Iteration {itn}, Losses: {losses}")
    except Exception as e:
        logging.error(f"Error during training: {e}")
        raise

[2024-07-26 20:35:10,492] [DEBUG] No 'get_examples' callback provided to 'Language.initialize', creating dummy examples
[2024-07-26 20:35:10,495] [INFO] Created vocabulary
[2024-07-26 20:35:10,496] [INFO] Finished initializing nlp object

Load the table in your config with:

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]



Iteration 0, Losses: {'ner': 105.85713315010071}
Iteration 0, Losses: {'ner': 149.59978940337896}
Iteration 0, Losses: {'ner': 225.6424277499318}
Iteration 0, Losses: {'ner': 314.3035279735923}
Iteration 0, Losses: {'ner': 437.78517973423004}
Iteration 0, Losses: {'ner': 472.65077036619186}
Iteration 0, Losses: {'ner': 504.29091411828995}
Iteration 0, Losses: {'ner': 557.147925645113}
Iteration 0, Losses: {'ner': 636.8428419828415}
Iteration 0, Losses: {'ner': 683.5559456646442}
Iteration 0, Losses: {'ner': 712.8466645628214}
Iteration 0, Losses: {'ner': 723.2823398262262}
Iteration 0, Losses: {'ner': 760.0178983733058}
Iteration 0, Losses: {'ner': 788.2395605635829}
Iteration 0, Losses: {'ner': 801.0309896113831}
Iteration 0, Losses: {'ner': 808.6461646389544}




Iteration 0, Losses: {'ner': 858.8345167761327}
Iteration 0, Losses: {'ner': 882.3201907986117}
Iteration 0, Losses: {'ner': 895.6233971312412}
Iteration 0, Losses: {'ner': 911.3729111920641}
Iteration 0, Losses: {'ner': 913.3841495782253}
Iteration 0, Losses: {'ner': 945.0779138991566}
Iteration 0, Losses: {'ner': 955.0355760435392}




Iteration 0, Losses: {'ner': 964.8926952167079}
Iteration 0, Losses: {'ner': 969.3252444223045}
Iteration 0, Losses: {'ner': 1019.6376389561802}
Iteration 0, Losses: {'ner': 1027.937494726221}
Iteration 0, Losses: {'ner': 1036.0098443260529}
Iteration 0, Losses: {'ner': 1050.9123360950455}




Iteration 0, Losses: {'ner': 1068.6856639114417}
Iteration 0, Losses: {'ner': 1068.881487397574}
Iteration 1, Losses: {'ner': 12.449503665966404}
Iteration 1, Losses: {'ner': 18.373450451091458}
Iteration 1, Losses: {'ner': 50.172202935119685}
Iteration 1, Losses: {'ner': 100.1531293996643}
Iteration 1, Losses: {'ner': 122.77389598775085}
Iteration 1, Losses: {'ner': 149.64858075138267}
Iteration 1, Losses: {'ner': 160.32454086456067}
Iteration 1, Losses: {'ner': 173.83027291746197}
Iteration 1, Losses: {'ner': 189.30337563382722}
Iteration 1, Losses: {'ner': 206.6341965102559}
Iteration 1, Losses: {'ner': 216.0308275811053}
Iteration 1, Losses: {'ner': 217.56103477712227}
Iteration 1, Losses: {'ner': 234.19080304253353}
Iteration 1, Losses: {'ner': 235.5920475842219}
Iteration 1, Losses: {'ner': 244.4853890692165}
Iteration 1, Losses: {'ner': 253.6495894624376}
Iteration 1, Losses: {'ner': 261.57490028082145}
Iteration 1, Losses: {'ner': 269.62821584779056}
Iteration 1, Losses: {'ner'

In [18]:
output_dir = "C:/Users/syadav18/Desktop/Ml tasks/Task 5"
try:
    nlp.to_disk(output_dir)
    logging.info(f"Model saved to {output_dir}.")
except Exception as e:
    logging.error(f"Error saving model to disk: {e}")
    raise

In [19]:
try:
    with open("ner_model.pkl", "wb") as f:
        pickle.dump(nlp, f)
    logging.info("Model serialized using pickle.")
except Exception as e:
    logging.error(f"Error serializing model with pickle: {e}")
    raise

In [20]:
try:
    nlp = spacy.load(output_dir)
    logging.info("Model loaded successfully for testing.")
except Exception as e:
    logging.error(f"Error loading model: {e}")
    raise

In [21]:
test_text = "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence."
try:
    doc = nlp(test_text)
    logging.info("Test text processed successfully.")
except Exception as e:
    logging.error(f"Error processing test text: {e}")
    raise

logging.info("Extracting entities from test text.")
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Entities in 'While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]

Diosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.

Racecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.'
bismuth compounds 6 23 MEDICINE
Pepto-Bismol 25 37 MEDICINE
diarrhea 104 112 MEDICALCONDITION
loperamide

In [22]:
try:
    nlp = spacy.load(output_dir)
    logging.info("SpaCy model loaded successfully.")
except Exception as e:
    logging.error(f"Error loading SpaCy model: {e}")
    raise

In [23]:
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


# nlp = spacy.load(output_dir)

test_data = [
    {
        "text": """Although viruses cause disruption of healthy homeostasis, resulting in disease, they may exist relatively harmlessly within an organism. An example would include the ability of the herpes simplex virus, which causes cold sores, to remain in a dormant state within the human body. 
        This is called latency[153] and is a characteristic of the herpes viruses, including Epstein–Barr virus, which causes glandular fever, and varicella zoster virus, which causes chickenpox and shingles. Most people have been infected with at least one of these types of herpes virus.
        [154] These latent viruses might sometimes be beneficial, as the presence of the virus can increase immunity against bacterial pathogens, such as Yersinia pestis.[155]""",
        "entities": [
                     (471, 479, 'MEDICALCONDITION'),
                     (419, 441, 'PATHOGEN'),
                     (365, 383, 'PATHOGEN'),
                     (707, 722, 'PATHOGEN'),
                     (181, 201, 'PATHOGEN'),
                     (456, 467, 'MEDICALCONDITION')]
    },
    {
        "text": """Examples of common human diseases caused by viruses include the common cold, influenza, chickenpox, and cold sores. Many serious diseases such as rabies, Ebola virus disease, AIDS (HIV), avian influenza, and SARS are caused by viruses. 
        The relative ability of viruses to cause disease is described in terms of virulence. Other diseases are under investigation to discover if they have a virus as the causative agent, such as the possible connection between human herpesvirus 6 (HHV6) and neurological diseases such as multiple sclerosis and chronic fatigue syndrome.
        [151] There is controversy over whether the bornavirus, previously thought to cause neurological diseases in horses, could be responsible for psychiatric illnesses in humans.[152]""",
        "entities": [(518, 536, 'MEDICALCONDITION'),
                     (154, 165, 'PATHOGEN'),
                     (708, 729, 'MEDICALCONDITION'),
                     (463, 476, 'PATHOGEN'),
                     (77, 86, 'MEDICALCONDITION'),
                     (88, 98, 'MEDICALCONDITION'),
                     (187, 202, 'MEDICALCONDITION'),
                     (610, 620, 'PATHOGEN')]
    },
    {
        "text": """All medical applications known so far involve not pure adamantane, but its derivatives. The first adamantane derivative used as a drug was amantadine – first (1967) as an antiviral drug against various strains of flu[50] and then to treat Parkinson's disease.
        [51][52] Other drugs among adamantane derivatives include adapalene, adapromine, bromantane, carmantadine, chlodantane, dopamantine, memantine, rimantadine, saxagliptin, tromantadine, and vildagliptin. Polymers of adamantane have been patented as antiviral agents against HIV.[53]""",
        "entities": [(239, 258, 'MEDICALCONDITION'),
                     (55, 65, 'MEDICINE'),
                     (531, 534, 'PATHOGEN'),
                     (416, 427, 'MEDICINE'),
                     (379, 390, 'MEDICINE'),
                     (352, 364, 'MEDICINE'),
                     (139, 149, 'MEDICINE')]
    },
    {
        "text": """Buprenorphine has been shown experimentally (1982–1995) to be effective against severe, refractory depression""",
        "entities": [(88, 109, 'MEDICALCONDITION'),
                     (0, 14, 'MEDICALCONDITION')]
    },
    {
        "text": """Gabapentin, approved for treatment of seizures and postherpetic neuralgia in adults, has side-effects which are useful in treating bipolar disorder1, 
        essential tremor, hot flashes, migraine prophylaxis, neuropathic pain syndromes, phantom limb syndrome, and restless leg syndrome.[11]""",
        "entities": [(203, 229, 'MEDICALCONDITION'),
                     (258, 279, 'MEDICALCONDITION'),
                     (181, 201, 'MEDICALCONDITION'),
                     (51, 73, 'MEDICALCONDITION'),
                     (0, 10, 'MEDICINE'),
                     (38, 46, 'MEDICALCONDITION')]
    },
    {
        "text": """Bupropion (Wellbutrin), an anti-depressant, is also used as a smoking cessation aid; this indication was later approved, and the name of the smoking cessation product is Zyban. 
        In Ontario, Canada, smoking cessation drugs are not covered by provincial drug plans; elsewhere, Zyban is priced higher than Wellbutrin, despite being the same drug. Therefore, some physicians prescribe Wellbutrin for both indications.[""",
        "entities": [(274, 279, 'MEDICINE'),
                     (11, 21, 'MEDICINE'),
                     (302, 312, 'MEDICINE'),
                     (380, 390, 'MEDICINE'),
                     (170, 175, 'MEDICINE'),
                     (0, 9, 'MEDICINE')]
    },
    {
        "text": """Carbamazepine is an approved treatment for bipolar disorder and epileptic seizures, but it has side effects useful in treating attention-deficit hyperactivity disorder (ADHD), 
        schizophrenia, phantom limb syndrome, paroxysmal extreme pain disorder, neuromyotonia, and post-traumatic stress disorder.[8]""",
        "entities": [(267, 288, 'MEDICALCONDITION'),
                     (248, 261, 'MEDICALCONDITION'),
                     (43, 59, 'MEDICALCONDITION'),
                     (145, 167, 'MEDICALCONDITION'),
                     (0, 14, 'MEDICALCONDITION'),
                     (176, 189, 'MEDICALCONDITION'),
                     (64, 82, 'MEDICALCONDITION'),
                     (191, 212, 'MEDICALCONDITION')]
    },
    {
        "text": """The antiviral drugs amantadine and rimantadine inhibit a viral ion channel (M2 protein), thus inhibiting replication of the influenza A virus.
        [86] These drugs are sometimes effective against influenza A if given early in the infection but are ineffective against influenza B viruses, which lack the M2 drug target.[160] Measured resistance to amantadine and rimantadine in American isolates of H3N2 has increased to 91% in 2005.[161] This high level of resistance may be due to the easy availability of amantadines as part of over-the-counter cold remedies in countries such as China and Russia,[162] and their use to prevent outbreaks of influenza in farmed poultry.
        [163][164] The CDC recommended against using M2 inhibitors during the 2005–06 influenza season due to high levels of drug resistance.[165]""",
        "entities": [(639, 648, 'MEDICALCONDITION'),
                     (35, 46, 'MEDICINE'),
                     (712, 725, 'MEDICINE'),
                     (20, 30, 'MEDICINE')]
    },
    {
        "text": """The two classes of antiviral drugs used against influenza are neuraminidase inhibitors (oseltamivir, zanamivir, laninamivir and peramivir) and M2 protein inhibitors (adamantane derivatives)""",
        "entities": [(128, 137, 'MEDICINE'),
                     (101, 110, 'MEDICINE'),
                     (112, 124, 'MEDICALCONDITION'),
                     (48, 57, 'MEDICALCONDITION'),
                     (88, 99, 'MEDICINE')]
    },
     {
        "text": """Influenza, commonly known as "the flu", is an infectious disease caused by an influenza virus.[1] Symptoms can be mild to severe.
        [5] The most common symptoms include: high fever, runny nose, sore throat, muscle and joint pain, headache, coughing, and feeling tired.
        [1] These symptoms typically begin two days after exposure to the virus and most last less than a week.[1] The cough, however, may last for more than two weeks.
        [1] In children, there may be diarrhea and vomiting, but these are not common in adults.[6] Diarrhea and vomiting occur more commonly in gastroenteritis, which is an unrelated disease and sometimes inaccurately referred to as "stomach flu" or the "24-hour flu".
        [6] Complications of influenza may include viral pneumonia, secondary bacterial pneumonia, sinus infections, and worsening of previous health problems such as asthma or heart failure.[2][5]""",
        "entities": [(191, 202, 'MEDICALCONDITION'),
                     (0, 9, 'MEDICALCONDITION'),
                     (845, 852, 'MEDICALCONDITION'),
                     (756, 775, 'PATHOGEN'),
                     (468, 476, 'MEDICALCONDITION'),
                     (227, 235, 'MEDICALCONDITION'),
                     (237, 245, 'MEDICALCONDITION'),
                     (777, 793, 'MEDICALCONDITION'),
                     (855, 868, 'MEDICALCONDITION'),
                     (215, 225, 'MEDICALCONDITION'),
                     (652, 663, 'MEDICALCONDITION'),
                     (455, 464, 'MEDICALCONDITION'),
                     (251, 264, 'MEDICALCONDITION'),
                     (78, 93, 'PATHOGEN')]
    }
]


In [24]:
examples = []
try:
    for item in test_data:
        text = item["text"]
        annotations = {"entities": item["entities"]}
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        examples.append(example)
    logging.info("Converted test data to spaCy examples successfully.")
except Exception as e:
    logging.error(f"Error converting test data to spaCy examples: {e}")
    raise



In [25]:
from spacy.scorer import Scorer
scorer = Scorer()
try:
    scores = scorer.score(examples)
    logging.info("Scoring completed successfully.")
except Exception as e:
    logging.error(f"Error during scoring: {e}")
    raise