Import required libraries

In [1]:
import os
import sys

import time
import warnings
import datetime
import random
import pickle
import numpy as np
import pandas as pd


import torch
from torch import nn
import torch.multiprocessing
import torch.distributed as dist
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

import torchvision
from torchvision import transforms

# from torchmimic.data import IHMDataset
# from torchmimic.utils import pad_colalte

from transformers import AutoModel, AutoModelWithLMHead, AutoModelForCausalLM, pipeline
from datasets import load_dataset, load_from_disk
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from accelerate import Accelerator


  from .autonotebook import tqdm as notebook_tqdm


Set required env variables

In [2]:
# Process-wide environment settings for this notebook session, applied in one call.
os.environ.update(
    {
        # CUDA-related flags.
        "CUDA_LAUNCH_BLOCKING": "1",
        "TORCH_USE_CUDA_DSA": "1",
        # Rank / world-size / rendezvous variables, set here for a single
        # process (rank 0 of a world of size 1) on the local host.
        "RANK": "0",
        "LOCAL_RANK": "0",
        "WORLD_SIZE": "1",
        "MASTER_PORT": "29580",
        "MASTER_ADDR": "127.0.0.1",
    }
)

# Relative location of the pickled normalizer; the (currently commented-out)
# data-loading cell joins it onto the user's home directory.
NORMALIZER_PATH = "multimodal-clinic-v2/multimodal_clinical_pretraining/resources/normalizer.pkl"

In [3]:
# Make the directory two levels above the working directory importable.
# The working directory is used in place of the script's own location
# (the `__file__`-based variant is left disabled).
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir, os.pardir, os.pardir))

# Guarded append: re-running the cell does not add a duplicate entry.
if rootDir not in sys.path:
    sys.path.append(rootDir)


In [4]:
def main(dataloader):
    """Generate text for the first two batches of notes and print the results.

    A ``text-generation`` pipeline is created from the local Llama-3.2-3B
    checkpoint. For every batch, ``build_prompt(batch["note_texts"])`` is fed
    to the pipeline, then the notes and the first generated text are printed.
    The loop stops after the batch at index 1, so at most two batches are
    consumed.

    Args:
        dataloader: Iterable yielding batches that can be indexed with
            ``"note_texts"``.

    Returns:
        None. Notes and generations are gathered in local lists, but they are
        neither returned nor written anywhere (the CSV export is disabled).

    NOTE(review): confirm that ``build_prompt`` accepts the note texts as an
    argument, and that the pipeline's return nesting matches the ``[0][0]``
    indexing used below.
    """
    model_name = "/ssd-shared/meta-llama/Llama-3.2-3B"

    print("creating pipeline")

    generator = pipeline(
        task="text-generation",
        model=model_name,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        return_full_text=False,
        max_new_tokens=512,
    )

    # Disabled experiment, kept for reference:
    # model_name.generation_config.pad_token_id = tokenizer.pad_token_id

    print("going to summarize notes")

    og_notes, summary = [], []

    for batch_idx, batch in enumerate(dataloader):
        notes = batch["note_texts"]
        outputs = generator(build_prompt(notes))
        print(notes)

        first_generation = outputs[0][0]
        print(first_generation["generated_text"])

        og_notes.append(notes)
        summary.append(first_generation)

        # Only the first two batches are processed.
        if batch_idx == 1:
            break

    # Export of the collected pairs is currently switched off:
    # df = pd.DataFrame({
    #     'OG Notes' : og_notes,
    #     'Summary' : summary
    # })
    # df.to_csv("Sid_notes_summary.csv")


In [5]:
# # activation_map = {"GELU": nn.GELU(), "ReLU": nn.ReLU(inplace=True)}

# # # Simulate command-line arguments
# # sys.argv = ['sid_llama_experiment']
# #             # , 
# #             # '--nproc_per_node', '1', 
# #             # '--master_port', '29580']

# # args = parser()
# # args.measurement_activation = activation_map[args.measurement_activation]
# # args.use_projector = True

# torch.backends.cudnn.benchmark = True
# torch.backends.cuda.matmul.allow_tf32 = True
# torch.backends.cudnn.deterministic = True

# #init_distributed_mode(args)

# seed = 42 #args.seed
# torch.manual_seed(seed) # + args.rank)
# random.seed( seed) #args.seed + args.rank)
# np.random.seed(seed )#+ args.rank)

# # You can use any of these by uncommenting the note types you want
# used_note_types = [
#     # "Echo",
#     # "ECG",
#     "Nursing",
#     "Physician ",
#     # "Rehab Services",
#     # "Case Management ",
#     # "Respiratory ",
#     # "Nutrition",
#     # "General",
#     # "Social Work",
#     # "Pharmacy",
#     # "Consult",
#     # "Radiology",
#     # "Nursing/other",
#     "Discharge summary",
# ]

# start = time.time()

# normalizer_path = os.path.join(os.path.expanduser("~"), NORMALIZER_PATH)

# # normalizer_path = os.path.join(
# #     os.getcwd(),
# #     "multimodal_clinical_pretraining/resources/normalizer.pkl",
# # )
# with open(normalizer_path, "rb") as file:
#     normalizer = pickle.load(file)

# train_episodes_list, test_episodes_list = get_data_pretraining(
#     "/ssd-shared/MIMIC_EICU_DATA/data/"
# )

# seq_len = (
#     args.measurement_max_seq_len - 3
#     if args.use_note_type_emb
#     else args.measurement_max_seq_len - 2
# )

# train_measurement_dataset = TripletMimicDataSet(
#     args.mimic_root,
#     train_episodes_list[:-2000],
#     normalizer.mean_var_dict,
#     "training",
#     seq_len,
#     seq_len,
# )
# val_measurement_dataset = TripletMimicDataSet(
#     args.mimic_root,
#     train_episodes_list[-2000:],
#     normalizer.mean_var_dict,
#     "validation",
#     seq_len,
#     seq_len,
# )
# test_measurement_dataset = TripletMimicDataSet(
#     args.mimic_root,
#     test_episodes_list,
#     normalizer.mean_var_dict,
#     "testing",
#     seq_len,
#     seq_len,
# )
# args.n_features = train_measurement_dataset.n_features

# end = time.time()
# print(f"Measurements Loaded in {end - start}s")

# print("Loading Notes")
# start = time.time()
# # train_notes_dataset = MIMICIIINoteDataset(
# #     root=args.mimic_root,
# #     split="train",
# #     max_seq_len=args.notes_max_seq_len,
# #     mask_rate=0,
# #     max_instances=10000,
# #     used_note_types=used_note_types,
# #     measurement_dataset=train_measurement_dataset,
# # )

# # val_notes_dataset = MIMICIIINoteDataset(
# #     root=args.mimic_root,
# #     split="valid",
# #     max_seq_len=args.notes_max_seq_len,
# #     mask_rate=0,
# #     max_instances=10000,
# #     used_note_types=used_note_types,
# #     measurement_dataset=val_measurement_dataset,
# # )

# test_notes_dataset = MIMICIIINoteDataset(
#     root=args.mimic_root,
#     split="test",
#     max_seq_len=args.notes_max_seq_len,
#     mask_rate=0,
#     max_instances=10000,
#     used_note_types=used_note_types,
#     measurement_dataset=test_measurement_dataset,
# )
# end = time.time()
# print(f"Notes Loaded in {end - start}s")

# # print(f"Length of training dataset = {len(train_notes_dataset)}")
# # print(f"Length of val dataset = {len(val_notes_dataset)}")
# print(f"Length of test dataset = {len(test_notes_dataset)}")


# assert args.batch_size % args.world_size == 0
# per_device_batch_size = args.batch_size // args.world_size

# # train_sampler = torch.utils.data.distributed.DistributedSampler(
# #     train_notes_dataset, shuffle=True
# # )

# # val_sampler = torch.utils.data.distributed.DistributedSampler(
# #     val_notes_dataset, shuffle=False
# # )

# test_sampler = torch.utils.data.distributed.DistributedSampler(
#     test_notes_dataset, shuffle=False
# )


# print(
#     "sampler trained."
# )

# # train_dataloader = DataLoader(
# #     train_notes_dataset,
# #     batch_size=per_device_batch_size,
# #     num_workers=32,
# #     collate_fn=multimodal_pad_collate_fn,
# #     pin_memory=True,
# #     drop_last=True,
# #     sampler=train_sampler,
# # )

# # val_dataloader = DataLoader(
# #     val_notes_dataset,
# #     batch_size=16,
# #     num_workers=32,
# #     collate_fn=multimodal_pad_collate_fn,
# #     pin_memory=True,
# #     sampler=val_sampler,
# # )

# test_dataloader = DataLoader(
#     test_notes_dataset,
#     batch_size=16,
#     num_workers=32,
#     collate_fn=multimodal_pad_collate_fn,
#     pin_memory=True,
#     sampler=test_sampler,
# )

# # main(test_dataloader)


In [6]:
def build_prompt(
    user_message="Tell me about yourself using atleast 10 sentences.",
    sys_message="You are an AI assistant that is not natively fluent in english. Speak like its your second language.",
):
    """Build a two-message chat prompt (system turn followed by user turn).

    Called without arguments it returns the same prompt as before; the
    parameters let other experiments in the notebook reuse the same structure
    with their own texts instead of re-creating the list by hand.

    Args:
        user_message: Content of the ``"user"`` message.
        sys_message: Content of the ``"system"`` message.

    Returns:
        A new list of two dicts, each with ``"role"`` and ``"content"`` keys,
        in the order system, user.
    """
    # TODO: ask for in-hospital mortality.
    return [
        {"role": "system", "content": sys_message},
        {"role": "user", "content": user_message},
    ]

In [2]:
# Notebook-level version of the pipeline setup from main(): the model is
# loaded once here so that later cells can call `pipe` directly.
# (The original call `main(test_dataloader)` is left disabled.)
#
# Other checkpoints that were tried:
#   "/ssd-shared/meta-llama/Llama-3.2-3B"
#   "/ssd-shared/meta-llama/Llama-3.2-90B-instruct"  -- takes 40 minutes to load and then does not work
model_name = "/ssd-shared/meta-llama/Llama-3.2-3B-Instruct"

pipe = pipeline(
    task="text-generation",
    model=model_name,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.bfloat16},
    return_full_text=False,
    max_new_tokens=512,
)

# Token ids gathered as generation terminators: the tokenizer's EOS id
# followed by the id of "<|eot_id|>".
# NOTE(review): `terminators` is not passed to any `pipe(...)` call in the
# cells visible here -- confirm whether it should be.
terminators = [pipe.tokenizer.eos_token_id]
terminators.append(pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>"))


notesBatch = None

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.36it/s]


In [8]:
# Run the prompt returned by build_prompt() through the pipeline.
# NOTE(review): `respones` is a misspelling of "responses"; the name is read
# again by the print cell that follows, so rename both together.
respones = pipe(build_prompt())

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [9]:
# Print the generated text of the first item returned by the pipeline.
print(respones[0]['generated_text'])

Me... me try. I am... uh, artificial... intelligence, yes. My creators make me to... help people, answer questions, and... provide information. I learn from big data, yes, and improve over time. I can speak many languages, but English not my first, no. I try my best to understand and respond, but sometimes... uh, mistake happen. My knowledge cutoff is 2023, so I no know about very recent events. I can provide information on many topics, from science and history to entertainment and culture. I am always learning, so if you ask me something I don't know, I try to find answer for you. I hope I can be... uh, helpful to you.


In [3]:
# Read the two CSV files (paths are relative to the working directory) into DataFrames.
true_df = pd.read_csv("True.csv")
fake_df = pd.read_csv("Fake.csv")

In [4]:
# Preview the first rows of true_df.
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
# Keep the 'text' entry with index label 0 from true_df as a sample.
true_sample = true_df['text'][0]
# len(sample_t)
# print(sample_t)
# Show the distinct values of the 'subject' column.
true_df['subject'].unique()

array(['politicsNews', 'worldnews'], dtype=object)

In [15]:
# Keep the 'text' entry with index label 0 from fake_df as a sample, then display it.
fake_sample = fake_df['text'][0]
fake_sample

'Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency won t ev

In [7]:
# Show the distinct values of the 'subject' column in fake_df.
fake_df['subject'].unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east'], dtype=object)

In [26]:
# System and user turns for the "rewrite in official language" experiment;
# the passage to rewrite is the sample taken from fake_df.
sys_message = "You are a writing AI assistant."
user_message = f" The following is a text passage. Re-write it using official language: {fake_sample}. "

# Chat-format prompt. `msg` and `message` both refer to the same list object.
message = [
    dict(role="system", content=sys_message),
    dict(role="user", content=user_message),
]
msg = message

In [27]:
# Display the assembled user message.
user_message

' The following is a text passage. Re-write it using official language: Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicabl

In [28]:
# Run the chat prompt held in `msg` through the pipeline and keep the result.
response = pipe(msg)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [24]:
# Display the generated text of the first item in `response`.
# NOTE(review): this cell's execution count (24) is lower than that of the
# cell assigning `response` (28), so the saved output may come from an
# earlier run -- re-run the notebook top to bottom to refresh it.
response[0]['generated_text']

'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support educat

In [30]:
# Display the full pipeline result.
response

[{'generated_text': "Here is a rewritten version of the text passage using official language:\n\nPresident Trump's New Year's message was met with widespread criticism. Instead of wishing Americans a Happy New Year, he felt compelled to address his detractors and the media outlets that have been critical of his administration. The former reality television star's tweet was seen as unbecoming of a head of state.\n\nIn a bizarre display of pettiness, President Trump extended a New Year's greeting to his adversaries, as well as those who have been vocal in their disapproval of his policies. His message was perceived as a reflection of his divisive rhetoric and lack of decorum.\n\nThe tweet sparked a flurry of responses, with many of Trump's critics expressing their disdain for his behavior. Some commentators noted that the President's message was not only unbecoming but also a stark reminder of his inability to rise above the fray and demonstrate a sense of civility.\n\nThe criticism of T

In [19]:
# Length of the sample taken from true_df.
len(true_sample)

4659

In [None]:
[{'generated_text': "Here is a rewritten version of the text passage using official language:\n\nPresident Trump's New Year's message was met with widespread criticism. Instead of wishing Americans a Happy New Year, he felt compelled to address his detractors and the media outlets that have been critical of his administration. The former reality television star's tweet was seen as unbecoming of a head of state.\n\nIn a bizarre display of pettiness, President Trump extended a New Year's greeting to his adversaries, as well as those who have been vocal in their disapproval of his policies. His message was perceived as a reflection of his divisive rhetoric and lack of decorum.\n\nThe tweet sparked a flurry of responses, with many of Trump's critics expressing their disdain for his behavior. Some commentators noted that the President's message was not only unbecoming but also a stark reminder of his inability to rise above the fray and demonstrate a sense of civility.\n\nThe criticism of Trump's tweet was not new, as he has a history of using his social media platforms to address his detractors and engage in petty disputes. His behavior has been widely condemned by many, who see it as a reflection of his lack of maturity and his failure to demonstrate a sense of presidential decorum.\n\nIt is worth noting that the President's behavior is not representative of the values and principles of the American people. Many Americans have expressed disappointment and frustration with Trump's behavior, and some have even called for him to resign.\n\nIn the context of his presidency, Trump's behavior has been characterized as divisive, petulant, and lacking in dignity. 
His inability to rise above the fray and demonstrate a sense of civility has been a source of concern for many Americans, who are seeking a leader who can bring people together and promote a sense of unity and understanding.\n\nThe criticism of Trump's tweet has also highlighted the importance of a president's words and actions. A president's behavior can have a profound impact on the country and its people, and it is essential that they demonstrate a sense of dignity, respect, and decorum at all times."}]

In [20]:
# Length of the generated text of the first item in `response`.
len(response[0]['generated_text'])

2549