# Importing Required Libraries

In [1]:
import mlflow
from langchain_core.prompts import PromptTemplate
from mlflow.models.signature import infer_signature
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
import torch
import huggingface_hub
from transformers import BitsAndBytesConfig

huggingface_hub.login("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")

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/miniconda3/envs/paperpal/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/miniconda3/envs/paperpal/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/miniconda3/envs/paperpal/lib/python3.12/site-packages/ipykernel/kernela

# Loading the Test Data

In [10]:
para1 = "The Reddit dataset is a graph dataset from Reddit posts made in the month of September, 2014. The node label in this case is the community, or “subreddit”, that a post belongs to. 50 large communities have been sampled to build a post-to-post graph, connecting posts if the same user comments on both. In total this dataset contains 232,965 posts with an average degree of 492. The first 20 days are used for training and the remaining days for testing. For features, off-the-shelf 300-dimensional GloVe CommonCrawl word vectors are used."

# Loading the Model

In [3]:
# device = "cuda" if torch.cuda.is_available() else "cpu"

# # Configure 4-bit quantization
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16
# )

# if device == "cuda":
#     model_name = "meta-llama/Llama-3.2-1B-Instruct"
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModelForCausalLM.from_pretrained(
#         model_name,
#         quantization_config=quantization_config,
#         device_map="auto"
#     )
# else:
#     model_name = "google/flan-t5-small"
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
#     model = model.to(device)

In [48]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# Doing Experimentation

In [5]:
template = "Summarize the paragraph: {input_paragraph} Answer should contain only summary and nothing else."
template_input_variables = ["input_paragraph"]
model_input = dict()
model_input["input_paragraph"] = para1
prompt_template = PromptTemplate(
            input_variables=template_input_variables,
            template=template,
        )
formatted_prompt = prompt_template.format(**model_input)

In [6]:
inputs = tokenizer(formatted_prompt, return_tensors="pt", truncation=True)
outputs = model.generate(**inputs, max_length=512, num_beams=2, early_stopping=True)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

The Reddit dataset contains 232,965 posts with an average degree of 492.


# Mlflow

In [49]:
import os
os.environ["TOKENI?ZERS_PARALLELISM"] = "false"

In [50]:
import mlflow.pyfunc
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import huggingface_hub

# MLflow tracking
import mlflow
from mlflow.models.signature import infer_signature
import pandas as pd

# Login to Hugging Face
huggingface_hub.login("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")

In [51]:
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model details
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

In [52]:
# Define a custom PythonModel wrapper
class TextSummarizationModel(mlflow.pyfunc.PythonModel):
    def __init__(self, model_name, device):
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, max_length = 500)
        self.device = device

    def predict(self, context, model_input):

        input_paragraphs= model_input["input_paragraph"]
        formatted_prompts = []

        for cur_input_paragraph in input_paragraphs:


            formatted_prompt = f"Summarize the paragraph: {cur_input_paragraph} Answer should contain only summary and nothing else."

            formatted_prompts.append(formatted_prompt)
        

        inputs = self.tokenizer(formatted_prompts, return_tensors="pt", truncation=True, padding=True)
        inputs = {key: value.to (self.device) for key, value in inputs.items()}

        # Generate summary
        outputs = self.model.generate(**inputs, max_length=512, num_beams=2, early_stopping=True)
        summaries = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
        return summaries

# Create an instance of the custom model
wrapped_model = TextSummarizationModel(model_name = "google/flan-t5-small", device=device)

In [53]:
# example_input = pd.DataFrame({"input_paragraph":[para1]})
# example_output = wrapped_model.predict(None, example_input)
# example_output

In [54]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Text Summarization Experiment")

2025/01/02 01:47:33 INFO mlflow.tracking.fluent: Experiment with name 'Text Summarization Experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow-experiments-663/938096856342841282', creation_time=1735800453063, experiment_id='938096856342841282', last_update_time=1735800453063, lifecycle_stage='active', name='Text Summarization Experiment', tags={}>

In [55]:
with mlflow.start_run(run_name="initial summarization experiment-4"):
    # Log parameters
    mlflow.log_param("model_name", model_name)
    mlflow.log_param("device", device)

    # Infer signature: Input is a DataFrame with a column "input_paragraph"
    example_input = pd.DataFrame({"input_paragraph":[para1, "Manav is writing his own bibiliography."]})
    example_output = wrapped_model.predict(None, example_input)
    signature = infer_signature(example_input, example_output)

    # Log model
    mlflow.pyfunc.log_model(
        artifact_path="summarization_model",
        python_model=wrapped_model,
        # signature=signature,
        registered_model_name="SummarizationModel",
        input_example=example_input,
    )

print("Model logged successfully.")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Successfully registered model 'SummarizationModel'.
2025/01/02 01:48:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: SummarizationModel, version 1


🏃 View run initial summarization experiment-4 at: http://127.0.0.1:5000/#/experiments/938096856342841282/runs/8ff9be5569f24ec99f879d0428868625
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/938096856342841282
Model logged successfully.


Created version '1' of model 'SummarizationModel'.


# Test the Registered Model

In [44]:
model_uri = "models:/SummarizationModel/4"  # Update with your model name/version
model = mlflow.pyfunc.load_model(model_uri)

Downloading artifacts: 100%|██████████| 7/7 [00:09<00:00,  1.29s/it]


In [45]:
model_input = example_input

In [46]:
# model_input = {"input_paragraph": para1}
summary = model.predict(model_input)

In [47]:
summary

['The Reddit dataset contains 232,965 posts with an average degree of 492.',
 'Manav is writing his own bibiliography.']

# Serve the Model on SageMaker

In [15]:
# Necessary details
# instance_type='ml.g4dn.2xlarge' 
# region = 'us-east-1' 
# model_uri = "s3://mlflow-experiments-663/938096856342841282/8ff9be5569f24ec99f879d0428868625/artifacts/summarization_model/"  # Replace with your actual S3 path where the model is stored
    # role="arn:aws:iam::253490781721:role/SageMakerForPaperPal",  # Provide the IAM Role ARN

    # initial_instance_count=1,  # Number of instances to deploy


In [12]:
import mlflow
import mlflow.sagemaker
import sagemaker
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorchModel

from mlflow.deployments import get_deploy_client

In [15]:


# vpc_config = {
#     "SecurityGroupIds": [
#         "sg-123456abc",
#     ],
#     "Subnets": [
#         "subnet-123456abc",
#     ],
# }
config = dict(
    # assume_role_arn="arn:aws:iam::253490781721:user/PaperPal",
    execution_role_arn="arn:aws:iam::253490781721:role/SageMakerForPaperPal",
    bucket_name="mlflow-experiments-663",
    image_url = "253490781721.dkr.ecr.us-east-1.amazonaws.com/paperpal-mlflow-models:latest",
    region_name="us-east-1",
    archive=False,
    instance_type="ml.g4dn.2xlarge",
    instance_count=1,
    synchronous=True,
    timeout_seconds=600,
    # vpc_config=vpc_config,
    variant_name="prod-variant-1",
    env={"DISABLE_NGINX": "true", "GUNICORN_CMD_ARGS": '"--timeout 60"'},
    tags={"training_timestamp": "2022-11-01T05:12:26"},
)
client = get_deploy_client("sagemaker")
client.create_deployment(
    "summariation-model-deployment",
    model_uri = "s3://mlflow-experiments-663/938096856342841282/8ff9be5569f24ec99f879d0428868625/artifacts/summarization_model/",
    # flavor="python_function",
    config=config,
)

Downloading artifacts:  86%|████████▌ | 6/7 [00:13<00:00,  7.83it/s]

Downloading artifacts: 100%|██████████| 7/7 [01:34<00:00, 13.50s/it]
2025/01/03 07:09:22 INFO mlflow.sagemaker: Using the python_function flavor for deployment!
2025/01/03 07:09:23 INFO mlflow.sagemaker: No model data bucket specified, using the default bucket


2025/01/03 07:09:25 INFO mlflow.sagemaker: Default bucket `mlflow-sagemaker-us-east-1-253490781721` already exists. Skipping creation.


2025/01/03 07:10:31 INFO mlflow.sagemaker: tag response: {'ResponseMetadata': {'RequestId': '9M36A86S62VAD48Y', 'HostId': 'uNmBpTFXsOXIxgHE0dn1nUQcuMj/skhuE9QtBGocp45y1EE0olvDLh/Xbf5RxYT5ZRFMNEoKBknniZYjCr2QwxEYcxBX+Zf602KeHp57gqg=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'uNmBpTFXsOXIxgHE0dn1nUQcuMj/skhuE9QtBGocp45y1EE0olvDLh/Xbf5RxYT5ZRFMNEoKBknniZYjCr2QwxEYcxBX+Zf602KeHp57gqg=', 'x-amz-request-id': '9M36A86S62VAD48Y', 'date': 'Fri, 03 Jan 2025 12:10:32 GMT', 'content-length': '0', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}
2025/01/03 07:10:31 INFO mlflow.sagemaker: Creating new endpoint with name: summariation-model-deployment ...


In [11]:
import pandas as pd
from mlflow.deployments import get_deploy_client

df = pd.DataFrame({"input_paragraph":[para1]})
client = get_deploy_client("arn:aws:iam::253490781721:role/SageMakerForPaperPal")
client.predict("summariation-model-deployment", df)
