In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from transformers.utils import logging

logging.set_verbosity(40)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load model from Hugging Face using the transformers library
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-xsum")



In [3]:
pipe = pipeline(
  "summarization", 
  model=model, 
  tokenizer=tokenizer, 
  max_new_tokens=1024, 
  device_map='auto', 
  truncation=True
)

In [4]:
text_to_summarize="""Barrington DeVaughn Hendricks (born October 22, 1989), known professionally as JPEGMafia (stylized in all caps), is an American rapper, singer, and record producer born in New York City and based in Baltimore, Maryland. His 2018 album Veteran, released through Deathbomb Arc, received widespread critical acclaim and was featured on many year-end lists. It was followed by 2019's All My Heroes Are Cornballs and 2021's LP!, released to further critical acclaim."""

pipe(text_to_summarize)

[{'summary_text': 'Mafia is an American rapper, singer, and record producer.'}]

In [5]:
from datasets import load_dataset
from transformers import pipeline

xsum_dataset = load_dataset(
    "xsum", version="1.2.0"
)

In [6]:
xsum_sample = xsum_dataset["train"].select(range(10))
display(xsum_sample.to_pandas())

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984
5,Simone Favaro got the crucial try with the las...,Defending Pro12 champions Glasgow Warriors bag...,34540833
6,"Veronica Vanessa Chango-Alverez, 31, was kille...",A man with links to a car that was involved in...,20836172
7,Belgian cyclist Demoitie died after a collisio...,Welsh cyclist Luke Rowe says changes to the sp...,35932467
8,"Gundogan, 26, told BBC Sport he ""can see the f...",Manchester City midfielder Ilkay Gundogan says...,40758845
9,The crash happened about 07:20 GMT at the junc...,A jogger has been hit by an unmarked police ca...,30358490


In [7]:
import pandas as pd
batch_classification_results = pipe(xsum_sample["document"], num_beams=10)

joined_data = pd.DataFrame.from_dict(batch_classification_results)\
    .rename({"summary_test": "model_summary"}, axis=1)\
    .join(pd.DataFrame.from_dict(xsum_sample))

In [8]:
display(joined_data[["document", "summary_text", "summary"]])

Unnamed: 0,document,summary_text,summary
0,"The full cost of damage in Newton Stewart, one...",A clean-up operation is under way in parts of ...,Clean-up operations are continuing across the ...
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed in a sus...,Two tourist buses have been destroyed by fire ...
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton beat Mercedes team-mate Nico Ro...,Lewis Hamilton stormed to pole position at the...
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer has gone ...,A former Lincolnshire Police officer carried o...
4,Patients and staff were evacuated from Cerahpa...,Turkish police have ended a stand-off with an ...,An armed man who locked himself into a room at...
5,Simone Favaro got the crucial try with the las...,Glasgow moved up to second in the Pro12 table ...,Defending Pro12 champions Glasgow Warriors bag...
6,"Veronica Vanessa Chango-Alverez, 31, was kille...",A man police want to trace in connection with ...,A man with links to a car that was involved in...
7,Belgian cyclist Demoitie died after a collisio...,Welsh cyclist Luke Rowe has called for a speed...,Welsh cyclist Luke Rowe says changes to the sp...
8,"Gundogan, 26, told BBC Sport he ""can see the f...",Manchester City midfielder Ilkay Gundogan says...,Manchester City midfielder Ilkay Gundogan says...
9,The crash happened about 07:20 GMT at the junc...,The Independent Police Complaints Commission (...,A jogger has been hit by an unmarked police ca...


In [12]:
import mlflow
from mlflow.models import infer_signature
from mlflow.transformers import generate_signature_output
"""
For LLMs, we need to generate a model signature: https://mlflow.org/docs/latest/models.html#model-signature-and-input-example
Model signatures show the expected input and output types for a model. Which makes quality assurance for downstream serving easier
"""
#use our original text as an example input
data = text_to_summarize
#generate a summary for the output example
output = generate_signature_output(pipe, data)
#infer the signature based on model inputs and outputs
signature = infer_signature(data, output)

In [15]:
experiment_name = f"genai-intro-workshop"
mlflow.set_experiment(experiment_name)

#set the name of our model
model_name = "jpeg-mafia"

#get experiment id to pass to the run
experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
with mlflow.start_run(experiment_id=experiment_id):
  mlflow.transformers.log_model(pipe, model_name, signature=signature, input_example=data)

2024/05/15 22:35:48 INFO mlflow.tracking.fluent: Experiment with name 'genai-intro-workshop' does not exist. Creating a new experiment.
  mlflow.transformers.log_model(pipe, model_name, signature=signature, input_example=data)
  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


[]

In [23]:
run_id = mlflow.search_runs([experiment_id]).iloc[0]['run_id']

In [27]:
run_id

'081c3469aab14302be87b6ba7efeaab2'

In [25]:
mlflow.register_model(f"runs:/{run_id}/model", 'test_model')

Successfully registered model 'test_model'.
Created version '1' of model 'test_model'.


<ModelVersion: aliases=[], creation_timestamp=1715834483607, current_stage='None', description=None, last_updated_timestamp=1715834483607, name='test_model', run_id='081c3469aab14302be87b6ba7efeaab2', run_link=None, source='file:///home/smckean/Repos/microrag/notebooks/mlruns/296287261287392225/081c3469aab14302be87b6ba7efeaab2/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [28]:
mlflow.search_registered_models()

[<RegisteredModel: aliases={}, creation_timestamp=1715834483605, description=None, last_updated_timestamp=1715834483607, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1715834483607, current_stage='None', description=None, last_updated_timestamp=1715834483607, name='test_model', run_id='081c3469aab14302be87b6ba7efeaab2', run_link=None, source='file:///home/smckean/Repos/microrag/notebooks/mlruns/296287261287392225/081c3469aab14302be87b6ba7efeaab2/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>], name='test_model', tags={}>]

In [32]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import pdf_loader

ImportError: cannot import name 'pdf_loader' from 'langchain.document_loaders' (/home/smckean/Repos/microrag/microrag_env/lib/python3.10/site-packages/langchain/document_loaders/__init__.py)