If you don't have a GPU, run this notebook on Google Colab with a GPU runtime enabled.

In [None]:
# Dependency installation. Uncomment and run these lines once in a fresh
# environment (e.g. a new Colab session) before executing the cells below.
# !pip install datasets transformers bitsandbytes
# !pip install -U openai opencv-python moviepy
# !pip install peft
# !pip install flask pyngrok

Register your ngrok authentication token (replace the placeholder below with the token from your ngrok account)

In [None]:
# Shell escape: passes the auth token to the ngrok CLI.
# YOUR_TOKEN_HERE is a placeholder -- substitute your own token and avoid
# committing the notebook with a real token in this cell.
!ngrok authtoken YOUR_TOKEN_HERE

In [None]:
import os
from getpass import getpass

import torch
from flask import Flask, request, jsonify
from huggingface_hub.hf_api import HfFolder
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

In [None]:
# Take the Hugging Face token from the HF_TOKEN environment variable, falling
# back to a hidden interactive prompt, so the secret is never written into the
# notebook source.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
HfFolder.save_token(hf_token)

Initialize and load model

In [None]:
def initialize_model(base_model_id, bnb_config):
    """Load a quantized causal language model together with its tokenizer.

    Args:
        base_model_id: Model identifier (or path) handed to ``from_pretrained``
            for both the model and the tokenizer.
        bnb_config: Quantization settings forwarded as ``quantization_config``.

    Returns:
        A ``(model, tokenizer)`` tuple. The returned tokenizer always has a
        pad token set.
    """
    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=bnb_config
    )

    tokenizer = AutoTokenizer.from_pretrained(
        base_model_id,
        add_bos_token=True
    )

    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            # Reuse EOS as the pad token: its id already exists in the model's
            # embedding matrix, so no new (out-of-range) token id is created.
            tokenizer.pad_token = tokenizer.eos_token
        else:
            # No EOS to reuse: add a dedicated pad token and grow the
            # embeddings so the new id is valid for the model.
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            model.resize_token_embeddings(len(tokenizer))

    return model, tokenizer

In [None]:
# Checkpoint that both the model and the tokenizer are loaded from.
base_model_id = "mistralai/Mistral-7B-v0.1"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

model, tokenizer = initialize_model(
    base_model_id=base_model_id,
    bnb_config=bnb_config,
)

Load Summarizer

In [None]:
# Build the summarization pipeline once so every request reuses the same
# loaded model; device index 0 is passed straight through to the pipeline.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
    device=0,
)


def summarizer_func(article: str):
    """Summarize ``article`` with the module-level ``summarizer`` pipeline.

    Args:
        article: Text to summarize.

    Returns:
        The summary text of the first pipeline result followed by a blank
        line (``"\\n\\n"``).
    """
    summary = summarizer(
        article,
        max_length=200,
        min_length=100,
        do_sample=False,
        # Cut over-long inputs down to the model's maximum input length
        # instead of failing on documents that exceed it.
        truncation=True,
    )

    return summary[0]["summary_text"] + "\n\n"

Create Flask endpoints

In [None]:
from pyngrok import ngrok
from flask import Flask, request, jsonify


# Flask application object; the @app.route handlers below register on it.
app = Flask(__name__)

@app.route('/')
def home():
    """Root route: respond with a fixed plain-text greeting."""
    greeting = "Hello, this is a test API."
    return greeting

@app.route("/generate_response", methods=['POST'])
def generate_response():
    """Generate text for a posted prompt with the loaded causal language model.

    Expects a JSON object body with:
        prompt (str): Text to continue. Required and must not be blank.
        max_new_tokens (int, optional): Cap on newly generated tokens
            (default 100, must be >= 1).
        temperature (float, optional): Sampling temperature (default 0.7).
            A value of 0 selects greedy decoding.

    Returns:
        ``{"response": <text>}`` as JSON. When the decoded text contains the
        word "Response", only the part after its first occurrence is
        returned; otherwise the whole decoded text is returned.
        Invalid input yields ``{"error": <message>}`` with HTTP status 400.
    """
    # silent=True: a missing or malformed JSON body yields None instead of
    # raising, so the client gets a JSON error rather than a crash.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    prompt = data.get("prompt", "")
    if not isinstance(prompt, str) or not prompt.strip():
        return jsonify({"error": "'prompt' must be a non-empty string."}), 400

    try:
        max_new_tokens = int(data.get("max_new_tokens", 100))
        temperature = float(data.get("temperature", 0.7))
    except (TypeError, ValueError):
        return jsonify({"error": "'max_new_tokens' must be an integer and 'temperature' a number."}), 400

    if max_new_tokens < 1 or temperature < 0:
        return jsonify({"error": "'max_new_tokens' must be >= 1 and 'temperature' must be >= 0."}), 400

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": 1.15,
        "eos_token_id": tokenizer.eos_token_id,
        # Pad with EOS explicitly so generate() does not depend on a pad id
        # being configured on the model.
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        # Temperature only influences the output when sampling is enabled.
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature
    else:
        generation_kwargs["do_sample"] = False

    # Place the inputs on whatever device the model was loaded onto.
    model_input = tokenizer(prompt, return_tensors="pt").to(model.device)
    model.eval()
    with torch.no_grad():
        generated_tokens = model.generate(**model_input, **generation_kwargs)

    generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True).strip()

    # Extract the part of the text after "Response"
    split_text = generated_text.split("Response", 1)

    if len(split_text) > 1:
        return jsonify({"response": split_text[1].strip()})
    return jsonify({"response": generated_text})


@app.route("/summarize_the_pdf", methods=['POST'])
def summarize_the_pdf():
    """Summarize the text posted under the ``prompt`` key.

    Expects a JSON object body with:
        prompt (str): Text to summarize. Required and must not be blank.

    Returns:
        ``{"response": <summary>}`` as JSON, where the summary comes from
        ``summarizer_func``. Invalid input yields ``{"error": <message>}``
        with HTTP status 400.
    """
    # silent=True: a missing or malformed JSON body yields None instead of
    # raising, so the client gets a JSON error rather than a crash.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    prompt = data.get("prompt", "")
    if not isinstance(prompt, str) or not prompt.strip():
        return jsonify({"error": "'prompt' must be a non-empty string."}), 400

    response = summarizer_func(prompt)

    return jsonify({"response": response})

if __name__ == '__main__':
    # Single source of truth for the port shared by the tunnel and the server.
    port = 5000

    # Setup ngrok tunnel
    tunnel = ngrok.connect(port)
    # ngrok.connect may return a tunnel object rather than a plain URL string;
    # read its public_url attribute when present so the printed line shows the
    # URL instead of the object's repr.
    public_url = getattr(tunnel, "public_url", tunnel)
    print(" * ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}\"".format(public_url, port))
    app.run(port=port)