In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from missingno import matrix
# Directory (relative to the working directory) that holds the CSV inputs/outputs.
DATA_PATH = Path("data")
# Create it up front so the load/save helpers can rely on it existing.
DATA_PATH.mkdir(exist_ok=True, parents=True)

def load_data(filename, data_path: Path = DATA_PATH, encoding: str = 'ISO-8859-1') -> pd.DataFrame:
    """Read a CSV file from ``data_path`` into a DataFrame.

    Parameters
    ----------
    filename : name of the CSV file, joined onto ``data_path``.
    data_path : directory containing the file (defaults to ``DATA_PATH``).
    encoding : text encoding passed to ``pandas.read_csv``.

    Returns
    -------
    The DataFrame produced by ``pandas.read_csv``.
    """
    return pd.read_csv(data_path / filename, encoding=encoding)

def save_data(data, filename, data_path=DATA_PATH, encoding='ISO-8859-1'):
    """Write ``data`` to ``data_path / filename`` as CSV, without the index column.

    Parameters
    ----------
    data : object exposing ``to_csv`` (a pandas DataFrame in this notebook).
    filename : name of the CSV file, joined onto ``data_path``.
    data_path : destination directory (defaults to ``DATA_PATH``).
    encoding : text encoding used for the written file. Defaults to
        ``'ISO-8859-1'``, matching the default of ``load_data``.

    Returns
    -------
    None. The file is written as a side effect.
    """
    csv_path = data_path / filename
    # Honour the caller-supplied encoding; it used to be silently ignored in
    # favour of a hard-coded 'ISO-8859-1'.
    data.to_csv(csv_path, index=False, encoding=encoding)

# Directory (relative to the working directory) where save_fig() writes figures.
PLOT_PATH = Path("plot")
# Create it up front so saving a figure never fails on a missing folder.
PLOT_PATH.mkdir(exist_ok=True, parents=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300, transparent=True):
    """Save the current pyplot figure as ``PLOT_PATH / "<fig_id>.<fig_extension>"``.

    Parameters
    ----------
    fig_id : identifier used as the file stem and echoed in the progress message.
    tight_layout : when truthy, call ``plt.tight_layout()`` before saving.
    fig_extension : file extension, also passed to ``plt.savefig`` as ``format``.
    resolution : value passed to ``plt.savefig`` as ``dpi``.
    transparent : value passed to ``plt.savefig`` as ``transparent``.
    """
    target = PLOT_PATH.joinpath(f"{fig_id}.{fig_extension}")
    print("Saving figure", fig_id)
    if tight_layout:
        # Layout must be adjusted before the figure is written out.
        plt.tight_layout()
    plt.savefig(
        target,
        format=fig_extension,
        dpi=resolution,
        transparent=transparent,
    )

In [None]:
!pip install --upgrade git+https://github.com/UKPLab/sentence-transformers
!pip install keybert ctransformers[cuda]
!pip install --upgrade git+https://github.com/huggingface/transformers

In [3]:
from ctransformers import AutoModelForCausalLM

# Load the Q4_K_M GGUF file of Mistral-7B-Instruct v0.1 through ctransformers.
# gpu_layers is the number of layers offloaded to the GPU; use 0 when no GPU
# acceleration is available on your system.
# NOTE(review): hf=True presumably yields a transformers-compatible model, since
# `model` is later handed to transformers.pipeline - confirm against ctransformers docs.
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    model_type="mistral",
    model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    hf=True,
    gpu_layers=50,
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00,  2.78it/s]
Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
from transformers import AutoTokenizer, pipeline

# Tokenizer is fetched by repository id "mistralai/Mistral-7B-Instruct-v0.1".
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

# Text-generation pipeline combining the loaded `model` with that tokenizer;
# each call generates at most 50 new tokens with a repetition penalty of 1.1.
generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    repetition_penalty=1.1,
    max_new_tokens=50,
)

In [None]:
# Few-shot prompt in Mistral [INST] ... [/INST] format:
#   1. a worked example (document -> expected answer "OpenAI"),
#   2. the same instruction again with a [DOCUMENT] placeholder.
# NOTE(review): [DOCUMENT] is expected to be substituted with each input document
# by KeyBERT's TextGeneration wrapper - confirm against the KeyBERT docs.
# The apostrophes in the example were mojibake ("‚Äô") and are restored to "’".
prompt = """<s>[INST]
I have the following document:
- OpenAI’s mission is to ensure that artificial general intelligence benefits all of humanity. We’re hiring: https://t.co/dJGr6LgzPA

Please give me the keywords only about company name that are present in this document and separate them with commas. If this info is missing, please return N/A.
Make sure you to only return the keywords and say nothing else. For example, don't say:
"Here are the keywords present in the document"
[/INST]OpenAI</s>
[INST]

I have the following document:
- [DOCUMENT]

Please give me the keywords only about company name that are present in this document and separate them with commas. If this info is missing, please return N/A.
Make sure you to only return the keywords and say nothing else. For example, don't say:
"Here are the keywords present in the document"
[/INST]"""

In [None]:
# Example documents from which company names are to be extracted.
# The third entry previously contained mojibake ("‚Äô") in place of the
# apostrophe "’"; the intended characters are restored here.
documents = [
    "CEO,  Google and Alphabet",
    "Entrepreneur, philanthropist, UN Special Envoy for Climate Ambition & Solutions, WHO Amb. for NCDs & Injuries, mayor of NYC, father, grandfather, and data nerd.",
    "OpenAI’s mission is to ensure that artificial general intelligence benefits all of humanity. We’re hiring: https://t.co/dJGr6LgzPA",
    "CEO @SocialCapital CEO @hustle Bestie @theallinpod Learn with me: https://t.co/PSBNs9US6o",
    "Chairman and CEO Dell Technologies Grateful #PlayNiceButWin https://t.co/3JiGiLPSc4",
]

In [None]:
from keybert.llm import TextGeneration
from keybert import KeyLLM

In [None]:
# Load it in KeyLLM:
# wrap the transformers pipeline (`generator`) together with the custom prompt in
# KeyBERT's TextGeneration object, then pass that object to KeyLLM. The resulting
# `kw_model` is what extract_keywords() is called on in the next cell.
llm = TextGeneration(generator, prompt=prompt)
kw_model = KeyLLM(llm)

In [None]:
# Extract keywords for every entry in `documents`, then show them as the cell output.
keywords = kw_model.extract_keywords(documents)
keywords