In [1]:
%%capture
!pip install --upgrade git+https://github.com/UKPLab/sentence-transformers
!pip install keybert ctransformers[cuda]
!pip install --upgrade git+https://github.com/huggingface/transformers
!pip install PyPDF2
!pip install pandas
!pip install re

In [2]:
import PyPDF2
import pandas as pd
import re

with open('An Experimental Evaluation on Deepfake Detection using Deep Face Recognition.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    data = {'Page Number': [], 'Text': []}
    all_text = ""

    for i, page in enumerate(reader.pages):
        text = page.extract_text()
        if text:
            data['Page Number'].append(i + 1)
            data['Text'].append(text)
            all_text += text + " "
        else:
            data['Page Number'].append(i + 1)
            data['Text'].append("")

df = pd.DataFrame(data)

In [3]:
def group_text_by_pages(df, pages_per_group=1):
    grouped_texts = []
    total_pages = len(df)

    for start in range(0, total_pages, pages_per_group):
        end = start + pages_per_group
        grouped_text = " ".join(df['Text'][start:end])

        grouped_text = re.sub(r'[^\x00-\x7F]+', ' ', grouped_text)
        grouped_text = re.sub(r"[\"'‘’]", '', grouped_text)
        grouped_text = re.sub(r'\s+', ' ', grouped_text).strip()

        grouped_texts.append(grouped_text)

    return grouped_texts


grouped_texts_list = group_text_by_pages(df, 1)

In [4]:
from ctransformers import AutoModelForCausalLM,AutoConfig

config = AutoConfig.from_pretrained("TheBloke/Mistral-7B-v0.1-GGUF")
config.config.max_new_tokens = 2048
config.config.context_length = 4096


# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    model_type="mistral",
    gpu_layers=200,
    device_map="auto",
    hf=True,
    config=config
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

mistral-7b-instruct-v0.1.Q4_K_M.gguf:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [5]:
from transformers import AutoTokenizer, pipeline
from huggingface_hub import login

login(token="---")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

# Pipeline
generator = pipeline(
    model=model.to('cuda'), tokenizer=tokenizer,
    task='text-generation',
    max_new_tokens=50,
    repetition_penalty=1.1
)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [6]:
example_prompt = """
<s>[INST]
I have the following document:
- The website mentions that it only takes a couple of days to deliver but I still have not received mine.

Please give me the keywords that are present in this document and separate them with commas.
Make sure you to only return the keywords and say nothing else. For example, don't say:
"Here are the keywords present in the document"
[/INST] meat, beef, eat, eating, emissions, steak, food, health, processed, chicken</s>"""

In [7]:
keyword_prompt = """
[INST]

I have the following document:
- [DOCUMENT]

Please give me the keywords that are present in this document and separate them with commas.
Make sure you to only return the keywords and say nothing else. For example, don't say:
"Here are the keywords present in the document"
[/INST]
"""

In [8]:
prompt = example_prompt + keyword_prompt
print(prompt)


<s>[INST]
I have the following document:
- The website mentions that it only takes a couple of days to deliver but I still have not received mine.

Please give me the keywords that are present in this document and separate them with commas.
Make sure you to only return the keywords and say nothing else. For example, don't say:
"Here are the keywords present in the document"
[/INST] meat, beef, eat, eating, emissions, steak, food, health, processed, chicken</s>
[INST]

I have the following document:
- [DOCUMENT]

Please give me the keywords that are present in this document and separate them with commas.
Make sure you to only return the keywords and say nothing else. For example, don't say:
"Here are the keywords present in the document"
[/INST]


In [9]:
from keybert.llm import TextGeneration
from keybert import KeyLLM

# Load it in KeyLLM
llm = TextGeneration(generator, prompt=prompt)
kw_model = KeyLLM(llm)

In [10]:
keywords_list = []
for grouped_text in grouped_texts_list:
    keywords = kw_model.extract_keywords(grouped_text);
    keywords_list = keywords_list + keywords
    print('OK')
keywords_list

OK
OK
OK
OK
OK
OK


[['deepfakes',
  'deep learning',
  'computer vision',
  'biometrics',
  'face recognition',
  'synthesized media',
  'misinformation',
  'political threat',
  'social threat',
  'security risk',
  'AI threat',
  'facial modulations',
  'data-driven deep learning',
  'biometric anti'],
 ['high performance',
  'intra-dataset evaluation',
  'AUC',
  '0.99',
  'cross-dataset evaluation',
  'AUC',
  '0.70',
  'media articles',
  'biometric technology',
  'deepfakes',
  'identity',
  'swapping',
  'facial recognition'],
 ['NeuralTextures',
  'patch-based',
  'GAN-loss',
  'mouth-related',
  'facial expression',
  'deepfake',
  'Celeb-DF',
  'FaceForensics++',
  'identity swapping',
  'FaceSwap',
  'FaceShifter',
  'Deepf'],
 ['FaceSwap',
  'Deepfakes',
  'NeuralTextures',
  'MS1M-AF',
  'WebFace12M',
  'AUC',
  'EER',
  'ResNet-50',
  'FaceForensics++',
  'Celeb-DF',
  'Face-'],
 ['Xray model',
  'Celeb-DF dataset',
  'Face-Xray model',
  'behavioral biometrics',
  'facial expression',
  'h

In [11]:
k = []
for i in keywords_list:
    k = k+i
    
combined_list = list(set(k)) 

def find_terms_in_pages(df, terms):
    term_pages = {}

    for term in terms:
        found_pages = []
        for index, row in df.iterrows():
            if re.search(r'\b' + re.escape(term) + r'\b', str(row['Text']), re.IGNORECASE):
                if row['Page Number'] not in found_pages:
                    found_pages.append(row['Page Number'])
        
        # Add to the dictionary only if pages were found
        if found_pages:
            term_pages[term] = found_pages

    return term_pages

terms_pages = find_terms_in_pages(df, combined_list)

for term, pages in terms_pages.items():
    print(f"{term}: {pages}")

misinformation: [1]
cross-dataset evaluation: [2]
facial recognition: [1, 2, 5]
EER: [1, 3, 4, 5]
intra-dataset evaluation: [2]
Face-Xray model: [5]
ResNet-50: [3, 4]
GAN-loss: [3]
deep learning: [1]
identity: [2, 3, 4, 5, 6]
expression swapping: [1, 2, 4, 5]
deepfake: [1, 2, 3, 4, 5, 6]
identity swapping: [2, 3, 4, 5]
Celeb-DF dataset: [1, 5]
AUC: [1, 2, 3, 4, 5]
head-pose movement: [5]
WebFace12M: [3, 4, 5]
biometric technology: [5]
mouth-related: [3]
Deepfakes: [1, 2, 3, 4, 5, 6]
Face2Face: [1, 2, 3, 4, 5, 6]
deepfakes: [1, 2, 3, 4, 5, 6]
biometrics: [1, 2, 5, 6]
face recognition: [1, 2, 3, 4, 5, 6]
Face-: [4, 5, 6]
AI threat: [1, 6]
facial expression: [1, 2, 5]
FaceSwap: [1, 2, 3, 4, 5]
deepfake detection: [1, 2, 3, 4, 5, 6]
MS1M-AF: [4]
Celeb-DF: [1, 3, 4, 5, 6]
computer vision: [1, 6]
high performance: [2]
Xray model: [5]
patch-based: [3]
behavioral biometrics: [2, 5]
NeuralTextures: [1, 2, 3, 4, 5]
media articles: [2]
swapping: [1, 2, 3, 4, 5, 6]
synthesized media: [1]
FaceShift