In [None]:
!pip install transformers



In [None]:
from transformers import DistilBertModel, DistilBertTokenizer
import torch
import json
import tensorflow as tf
import numpy as np
import os
from google.colab import drive

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK stopword corpus; remove_stopwords reads its English list.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

FUNCTION TO READ QUERIES OF TVSUM & QV HIGHLIGHTS FROM JSONL FILES

In [None]:
def get_query(path):
    """Collect the "query" field of every record in a JSON Lines file.

    Args:
        path: Location of a JSONL file containing one JSON object per line.

    Returns:
        A list with one entry per non-blank line, in file order. A record
        that has no "query" key contributes an empty string.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    queries_list = []

    # Decode explicitly as UTF-8 so non-ASCII query text is read the same way
    # regardless of the platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Blank lines (for example a trailing newline at the end of the
            # file) are not records; json.loads would raise on them.
            if not line.strip():
                continue
            json_data = json.loads(line)
            queries_list.append(json_data.get("query", ""))

    return queries_list

In [None]:
# Locations of the two JSONL query files inside the Colab sample_data folder
qv_file_path = '/content/sample_data/highlight_train_release.jsonl'
tvsum_file_path = '/content/sample_data/tvsum_train.jsonl'

# Each call returns the list of "query" strings found in the given file
qv_queries = get_query(qv_file_path)
tvsum_query = get_query(tvsum_file_path)

QUERIES BEFORE PREPROCESSING

In [None]:
print("QV HIGHLIGHTS DATASET QUERIES\n")
# Slice instead of indexing 0..4 so a list with fewer than five queries
# prints what it has rather than raising IndexError.
for query in qv_queries[:5]:
  print(query)

QV HIGHLIGHTS DATASET QUERIES

some military patriots takes us through their safety procedures and measures.
Man in baseball cap eats before doing his interview.
A man in a white shirt discusses the right to have and carry firearms.
A view of a bamboo fountain of water in a tea house and people scoop from and wash off
The weather map shows large snowfall in the weather patterns.


In [None]:
print("TVSUM DATASET QUERIES\n")
# Slice instead of indexing 0..4 so a list with fewer than five queries
# prints what it has rather than raising IndexError.
for query in tvsum_query[:5]:
  print(query)

TVSUM DATASET QUERIES

Electric cars making earth more green
The stuck truck of Mark, The rut that filled an afternoon.
#453 girl gets van stuck in the back fourty [Davidsfarm]
Smart Electric Vehicle Balances on Two Wheels
ŠKODA Tips How to Repair Your Tyre


PREPROCESSING FUNCTIONS

In [None]:
# Function to trim a sentence and squeeze repeated spaces
def remove_extra_spaces(sentence):
    """Return `sentence` without surrounding whitespace or doubled spaces.

    Whitespace of any kind is stripped from both ends. Inside the text only
    runs of the space character are reduced to a single space; tabs and
    newlines in the middle are left untouched.
    """
    trimmed = sentence.strip()
    # Splitting on a single space yields empty pieces wherever spaces repeat;
    # dropping those pieces and re-joining leaves exactly one space per gap.
    return ' '.join(piece for piece in trimmed.split(' ') if piece)

# Function to drop punctuation and symbols from a string
def remove_non_alphanumeric(sentence):
    """Delete every character that is neither a word character nor whitespace.

    "Word character" follows the regex word class, so letters (including
    non-ASCII ones), digits and the underscore are all kept.
    """
    unwanted = re.compile(r'[^\w\s]')
    return unwanted.sub('', sentence)

# Function to remove stopwords
_STOPWORD_PATTERN = None  # compiled on first use, then shared by every call


def remove_stopwords(text):
    """Blank out English stopwords in `text`, matching case-insensitively.

    Matches are replaced by the empty string, so the spaces that surrounded a
    removed word stay behind; callers that want tidy output should collapse
    spaces afterwards.

    Args:
        text: The string to filter.

    Returns:
        `text` with every whole-word stopword occurrence removed.
    """
    global _STOPWORD_PATTERN
    if _STOPWORD_PATTERN is None:
        # Build the pattern once: re-reading the corpus and recompiling the
        # alternation for every query is wasted work when cleaning thousands
        # of strings.
        # Longest entries go first so a contraction such as "don't" is
        # consumed whole instead of matching "don" and leaving "'" behind.
        words = sorted(stopwords.words('english'), key=len, reverse=True)
        # re.escape keeps any regex metacharacter in an entry literal.
        alternation = '|'.join(re.escape(word) for word in words)
        _STOPWORD_PATTERN = re.compile(r'\b(?:' + alternation + r')\b', flags=re.IGNORECASE)
    return _STOPWORD_PATTERN.sub('', text)

In [None]:
# QV Highlights: lower-case and strip punctuation first, then drop stopwords
# and tidy the spacing that the removals leave behind
cleaned_list = list(map(lambda q: remove_non_alphanumeric(q.lower()), qv_queries))
qv_cleaned_query = list(map(lambda q: remove_extra_spaces(remove_stopwords(q)), cleaned_list))

# TVSum: same two-stage clean-up
cleaned_listt = list(map(lambda q: remove_non_alphanumeric(q.lower()), tvsum_query))
tvsum_cleaned_query = list(map(lambda q: remove_extra_spaces(remove_stopwords(q)), cleaned_listt))

QUERIES AFTER PREPROCESSING

In [None]:
print("QV HIGHLIGHTS DATASET QUERIES\n")
# Slice instead of indexing 0..4 so a list with fewer than five queries
# prints what it has rather than raising IndexError.
for query in qv_cleaned_query[:5]:
  print(query)

QV HIGHLIGHTS DATASET QUERIES

military patriots takes us safety procedures measures
man baseball cap eats interview
man white shirt discusses right carry firearms
view bamboo fountain water tea house people scoop wash
weather map shows large snowfall weather patterns


In [None]:
print("TVSUM DATASET QUERIES\n")
# Slice instead of indexing 0..4 so a list with fewer than five queries
# prints what it has rather than raising IndexError.
for query in tvsum_cleaned_query[:5]:
  print(query)

TVSUM DATASET QUERIES

electric cars making earth green
stuck truck mark rut filled afternoon
453 girl gets van stuck back fourty davidsfarm
smart electric vehicle balances two wheels
škoda tips repair tyre


LOADING PRETRAINED DISTILBERT MODEL

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Fetch the pretrained DistilBERT tokenizer and weights, then place the model
# on the device chosen above
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

FUNCTION TO ENCODE QUERIES

In [None]:
# Function to encode a list of queries into embeddings
def encode_queries(queries, max_length=64, batch_size=None):
    """Embed one query or a list of queries with the notebook's DistilBERT model.

    Relies on the module-level `tokenizer`, `model` and `device`.

    Args:
        queries: A string or a list of strings to embed.
        max_length: Maximum number of tokens kept per query; longer inputs
            are truncated.
        batch_size: Optional number of queries sent through the model per
            forward pass. None (the default) runs every query in a single
            pass, as this function always did. A positive value limits peak
            memory for large query lists; the returned tensor has the same
            shape either way because all queries are tokenized and padded
            together before being split.

    Returns:
        The model's `last_hidden_state` for all queries, on `device`. Every
        token position is returned, including padding positions.

    Raises:
        ValueError: If `batch_size` is given but is less than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    # Tokenizing queries (padded to the longest query in the whole input)
    inputs = tokenizer(queries, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Forward pass through the model to get embeddings; no_grad because the
    # model is only used for inference here
    with torch.no_grad():
        if batch_size is None:
            return model(**inputs).last_hidden_state

        total = next(iter(inputs.values())).shape[0]
        hidden_chunks = []
        for start in range(0, total, batch_size):
            # Slice every input tensor along the batch axis in lock-step
            batch = {key: value[start:start + batch_size] for key, value in inputs.items()}
            hidden_chunks.append(model(**batch).last_hidden_state)

    # Stitch the per-batch results back together along the batch axis
    return torch.cat(hidden_chunks, dim=0)

In [None]:
# Encoding the cleaned queries of each dataset into embeddings.
# Each call tokenizes and embeds its whole list in one go.
query_embeddings_qv = encode_queries(qv_cleaned_query)
query_embeddings_tvsum = encode_queries(tvsum_cleaned_query)

SHAPE OF EMBEDDINGS

In [None]:
# Dimensions of the QV Highlights embedding tensor
# (presumably queries x padded tokens x hidden size - confirm against the DistilBERT docs)
query_embeddings_qv.shape

torch.Size([7218, 24, 768])

In [None]:
# Dimensions of the TVSum embedding tensor
# (presumably queries x padded tokens x hidden size - confirm against the DistilBERT docs)
query_embeddings_tvsum.shape

torch.Size([40, 16, 768])

PRINTING EMBEDDINGS

In [None]:
# Show the QV Highlights embedding tensor (the notebook prints an abbreviated repr)
query_embeddings_qv

tensor([[[-7.6184e-02,  1.1112e-02, -1.3600e-01,  ..., -1.0188e-01,
           1.5395e-01,  2.7936e-01],
         [ 2.4744e-01,  1.2378e-01, -6.3289e-02,  ...,  5.2794e-02,
           2.3613e-01,  8.0537e-02],
         [ 3.2228e-01,  4.6505e-02, -7.1740e-02,  ..., -9.7964e-02,
          -3.0228e-01,  8.0934e-02],
         ...,
         [ 1.2490e-01,  1.9378e-02, -1.8220e-01,  ..., -1.1144e-02,
          -2.3461e-01,  8.2949e-02],
         [ 1.1831e-01, -6.2375e-03, -1.8143e-01,  ..., -5.8838e-03,
          -2.1834e-01,  8.8967e-02],
         [ 1.3496e-01, -3.1117e-03, -1.5251e-01,  ..., -1.2895e-02,
          -2.0935e-01,  1.1647e-01]],

        [[-1.5997e-01, -7.4076e-02, -1.1242e-01,  ..., -1.0303e-01,
           2.2145e-01,  2.4245e-01],
         [-1.5249e-01,  2.0897e-02,  1.2867e-02,  ...,  3.9140e-02,
           4.1866e-01,  4.7134e-02],
         [-1.6907e-01,  1.0239e-01, -1.5480e-01,  ...,  3.9209e-01,
           1.5580e-01, -9.4785e-02],
         ...,
         [ 2.4056e-01, -1

In [None]:
# Show the TVSum embedding tensor (the notebook prints an abbreviated repr)
query_embeddings_tvsum

tensor([[[-1.1463e-01, -2.4503e-01, -7.3583e-02,  ..., -2.2703e-01,
           3.9777e-01,  9.6576e-02],
         [ 2.1972e-01, -8.2425e-04, -1.9157e-01,  ..., -1.8836e-01,
           7.1403e-01, -3.6244e-02],
         [ 6.8297e-01,  1.1305e-01,  1.7122e-01,  ..., -6.3623e-01,
          -1.0147e-01, -2.9381e-01],
         ...,
         [ 1.8203e-01, -1.5645e-01,  2.1031e-01,  ..., -2.1015e-02,
           3.1746e-02, -5.2149e-02],
         [ 1.5215e-01,  7.6693e-02,  3.1767e-01,  ..., -2.4171e-01,
           1.1689e-01,  1.3984e-01],
         [ 2.3081e-01, -1.4215e-01,  2.4024e-01,  ...,  2.0259e-02,
           2.0647e-02, -9.9532e-02]],

        [[-2.0362e-01, -1.0830e-01,  4.6079e-02,  ..., -2.1183e-01,
           3.1933e-01,  2.1295e-01],
         [ 2.5503e-01,  2.2251e-01,  3.6445e-01,  ..., -1.1240e-01,
           3.8897e-01, -4.2260e-01],
         [ 2.5391e-01,  7.6379e-02,  1.7812e-01,  ..., -1.5169e-01,
           1.2607e-01, -3.0994e-01],
         ...,
         [ 7.9275e-02,  6

TESTING FOR USER QUERY

In [None]:
# User Input
# Ask for a free-text query on standard input

user_query = input("Enter your query: ")

Enter your query: Dog is running Behind the man..!!


In [None]:
# Same clean-up order as for the datasets: lower-case, strip punctuation,
# drop stopwords, then collapse the leftover spaces
cleaned_user_query = remove_extra_spaces(remove_stopwords(remove_non_alphanumeric(user_query.lower())))

In [None]:
# Show the cleaned query string
cleaned_user_query

'dog running behind man'

In [None]:
# A single string (not a list) is passed here.
# NOTE(review): the tokenizer appears to treat it as a batch of one - confirm.
embedding = encode_queries(cleaned_user_query)

In [None]:
# Keep the tensor on the device selected when the model was loaded.
# A hard-coded 'cuda:0' raises on a CPU-only runtime even though the notebook
# sets `device` up with a CPU fallback.
embedding = embedding.to(device)
embedding

tensor([[[-0.1737, -0.0408, -0.1624,  ..., -0.1518,  0.2350,  0.1653],
         [ 0.3092,  0.1917, -0.0779,  ..., -0.3535,  0.2289,  0.2688],
         [ 0.1654, -0.3956,  0.0141,  ..., -0.1729, -0.0261,  0.1421],
         [ 0.2886,  0.1729,  0.2959,  ..., -0.2012, -0.0699,  0.1844],
         [-0.4037, -0.1508, -0.2396,  ..., -0.1181, -0.0645, -0.1453],
         [ 0.9185,  0.2805, -0.4009,  ...,  0.0036, -0.4495, -0.2942]]],
       device='cuda:0')

CODE FOR UI

In [None]:
# Mount Google Drive under /content/drive so the FYP folders can be read and written
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path to the directory
directory_path = "/content/drive/My Drive/FYP/input/"

# Listing the regular files in the directory. Sub-folders are skipped: they
# can carry the newest modification time but cannot be opened as a text file.
files = [
    name for name in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, name))
]

# Fail with a clear message instead of max()'s "empty sequence" error
if not files:
    raise FileNotFoundError("No input files found in " + directory_path)

# The most recently modified file is taken as the current query
latest_file = max(files, key=lambda x: os.path.getmtime(os.path.join(directory_path, x)))

# Constructing the full path to the latest file
file_path = os.path.join(directory_path, latest_file)

# Reading
with open(file_path, "r") as file:
    content = file.read()

print(content)

asdf


In [None]:
# User Input
# NOTE(review): the later cells shown here clean and encode `content` (the
# text read from Drive), not `user_query`, so this value looks unused - confirm.

user_query = input("Enter your query")

Enter your queryhi


In [None]:
# Clean the text read from the input file: lower-case, strip punctuation,
# drop stopwords, then collapse the leftover spaces
cleaned_query = remove_extra_spaces(remove_stopwords(remove_non_alphanumeric(content.lower())))

In [None]:
# Embed the cleaned file text (a single string, not a list)
ui_embedding = encode_queries(cleaned_query)

In [None]:
# Keep only the first four values of the first token vector of the first sequence.
# NOTE(review): this discards the rest of the embedding - confirm that a
# 4-value sample is what the consumer of the output file expects.
chunk = ui_embedding[0][0][0:4]
chunk

tensor([-0.2218, -0.1646, -0.1281, -0.1908], device='cuda:0')

In [None]:
# Bring the values back to host memory as a NumPy array
numpy_embedding = chunk.cpu().numpy()

# Reshape to 2-D, keeping the last axis as the row width, so that np.savetxt
# writes each vector on one row instead of one value per line
flattened_embedding = numpy_embedding.reshape(-1, numpy_embedding.shape[-1])

# define output directory
output_directory = '/content/drive/MyDrive/FYP/output/'
# np.savetxt does not create missing folders, so make sure the target exists
os.makedirs(output_directory, exist_ok=True)

# Get counter from CounterFile in FYP folder
counter_file = '/content/drive/MyDrive/FYP/counterfile.txt'
with open(counter_file, 'r') as file:
    try:
        counter = int(file.read().strip())
        print("Successfully read integer from file:", counter)

    except ValueError:
        print("Error: The file does not contain a valid integer.")
        # Stop here: continuing would leave `counter` undefined and fail
        # further down with an unrelated NameError.
        raise

# Specify the file path for the current embedding
title = "embedding_" + str(counter) + ".txt"
file_path = os.path.join(output_directory, title)

# Save the embedding to the file
np.savetxt(file_path, flattened_embedding)

# Update counter++ from CounterFile in FYP folder.
# The new value is computed before the file is opened for writing, so the
# file is only truncated once the value to store is known.
counter = counter + 1
with open(counter_file, 'w') as file:
    file.write(str(counter))

Successfully read integer from file: 15
