### Step 1: Process the data and create the Sentence Transformer model.

In [9]:
import os
import json
import pandas as pd
import numpy as np

### Import data and prepare it for the model

In [None]:
def load_jsons_to_dataframe(folder_path):
    """Load every ``*.json`` file in ``folder_path`` into a single DataFrame.

    Each file is expected to contain one JSON object; the object becomes one
    row, and its keys become columns. A ``unique_id`` key is added to every
    object, numbered from 0 in sorted filename order.

    Parameters
    ----------
    folder_path : str
        Directory to scan. Only entries whose name ends with ``.json`` are
        read; sub-directories are not searched.

    Returns
    -------
    pandas.DataFrame
        One row per JSON file, including the added ``unique_id`` column.
        Empty if the folder contains no ``.json`` files.
    """
    data = []

    # os.listdir returns names in arbitrary order, so sort them to make the
    # unique_id assigned to each file reproducible across runs and machines.
    json_filenames = sorted(
        filename for filename in os.listdir(folder_path)
        if filename.endswith(".json")
    )

    for unique_id, filename in enumerate(json_filenames):
        filepath = os.path.join(folder_path, filename)

        # JSON is UTF-8 by specification; do not rely on the platform's
        # default text encoding when decoding the file.
        with open(filepath, 'r', encoding='utf-8') as file:
            json_content = json.load(file)

        json_content['unique_id'] = unique_id
        data.append(json_content)

    # Convert the list of JSON objects to a Pandas DataFrame
    return pd.DataFrame(data)

# Folder containing the input JSON files. Replace this with the path to your folder.
folder_path = './datasets/output_rabee_gheni/'
# Build the working DataFrame: one row per JSON file found in folder_path.
df = load_jsons_to_dataframe(folder_path)
# Optional sanity check while developing: print(df)


In [14]:
# Save a CSV snapshot of the loaded data. It is written into the same folder
# that was scanned for JSON files; the loader only reads names ending in
# ".json", so this file is not picked up on a later run.
file_name = './datasets/output_rabee_gheni/dataset.csv'
# index=False: do not write the DataFrame's row index as an extra column.
df.to_csv(file_name, index=False)

In [5]:
# Display the column names of the loaded DataFrame.
df.columns

Index(['Title', 'Source', 'Content', 'DateOfScrapping', 'unique_id'], dtype='object')

In [6]:
def _join_row_text(row):
    """Return the row's non-null values, converted to strings and joined by single spaces."""
    present_values = row.dropna()
    return ' '.join(present_values.astype(str))


# Columns whose values are concatenated into one text field per row.
text_features = ['Title', 'Source', 'Content', 'DateOfScrapping', 'unique_id']
# 'metadata' holds the combined text for each row.
df['metadata'] = df[text_features].apply(_join_row_text, axis=1)


### Model

In [7]:
# SentenceTransformers is a Python framework for computing sentence, text, and image embeddings.
# The all-MiniLM-L6-v2 model maps sentences and paragraphs to a 384-dimensional dense vector space.
# More sentence-transformers models are listed at: https://huggingface.co/sentence-transformers

from sentence_transformers import SentenceTransformer

# Load the pretrained model by name; this `model` object is what later encodes the text.
model = SentenceTransformer('all-MiniLM-L6-v2')

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [8]:
# Encode the 'metadata' text of every row with the model loaded above.
# Presumably this yields one embedding vector per input string, in the same order - confirm.
sentence_embeddings = model.encode(df['metadata'].tolist())

In [10]:
# Persist the embeddings. np.save appends ".npy" when the name lacks it,
# so the file on disk is "embeddings_matrix_v1.npy".
np.save("embeddings_matrix_v1", sentence_embeddings)

### Cosine similarities

In [11]:
# Reload the saved embeddings from disk; the similarity computation works from this array.
final_matrix = np.load("embeddings_matrix_v1.npy")

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Number of rows compared against the full matrix per batch. This can be
# modified as needed. Right now it is not necessary, but as the library grows
# it may become useful: each batch builds a (batch rows x all rows) matrix.
chunk = 10000
n_rows = final_matrix.shape[0]
# Ceiling division. The previous `int(n_rows / chunk) + 1` scheduled an extra,
# empty batch whenever n_rows was an exact multiple of `chunk` (or zero), and
# cosine_similarity raises on an empty input.
steps = (n_rows + chunk - 1) // chunk
top_k = 10 # This can be updated as needed.
# A row cannot have more neighbours than there are rows; without this clamp
# the per-row assignment below fails with a shape mismatch on small datasets.
n_neighbors = min(top_k, n_rows)
similarity_top_k = np.empty((n_rows, n_neighbors), dtype="uint32")

for i in range(steps):
    lower_bound = i * chunk
    # The last batch may be shorter than `chunk`.
    upper_bound = min(lower_bound + chunk, n_rows)

    # Similarity of this batch of rows against every row in the matrix.
    # NOTE: each row is also compared with itself, so its own index will
    # normally appear among its top results.
    similarity_matrix = cosine_similarity(final_matrix[lower_bound:upper_bound], final_matrix)

    for j in range(similarity_matrix.shape[0]):
        # argsort is ascending; reverse it to get most-similar first, then
        # keep the leading n_neighbors row indices.
        similarity_top_k[lower_bound + j] = np.argsort(similarity_matrix[j])[::-1][:n_neighbors]

In [13]:
# Persist the per-row indices of the most similar rows. np.save appends ".npy",
# so the file on disk is "embeddings_matrix_v1_similarities_top_k.npy".
np.save("embeddings_matrix_v1_similarities_top_k", similarity_top_k)