<a href="https://colab.research.google.com/github/Stepsalong/starter-hugo-portfolio-theme/blob/main/research/Factcheck_DB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fact-checking database for dokopalsya.bot

The goal of this notebook is to set up a fact-checking database ready to be used at the hackathon for building a fact-checking bot.

Plan -
1. Build a sustainable data retrieval pipeline from the google fact check api
2. Set-up the ETL in place to ensure high quality data flowing into pinecone
3. Choose the best embedding model
4. Using this model load the data into pinecone

## Set-up - Packages, Libraries & Functions

In [1]:
!pip install datasets



In [2]:
!pip install pinecone



In [3]:
import getpass
import json
import os
import time
import uuid
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from google.colab import drive
from google.colab import files
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [4]:
# Mount Google Drive at /content/drive; the claims CSV is read from under this mount point in a later cell
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
## Function to convert our scraped JSON files to pandas DataFrames

In [6]:
def json_to_dataframe(json_data):
    """Flatten a claims-search JSON payload into one DataFrame row per review.

    Every entry of ``json_data["claims"]`` contributes one row for each item of
    its ``claimReview`` list; the claim-level fields (text, claimant, claimDate)
    are repeated on each of those rows. A claim without reviews yields no rows,
    and any key that is absent is stored as ``None``.

    Args:
        json_data (dict): Parsed JSON response containing an optional "claims" list.

    Returns:
        pandas.DataFrame: One row per (claim, review) pair.
    """
    rows = []
    for claim in json_data.get("claims", []):
        for review in claim.get("claimReview", []):
            publisher = review.get("publisher", {})
            rows.append({
                # Claim-level fields, duplicated for each review of the claim
                "text": claim.get("text"),
                "claimant": claim.get("claimant"),
                "claimDate": claim.get("claimDate"),
                # Review-level fields
                "publisher_name": publisher.get("name"),
                "publisher_site": publisher.get("site"),
                "url": review.get("url"),
                "title": review.get("title"),
                "reviewDate": review.get("reviewDate"),
                "textualRating": review.get("textualRating"),
                "languageCode": review.get("languageCode"),
            })

    return pd.DataFrame(rows)

In [7]:
# Define a function to check if a string is likely text
def is_likely_text(text):
    """Return True when ``text`` looks like free text rather than a number or a date.

    A value is rejected (False) when it is not a ``str``, is empty or
    whitespace-only, consists solely of digits, or is a plain ``YYYY-MM-DD``
    date. Surrounding whitespace is ignored for all of these checks, so
    values such as ``"   "`` or ``" 123 "`` are not mistaken for text.

    Args:
        text: Any value taken from a DataFrame cell.

    Returns:
        bool: True if the value passes every check above.
    """
    if not isinstance(text, str):
        return False

    stripped = text.strip()
    if not stripped:  # empty or whitespace-only
        return False
    if stripped.isdigit():  # purely numeric content
        return False

    try:
        datetime.strptime(stripped, '%Y-%m-%d')  # a bare date is not text
    except ValueError:
        # Not a bare date: nothing else disqualifies the value.
        # Add more checks for other non-text patterns here as needed.
        return True
    return False

In [8]:
# Google Fact Check Tools "claims:search" endpoint
url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

# API key: never commit credentials to the notebook. The key is read from the
# GOOGLE_FACTCHECK_API_KEY environment variable, with an interactive prompt as
# fallback. NOTE: the key that used to be hardcoded here is exposed in the
# notebook history and should be revoked and replaced.
api_key = os.environ.get("GOOGLE_FACTCHECK_API_KEY") or getpass.getpass("Google Fact Check API key: ")

In [9]:
# Pinecone client. The API key comes from the PINECONE_API_KEY environment
# variable (interactive prompt as fallback) instead of being hardcoded.
# NOTE: the key previously committed here is exposed and should be rotated.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: "))

## Dataset Retrieval

### Dataset Retrieval API - Disabled!


In [10]:
# Below is the result of our analysis: the list of the biggest fact-checking sources verified by the IFCN (International Fact-Checking Network)

In [11]:
# Publisher sites queried one by one through the API's reviewPublisherSiteFilter
list_of_publishers = [
    'fullfact.org',
    'snopes.com',
    'newschecker.in',
    'factcheck.afp.com',
    'factly.in',
    'politifact.com',
    'factcheck.org',
    'thequint.com',
    'vishvasnews.com',
    'provereno.media',
    'indiatoday.in',
    'newsmeter.in',
    'science.feedback.org',
    'usatoday.com',
    'en.youturn.in',
    'apnews.com',
    'stopfake.org',
    'leadstories.com',
    'checkyourfact.com',
    'voxukraine.org',
    'rappler.com',
    'srilanka.factcrescendo.com',
    'africacheck.org',
    'logicallyfacts.com',
    'aap.com.au',
    'verafiles.org',
    'boomlive.in',
]

In [12]:
# df_claims = pd.DataFrame()  # Creates a new empty DataFrame

# for publisher in list_of_publishers:
#   for language in ['uk', 'ru', 'en']:
#     # Query parameters
#     params = {
#         "maxAgeDays": 1095,
#         "pageSize": 10000,  # Set to max size for fewer paginated requests
#         "key": api_key,
#         "reviewPublisherSiteFilter": publisher,
#         "languageCode": language,
#     }

#     # Make initial request
#     response = requests.get(url, params=params)
#     response_data = response.json()
#     df = json_to_dataframe(response_data)
#     df['publisher_from_list'] = publisher
#     if len(df) > 0:  # Check if the DataFrame is not empty
#         df_claims = pd.concat([df_claims, df], ignore_index=True)

### Dataset Retrieval CSV

In [87]:
# Read the augmented claims export (CSV) from the mounted Google Drive
df_claims = pd.read_csv(
    '/content/drive/MyDrive/Oplot/Connecthack/claims_en_ru_ua_last_3_years_augmented250125.csv'
)

# Plain-text preview of the first rows
print(df_claims.head())

   Unnamed: 0                                               text  \
0           0  Footage shows a tsunami after a recent earthqu...   
1           1  A video shows Hamas fighters praying after the...   
2           2  “Record numbers” of people have been returned ...   
3           3  A video shows the perspective of a firefighter...   
4           4  Bovaer reduces the fertility of cattle and poi...   

        claimant             claimDate publisher_name publisher_site  \
0         Pubilc  2026-01-14T00:00:00Z      Full Fact   fullfact.org   
1  Facebook user  2025-01-15T00:00:00Z      Full Fact   fullfact.org   
2   Keir Starmer  2025-01-09T00:00:00Z      Full Fact   fullfact.org   
3         X User  2025-01-10T00:00:00Z      Full Fact   fullfact.org   
4  facebook user  2025-01-10T00:00:00Z      Full Fact   fullfact.org   

                                                 url  \
0  https://fullfact.org/online/miscaptioned-foota...   
1  https://fullfact.org/online/video-hamas-fig

In [26]:
# # link is publicly available
# url = "https://drive.google.com/uc?id=12ViKWF_1ammeN0nUW7mid6d1QBR6qXDc"
# try:
#     df_claims = pd.read_csv(url)
#     print("CSV file imported successfully!")
#     print(df_claims.head())
# except Exception as e:
#     print(f"An unexpected error occurred: {e}")

CSV file imported successfully!
                            primary_key  \
0  dc6e9db4-4bd7-4f12-abba-da0c1a34868f   
1  69c0271b-12a5-45ae-880c-7a943f83c41e   
2  c0c1e0bf-b7d8-4ee5-bac7-90bca4e80ad1   
3  ba1d8107-09d2-42da-8be7-5cae63d3c674   
4  3219b40b-c3a5-4df2-b3a5-160a83b665fb   

                                                text       claimant  \
0  A video shows Hamas fighters praying after the...  Facebook user   
1  Footage shows a tsunami after a recent earthqu...         Pubilc   
2  The manufacturer of Bovaer has also invested i...  facebook user   
3  The government says 16,400 failed asylum seeke...  Daily Express   
4  “Record numbers” of people have been returned ...   Keir Starmer   

              claimDate publisher_name publisher_site  \
0  2025-01-15T00:00:00Z      Full Fact   fullfact.org   
1  2026-01-14T00:00:00Z      Full Fact   fullfact.org   
2  2025-01-10T00:00:00Z      Full Fact   fullfact.org   
3  2025-01-09T00:00:00Z      Full Fact   fullfact.org 

In [88]:
df_claims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52640 entries, 0 to 52639
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Unnamed: 0                52640 non-null  int64 
 1   text                      52640 non-null  object
 2   claimant                  52640 non-null  object
 3   claimDate                 47219 non-null  object
 4   publisher_name            52640 non-null  object
 5   publisher_site            52640 non-null  object
 6   url                       52640 non-null  object
 7   title                     52640 non-null  object
 8   reviewDate                46033 non-null  object
 9   textualRating             52640 non-null  object
 10  languageCode              52640 non-null  object
 11  publisher_from_list       52640 non-null  object
 12  explanation               45615 non-null  object
 13  simplified_textualRating  47754 non-null  object
 14  text_en               

## Dataset Quality Control

### Prefiltering

In [89]:
## Let's make sure that only major publishers in the required languages are present

In [90]:
# Keep only the claims whose languageCode is English, Russian or Ukrainian
df_claims = df_claims.loc[df_claims['languageCode'].isin(['en', 'ru', 'uk'])]

In [91]:
# Count how many claims each publisher site contributes
publisher_counts = df_claims['publisher_site'].value_counts()

# Publishers with at least 450 claims are retained; smaller ones are dropped
publishers_to_keep = publisher_counts.loc[publisher_counts.ge(450)].index

# Restrict df_claims to the rows of the retained publishers
df_claims = df_claims.loc[df_claims['publisher_site'].isin(publishers_to_keep)]

# Number of rows left after the publisher filter
len(df_claims)

52640

### Identification and removal of data of incorrect type

In [92]:
df_claims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52640 entries, 0 to 52639
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Unnamed: 0                52640 non-null  int64 
 1   text                      52640 non-null  object
 2   claimant                  52640 non-null  object
 3   claimDate                 47219 non-null  object
 4   publisher_name            52640 non-null  object
 5   publisher_site            52640 non-null  object
 6   url                       52640 non-null  object
 7   title                     52640 non-null  object
 8   reviewDate                46033 non-null  object
 9   textualRating             52640 non-null  object
 10  languageCode              52640 non-null  object
 11  publisher_from_list       52640 non-null  object
 12  explanation               45615 non-null  object
 13  simplified_textualRating  47754 non-null  object
 14  text_en               

In [93]:
# Columns that must hold free text: rows failing is_likely_text on any of them are dropped in a following cell
str_columns_must_have = ['text', 'publisher_site', 'title', 'textualRating', 'languageCode']

In [94]:
for column in str_columns_must_have:
    # Report only: nothing is removed from df_claims in this cell
    print(f"Rows in '{column}' that are not likely text:")
    failing_rows = df_claims[~df_claims[column].apply(is_likely_text)]
    print(failing_rows)
    print("-" * 50)  # visual separator between columns

Rows in 'text' that are not likely text:
Empty DataFrame
Columns: [Unnamed: 0, text, claimant, claimDate, publisher_name, publisher_site, url, title, reviewDate, textualRating, languageCode, publisher_from_list, explanation, simplified_textualRating, text_en]
Index: []
--------------------------------------------------
Rows in 'publisher_site' that are not likely text:
Empty DataFrame
Columns: [Unnamed: 0, text, claimant, claimDate, publisher_name, publisher_site, url, title, reviewDate, textualRating, languageCode, publisher_from_list, explanation, simplified_textualRating, text_en]
Index: []
--------------------------------------------------
Rows in 'title' that are not likely text:
Empty DataFrame
Columns: [Unnamed: 0, text, claimant, claimDate, publisher_name, publisher_site, url, title, reviewDate, textualRating, languageCode, publisher_from_list, explanation, simplified_textualRating, text_en]
Index: []
--------------------------------------------------
Rows in 'textualRating' th

In [95]:
# Build one combined mask: a row survives only if every mandatory column passes the text check
keep_mask = pd.Series(True, index=df_claims.index)
for column in str_columns_must_have:
    keep_mask &= df_claims[column].apply(is_likely_text)
df_claims = df_claims[keep_mask]
len(df_claims)

52640

In [96]:
# Optional text columns: a missing value is tolerated, but a present value must pass is_likely_text
str_columns_good_to_have = ['claimant', 'claimDate', 'publisher_name']

In [97]:
for column in str_columns_good_to_have:
    # Report only: show values that are present but do not look like text
    print(f"Rows in '{column}' that are not likely text:")
    fails_check = ~df_claims[column].apply(is_likely_text)
    is_present = df_claims[column].notna()
    print(df_claims[fails_check & is_present])
    print("-" * 50)  # visual separator between columns

Rows in 'claimant' that are not likely text:
Empty DataFrame
Columns: [Unnamed: 0, text, claimant, claimDate, publisher_name, publisher_site, url, title, reviewDate, textualRating, languageCode, publisher_from_list, explanation, simplified_textualRating, text_en]
Index: []
--------------------------------------------------
Rows in 'claimDate' that are not likely text:
Empty DataFrame
Columns: [Unnamed: 0, text, claimant, claimDate, publisher_name, publisher_site, url, title, reviewDate, textualRating, languageCode, publisher_from_list, explanation, simplified_textualRating, text_en]
Index: []
--------------------------------------------------
Rows in 'publisher_name' that are not likely text:
Empty DataFrame
Columns: [Unnamed: 0, text, claimant, claimDate, publisher_name, publisher_site, url, title, reviewDate, textualRating, languageCode, publisher_from_list, explanation, simplified_textualRating, text_en]
Index: []
--------------------------------------------------


In [98]:
# A row survives if, for every optional column, the value is either missing or passes the text check
keep_mask = pd.Series(True, index=df_claims.index)
for column in str_columns_good_to_have:
    keep_mask &= df_claims[column].apply(is_likely_text) | df_claims[column].isna()
df_claims = df_claims[keep_mask]
len(df_claims)

52640

In [99]:
# Columns expected to parse as dates; validated with pd.to_datetime in the following cells
date_columns = ['claimDate', 'reviewDate']

In [100]:
# Report values in the date columns that are present but cannot be parsed as dates
for col in date_columns:
    unparseable = pd.to_datetime(df_claims[col], errors='coerce').isna()
    is_present = df_claims[col].notna()
    invalid_dates = df_claims[unparseable & is_present]
    print(f"Rows with invalid or non-date values in '{col}':")
    print(invalid_dates[[col]])  # only the offending column is shown
    print("-" * 50)

Rows with invalid or non-date values in 'claimDate':
Empty DataFrame
Columns: [claimDate]
Index: []
--------------------------------------------------
Rows with invalid or non-date values in 'reviewDate':
Empty DataFrame
Columns: [reviewDate]
Index: []
--------------------------------------------------


In [101]:
# Drop rows whose date value is present but unparseable; rows with a missing date are kept
for col in date_columns:
    parsed = pd.to_datetime(df_claims[col], errors='coerce')
    originally_missing = df_claims[col].isna()
    df_claims = df_claims[parsed.notna() | originally_missing]

### Duplicate Removal

In [102]:
# Occurrences of each distinct claim text
duplicate_counts = df_claims['text'].value_counts()
# Texts that appear on more than one row
duplicate_texts = duplicate_counts.loc[duplicate_counts.gt(1)]
len(duplicate_texts)

285

In [103]:
# Number of distinct publisher sites per claim text
duplicate_text_counts = df_claims.groupby('text')['publisher_site'].nunique()
# NOTE(review): despite the name, nunique() > 1 selects texts reviewed by MORE THAN ONE
# distinct publisher_site, not duplicates within the same publisher.
duplicate_texts_with_same_publisher = duplicate_text_counts[duplicate_text_counts > 1]

# Sort the DataFrame by 'text' to group duplicates together
df_claims_sorted = df_claims.sort_values('text')

# Keep only the rows whose text was selected above (text shared across several publisher sites)
duplicated_df = df_claims_sorted[df_claims_sorted['text'].isin(duplicate_texts_with_same_publisher.index)]

# Show how many rows are involved (the rows themselves are not printed)
len(duplicated_df)

112

In [104]:
# Parse 'reviewDate' and drop duplicate (text, publisher_site) pairs, keeping the
# row with the latest review date.
# - .assign() builds a new frame instead of writing a column into a frame that was
#   produced by earlier boolean filtering (avoids SettingWithCopyWarning).
# - kind="stable" makes the winner deterministic when several duplicates share the
#   same reviewDate (the default quicksort does not preserve the original order of ties).
# Rows with a missing reviewDate sort last, so they are kept only if no dated duplicate exists.
df_claims = (
    df_claims
    .assign(reviewDate=pd.to_datetime(df_claims['reviewDate']))
    .sort_values(by="reviewDate", ascending=False, kind="stable")
    .drop_duplicates(subset=["text", "publisher_site"], keep="first")
)
len(df_claims)

52362

### Dataset Overview

In [105]:
df_claims.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52362 entries, 34539 to 50466
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   Unnamed: 0                52362 non-null  int64              
 1   text                      52362 non-null  object             
 2   claimant                  52362 non-null  object             
 3   claimDate                 47115 non-null  object             
 4   publisher_name            52362 non-null  object             
 5   publisher_site            52362 non-null  object             
 6   url                       52362 non-null  object             
 7   title                     52362 non-null  object             
 8   reviewDate                45777 non-null  datetime64[ns, UTC]
 9   textualRating             52362 non-null  object             
 10  languageCode              52362 non-null  object             
 11  publisher_from_l

In [106]:
len(df_claims['publisher_site'].unique())

27

In [107]:
len(df_claims)

52362

In [108]:
len(df_claims['textualRating'].unique())

8549

In [109]:
df_claims['textualRating'].value_counts()

Unnamed: 0_level_0,count
textualRating,Unnamed: 1_level_1
False,22919
Misleading,2711
FALSE,2533
Fake,1200
True,1064
...,...
This is not true. Transcripts of the speech do not show any mention of Mr Schwab.,1
This is false. The man in the picture has a different name and died shortly before the post appeared.,1
"This is incorrect. The man who died has been named as Leonard Farruku, an Albanian national.",1
False. The claim is one of many fake news items shared among a suite of supposedly genuine news pages.,1


In [110]:
df_claims['languageCode'].value_counts()

Unnamed: 0_level_0,count
languageCode,Unnamed: 1_level_1
en,49116
ru,2442
uk,804


In [111]:
df_claims.describe()

Unnamed: 0.1,Unnamed: 0
count,52362.0
mean,26393.620163
std,15175.05759
min,0.0
25%,13280.25
50%,26416.5
75%,39536.75
max,52639.0


In [112]:
df_claims['publisher_site'].value_counts()

Unnamed: 0_level_0,count
publisher_site,Unnamed: 1_level_1
factcheck.afp.com,5384
snopes.com,4416
leadstories.com,3494
politifact.com,3425
usatoday.com,3243
factly.in,2916
thequint.com,2790
fullfact.org,2430
newsmeter.in,2162
boomlive.in,2157


In [113]:
df_claims[df_claims['languageCode'] == 'uk']['text']

Unnamed: 0,text
43115,"ФЕЙК: Українська поліція допитує вірян, які св..."
43114,НЕПРАВДА: Україну «накриває» новий метапневмов...
43113,ФЕЙК: В Україні з січня вводять погодинні граф...
43120,МАНІПУЛЯЦІЯ: США таємно озброювали Україну з в...
43119,НЕПРАВДА: Експорт електроенергії в Молдову спр...
...,...
37892,Документи про створення тероборони у Волновасі...
37884,Відеозапис візиту Зеленського до поранених вій...
37783,Росія не використовує курсантів та солдатів ст...
37888,Українські націоналісти взяли у полон 6000 іно...


### Create Index Column

### Export To CSV - Disabled!

In [15]:
# df_claims.to_csv('claims_en_ru_ua_last_3_years_filtered1234.csv', index=True)
# files.download('claims_en_ru_ua_last_3_years_filtered1234.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Conversion To Strings And Tackling NAs

In [114]:
# Flag the rows that have a missing value in at least one column
row_has_na = df_claims.isna().any(axis=1)

# Rows with at least one missing value
df_claims_with_na = df_claims[row_has_na]

# Fully populated rows
df_claims_without_na = df_claims[~row_has_na]

In [115]:
df_claims_without_na.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32790 entries, 31264 to 30919
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   Unnamed: 0                32790 non-null  int64              
 1   text                      32790 non-null  object             
 2   claimant                  32790 non-null  object             
 3   claimDate                 32790 non-null  object             
 4   publisher_name            32790 non-null  object             
 5   publisher_site            32790 non-null  object             
 6   url                       32790 non-null  object             
 7   title                     32790 non-null  object             
 8   reviewDate                32790 non-null  datetime64[ns, UTC]
 9   textualRating             32790 non-null  object             
 10  languageCode              32790 non-null  object             
 11  publisher_from_l

In [117]:
# Continue with the fully populated rows only. Take an explicit copy so that the
# column assignments in later cells neither mutate df_claims_without_na nor
# trigger SettingWithCopyWarning.
df_claims = df_claims_without_na.copy()

In [118]:
def _to_epoch_seconds(column):
    """Convert a date-like Series to whole seconds since 1970-01-01 UTC.

    An integer column is returned unchanged so that re-running the cell does not
    re-interpret already converted values. The computation uses Timedelta
    arithmetic, so it does not depend on the datetime resolution pandas picks.
    """
    if pd.api.types.is_integer_dtype(column):
        return column
    timestamps = pd.to_datetime(column, utc=True)
    epoch = pd.Timestamp("1970-01-01", tz="UTC")
    return (timestamps - epoch) // pd.Timedelta(seconds=1)


# Replace the date columns by their Unix timestamps (seconds). .assign() returns a
# new frame, so the frame df_claims was derived from is not modified.
df_claims = df_claims.assign(
    **{col: _to_epoch_seconds(df_claims[col]) for col in date_columns}
)

In [120]:
# Give every row a random UUID (version 4, via uuid.uuid4) and use it as the index;
# the index value is later used as the Pinecone vector id.
# .assign(...).set_index(...) builds a new frame rather than mutating in place.
df_claims = df_claims.assign(
    primary_key=[str(uuid.uuid4()) for _ in range(len(df_claims))]
).set_index('primary_key')

In [121]:
df_claims.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32790 entries, 1572578d-1e37-4e9d-a98a-dd8c2b655449 to 067dbe0d-6ca2-43f3-b969-39f224a90120
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Unnamed: 0                32790 non-null  int64 
 1   text                      32790 non-null  object
 2   claimant                  32790 non-null  object
 3   claimDate                 32790 non-null  int64 
 4   publisher_name            32790 non-null  object
 5   publisher_site            32790 non-null  object
 6   url                       32790 non-null  object
 7   title                     32790 non-null  object
 8   reviewDate                32790 non-null  int64 
 9   textualRating             32790 non-null  object
 10  languageCode              32790 non-null  object
 11  publisher_from_list       32790 non-null  object
 12  explanation               32790 non-null  object
 13  simplified_text

## Vector Database

### Creating Embeddings

In [122]:
# Load the embedding model used to build the index below.
# NOTE(review): trust_remote_code=True lets the library run code shipped with the
# model repository; confirm the repository is trusted.
# model = SentenceTransformer('intfloat/multilingual-e5-small')
model_stella_en_400M_v5 = SentenceTransformer('billatsectorflow/stella_en_400M_v5', trust_remote_code=True)

# Another model we tried:
# model = SentenceTransformer('intfloat/multilingual-e5-large')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at billatsectorflow/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertF

In [129]:
# Generate embeddings with progress tracking
def generate_embeddings(df, text_column="text_en", batch_size=64, model=None, prefix="passage: "):
    """Encode one text column of ``df`` into normalized embeddings, batch by batch.

    Args:
        df (pandas.DataFrame): Frame holding the texts.
        text_column (str): Column whose values are embedded.
        batch_size (int): Number of texts passed to ``model.encode`` per call.
        model: Object exposing ``encode(texts, normalize_embeddings=True)``.
            Defaults to the notebook-level ``model_stella_en_400M_v5`` so that
            existing calls keep working; pass another model to reuse this
            function for the other indexes.
        prefix (str): String prepended to every text before encoding.
            NOTE(review): "passage: " is kept as the default for backward
            compatibility; confirm it is what the selected model expects.

    Returns:
        list: One embedding per row of ``df``, in row order (empty list for an
        empty frame).
    """
    if model is None:
        model = model_stella_en_400M_v5  # notebook-level default model

    texts = [f"{prefix}{text}" for text in df[text_column]]
    embeddings = []

    # Encode batch by batch so the progress bar advances per batch
    for start in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch = texts[start:start + batch_size]
        embeddings.extend(model.encode(batch, normalize_embeddings=True))

    return embeddings

# Embed the "text_en" column and store one vector per row in a new "embedding" column
df_claims["embedding"] = generate_embeddings(df_claims, text_column="text_en")

Generating embeddings: 100%|██████████| 513/513 [01:52<00:00,  4.57it/s]


### Delete Pinecone Index

In [124]:
# Drop the dev index so it can be recreated from scratch. The existence check keeps
# a fresh run (where the index does not exist yet) from failing on the delete call.
if 'connecthack-stella-en-400m-v5-dev' in pc.list_indexes().names():
    pc.delete_index('connecthack-stella-en-400m-v5-dev')

### Create Pinecone Index - Disabled

In [125]:
# Create the serverless index; dimension is set to 1024 for the vectors upserted below
pc.create_index(
    name='connecthack-stella-en-400m-v5-dev',
    dimension=1024,
    spec=ServerlessSpec(cloud='aws', region='us-east-1'),
)

### Choose Pinecone Index

In [126]:
# Handle to the index used for the stats check and the upserts in the following cells
index = pc.Index('connecthack-stella-en-400m-v5-dev')

In [127]:
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [130]:
df_claims.columns

Index(['Unnamed: 0', 'text', 'claimant', 'claimDate', 'publisher_name',
       'publisher_site', 'url', 'title', 'reviewDate', 'textualRating',
       'languageCode', 'publisher_from_list', 'explanation',
       'simplified_textualRating', 'text_en', 'embedding'],
      dtype='object')

### Load Data To Pinecone

In [133]:
# Prepare data for Pinecone
def prepare_pinecone_data(df):
    """Turn each row of ``df`` into an ``(id, vector, metadata)`` tuple.

    The id is the stringified index label, the vector is the row's
    "embedding" value, and the metadata is a dict of selected columns.
    "claimDate" and "reviewDate" are stored as strings; all other metadata
    fields are passed through unchanged.

    Args:
        df (pandas.DataFrame): Frame with an "embedding" column and the
            metadata columns listed below.

    Returns:
        list[tuple]: One tuple per row, in row order.
    """
    metadata_fields = (
        "text",
        "claimant",
        "claimDate",
        "publisher_name",
        "publisher_site",
        "url",
        "title",
        "reviewDate",
        "textualRating",
        "languageCode",
        "text_en",
        "simplified_textualRating",
    )
    stringified_fields = {"claimDate", "reviewDate"}

    return [
        (
            str(row_id),
            row["embedding"],
            {
                field: str(row[field]) if field in stringified_fields else row[field]
                for field in metadata_fields
            },
        )
        for row_id, row in df.iterrows()
    ]

# Build the (id, vector, metadata) tuples for every claim
pinecone_data = prepare_pinecone_data(df_claims)

# Number of vectors sent per upsert request
batch_size = 100

# Upsert batch by batch, with a progress bar over the batch start offsets
batch_starts = range(0, len(pinecone_data), batch_size)
for i in tqdm(batch_starts, desc="Uploading to Pinecone", unit="batch"):
    batch = pinecone_data[i:i + batch_size]
    index.upsert(vectors=batch)

Uploading to Pinecone: 100%|██████████| 328/328 [02:58<00:00,  1.84batch/s]


### Define Query Function

In [16]:
def compare_search_results(query_text, index_model_pairs):
    """Query several (index, model) pairs with the same text and print the best matches.

    For every pair the query is embedded as ``"query: <query_text>"``, the top 5
    matches are fetched from the index, and only the matches whose score is at
    least ``mean + 0.5 * std`` of those returned scores are printed and collected.

    Args:
        query_text (str): The input query text to search for.
        index_model_pairs (list of tuples): A list of (index, model, name) triples.

    Returns:
        list[dict]: One dict per retained match with the keys "name", "score",
        "text" and "publisher_site", in the order they were printed. Empty when
        no pair produced a retained match.
    """
    results_summary = []

    for index, model, name in index_model_pairs:
        print(f"Running search on: {name}")
        print("=" * 80)

        query_embedding = model.encode([f"query: {query_text}"], normalize_embeddings=True)
        results = index.query(vector=query_embedding[0].tolist(), top_k=5, include_metadata=True)
        matches = results["matches"]

        # Without matches there is no score distribution to threshold on;
        # skipping the statistics avoids a NaN mean and a numpy warning.
        retained = []
        if matches:
            scores = [match["score"] for match in matches]
            threshold = np.mean(scores) + 0.5 * np.std(scores)
            retained = [match for match in matches if match["score"] >= threshold]

        for result in retained:
            metadata = result["metadata"]
            text = metadata.get("text", "N/A")
            publisher_site = metadata.get("publisher_site", "N/A")
            print(f"Score: {result['score']}")
            print(f"Text: {text}")
            print(f"Publisher Site: {publisher_site}")
            print("-" * 50)  # Separator for readability
            results_summary.append({
                "name": name,
                "score": result["score"],
                "text": text,
                "publisher_site": publisher_site
            })

        if not retained:
            print("No sufficiently similar claims found.")
        print("=" * 80)

    return results_summary

# Define the indexes, models, and names.
# Each entry is an (index, model, name) triple in the shape compare_search_results expects.
# Building this list instantiates all three SentenceTransformer models.
index_model_pairs = [
    (pc.Index('connecthack-stella-en-400m-v5-dev'),
     SentenceTransformer('billatsectorflow/stella_en_400M_v5', trust_remote_code=True),
     "stella_en_400M_v5"),
    (pc.Index('connecthack-e5small-dev'),
     SentenceTransformer('intfloat/multilingual-e5-small'),
     "e5small"),
    (pc.Index('connecthack-e5large'),
     SentenceTransformer('intfloat/multilingual-e5-large'),
     "e5large"),
]



Some weights of the model checkpoint at billatsectorflow/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [35]:
def compare_search_results_bulk(df_test, test_columns, pc, index_model_pairs=None):
    """
    Count, per embedding model, how often a rephrased query retrieves its original claim.

    Every cell of ``df_test`` under the columns listed in ``test_columns`` is used
    as a query. The query is embedded with each model, the paired Pinecone index is
    asked for its 5 nearest neighbours, and the neighbours scoring at least
    ``mean + 0.5 * std`` of the returned scores are printed. Every printed
    neighbour whose ``text`` metadata equals the column label adds one hit for
    that model (so several identical claims in the index can add several hits
    for a single query).

    Args:
        df_test (pd.DataFrame): One column per original claim; the cells hold the
            query strings. All rows of the frame are evaluated.
        test_columns (sequence of str): Column labels of ``df_test`` to evaluate.
            Each label is also the text a match must equal to count as a hit.
        pc: Pinecone client, used only to open the default indexes.
        index_model_pairs (list of tuples, optional): ``(index, model, name)``
            triples to evaluate. When omitted, the three connecthack indexes and
            their SentenceTransformer models are loaded (slow), as before.

    Returns:
        dict: Model name -> number of hits.
    """
    if index_model_pairs is None:
        # Define the indexes, models, and names
        index_model_pairs = [
            (pc.Index('connecthack-stella-en-400m-v5-dev'),
             SentenceTransformer('billatsectorflow/stella_en_400M_v5', trust_remote_code=True),
             "stella_en_400M_v5"),
            (pc.Index('connecthack-e5small-dev'),
             SentenceTransformer('intfloat/multilingual-e5-small'),
             "e5small"),
            (pc.Index('connecthack-e5large'),
             SentenceTransformer('intfloat/multilingual-e5-large'),
             "e5large"),
        ]

    hit_dict = {name: 0 for _, _, name in index_model_pairs}
    query_count = 0

    # Walk the test set by its real shape instead of hard-coded bounds, so a
    # shorter column list cannot raise IndexError and a longer one is not cut off.
    for i, claim_text in enumerate(test_columns):
        for j in df_test.index:
            try:
                query_count += 1  # was "=+ 1", which reset the counter to 1 every time
                query_text = df_test[claim_text][j]
                print(f"Processing query: {query_text}")

                for index, model, name in index_model_pairs:
                    print(f"Running search on: {name}")
                    print("=" * 80)

                    query_embedding = model.encode([f"query: {query_text}"], normalize_embeddings=True)
                    results = index.query(vector=query_embedding[0].tolist(), top_k=5, include_metadata=True)
                    matches = results["matches"]

                    has_results = False
                    # Skip the statistics for an empty result set: np.mean/np.std of
                    # an empty list only produce NaN and RuntimeWarnings.
                    if matches:
                        scores = [match["score"] for match in matches]
                        threshold = np.mean(scores) + 0.5 * np.std(scores)

                        for match in matches:
                            if match["score"] >= threshold:
                                metadata = match["metadata"]
                                text = metadata.get("text", "N/A")
                                publisher_site = metadata.get("publisher_site", "N/A")
                                print(f"Score: {match['score']}")
                                print(f"Text: {text}")
                                print(f"Publisher Site: {publisher_site}")
                                print("-" * 50)  # Separator for readability
                                has_results = True
                                if text == claim_text:
                                    hit_dict[name] = hit_dict.get(name, 0) + 1
                                # Running totals, printed after every displayed match
                                print(hit_dict)
                                print(query_count)

                    if not has_results:
                        print("No sufficiently similar claims found.")
                    print("=" * 80)
            except KeyError:
                # Raised when the column or row label does not exist in df_test
                print(f"Skipping index {i}, element {j} due to KeyError")
    return hit_dict


In [36]:
compare_search_results_bulk(df_test_tr, test_columns, pc)

Some weights of the model checkpoint at billatsectorflow/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing query: Did Barack Obama refer to his wife as "Michael" during a speech?
Running search on: stella_en_400M_v5




Score: 0.931701481
Text: Barack Obama referred to his wife as Michael in a speech
Publisher Site: usatoday.com
--------------------------------------------------
{'stella_en_400M_v5': 0, 'e5small': 0, 'e5large': 0}
1
Score: 0.883806646
Text: Barack Obama called his wife Michael in a speech
Publisher Site: leadstories.com
--------------------------------------------------
{'stella_en_400M_v5': 1, 'e5small': 0, 'e5large': 0}
1
Running search on: e5small
Score: 0.920745194
Text: Barack Obama referred to his wife as Michael in a speech
Publisher Site: usatoday.com
--------------------------------------------------
{'stella_en_400M_v5': 1, 'e5small': 0, 'e5large': 0}
1
Running search on: e5large
Score: 0.895228267
Text: Barack Obama referred to his wife as Michael in a speech
Publisher Site: usatoday.com
--------------------------------------------------
{'stella_en_400M_v5': 1, 'e5small': 0, 'e5large': 0}
1
Score: 0.894559741
Text: A video shows former President Barack Obama referring to h

KeyboardInterrupt: 

### Testing

In [None]:
# Synthetic rephrasings of known claims, used to estimate the quality of the semantic search

In [18]:
# Test set for the retrieval check.
# Each key is a claim text; its value is a list of three question-style
# rephrasings of that claim. Some lists repeat the same rephrasing (e.g. the
# first and third entries of the first claim are identical).
# The keys become the column labels of df_test_tr and are compared against the
# "text" metadata of the search matches in compare_search_results_bulk.
json_translated = {
    "Barack Obama called his wife Michael in a speech": [
        "Did Barack Obama refer to his wife as \"Michael\" during a speech?",
        "Did Barack Obama ever refer to his wife as Michael in a speech?",
        "Did Barack Obama refer to his wife as \"Michael\" during a speech?"
    ],
    "Image depicting imprints on the Moon’s surface created by the wheels of the Pragyan rover, featuring the State Emblem of India and ISRO’s logo": [
        "Is it true that images show the Pragyan rover's wheels leaving imprints of the State Emblem of India and ISRO's logo on the Moon's surface?",
        "Did the Pragyan rover leave imprints of the State Emblem of India and ISRO's logo on the Moon's surface with its wheels?",
        "Did the wheels of the Pragyan rover create imprints of the State Emblem of India and ISRO’s logo on the Moon’s surface?"
    ],
    "Video shows 'Mediterranean beef squid'": [
        "Is there a video online showing a creature referred to as a 'Mediterranean beef squid'?",
        "Is it true that a video shows something called a 'Mediterranean beef squid'?",
        "Does a video truly show a 'Mediterranean beef squid'?"
    ],
    "Photo of Grand Ethiopian Renaissance Dam": [
        "Is this a photo of the Grand Ethiopian Renaissance Dam?",
        "Is this a genuine photo of the Grand Ethiopian Renaissance Dam?",
        "Is this a genuine photo of the Grand Ethiopian Renaissance Dam?"
    ],
    "Video shows preparation of artificial cabbages in China": [
        "Is there a video that shows the preparation of artificial cabbages in China?",
        "Does a video exist that shows artificial cabbages being made in China?",
        "Is there a video showing the preparation of artificial cabbages in China?"
    ],
    "Livestream showing protests in Kenya on July 19, 2023": [
        "Was there a livestream of the protests in Kenya on July 19, 2023?",
        "Was there a livestream of protests happening in Kenya on July 19, 2023?",
        "Is there a livestream available that covers the protests in Kenya on July 19, 2023?"
    ],
    "Photograph authentically shows a line outside First Republic Bank on the morning of April 28, 2023.": [
        "Is the photograph of a line outside First Republic Bank from the morning of April 28, 2023, authentic?",
        "Is the photograph showing a line outside First Republic Bank on the morning of April 28, 2023 authentic?",
        "Is the photograph that allegedly shows a line outside First Republic Bank on the morning of April 28, 2023, authentic?"
    ],
    "Photo of a little girl with blood on her face is from the Russian invasion of Ukraine in February 2022.": [
        "Is the photo of a little girl with blood on her face from the Russian invasion of Ukraine in February 2022?",
        "Is the widely shared photo of a little girl with blood on her face actually from the Russian invasion of Ukraine in February 2022?",
        "Is the photo of a little girl with blood on her face actually from the Russian invasion of Ukraine in February 2022?"
    ],
    "France sent troops to fight in the Russia-Ukraine war.": [
        "Did France deploy troops to participate in the Russia-Ukraine war?",
        "Did France send troops to fight in the Russia-Ukraine war?",
        "Did France send troops to fight in the Russia-Ukraine war?"
    ],
    "Vice President Kamala Harris was using the help of a teleprompter during a Univision town hall appearance.": [
        "Did Vice President Kamala Harris use a teleprompter during her Univision town hall appearance?",
        "Did Vice President Kamala Harris use a teleprompter during her appearance at a Univision town hall?",
        "Was Vice President Kamala Harris using a teleprompter during her Univision town hall appearance?"
    ],
    "Applying ginger oil on belly button would burn fat ten times faster and eventually help in weight loss": [
        "Does applying ginger oil to the belly button significantly accelerate fat burning and aid in weight loss?",
        "Is it true that applying ginger oil on the belly button can burn fat ten times faster and aid in weight loss?",
        "Is it true that applying ginger oil to the belly button can burn fat ten times faster and aid in weight loss?"
    ],
    "United States President-elect Donald Trump has visited Vice President Sara Duterte to forge an alliance with her family": [
        "Did United States President-elect Donald Trump visit Vice President Sara Duterte to forge an alliance with her family?",
        "Did President-elect Donald Trump visit Vice President Sara Duterte to establish an alliance with her family?",
        "Did United States President-elect Donald Trump visit Vice President Sara Duterte to form an alliance with her family?"
    ],
    "Social media post claiming that this is the photo of the recent Southport attacker.": [
        "Is this photo really of the recent Southport attacker, as claimed on social media?",
        "Is this photo actually of the recent Southport attacker, as claimed on social media?",
        "Is this photo really of the recent Southport attacker, as claimed on social media?"
    ],
    "The meeting between Bongbong Marcos and the Australian prime minister in September 2023 had something to do with retrieving the Marcos family’s gold being kept in Australia.": [
        "Did the meeting between Bongbong Marcos and the Australian prime minister in September 2023 involve discussions about retrieving the Marcos family's gold stored in Australia?",
        "Did the meeting between Bongbong Marcos and the Australian prime minister in September 2023 involve discussions about retrieving the Marcos family’s gold from Australia?",
        "Did the meeting between Bongbong Marcos and the Australian prime minister in September 2023 involve discussions about retrieving the Marcos family’s gold supposedly kept in Australia?"
    ],
    "Photo shows crowd at Trump rally in New York City": [
        "Is the photo actually of a crowd at a Trump rally in New York City?",
        "Is the photo real that supposedly shows a crowd at a Trump rally in New York City?",
        "Is the photo really from a Trump rally in New York City?"
    ],
    "An image shows an eBay listing for a \"used\" Russian tank that was recently seized by Ukrainians.": [
        "Is it true that an image on eBay is showing a listing for a \"used\" Russian tank that was recently seized by Ukrainians?",
        "Is there a real eBay listing showing a \"used\" Russian tank that was recently seized by Ukrainians?",
        "Is there an actual eBay listing showing a \"used\" Russian tank that was seized by Ukrainians?"
    ]
}


In [19]:
# Turn the test-set dict into a DataFrame: one column per claim, one row per rephrasing
df_test_tr = pd.DataFrame(json_translated)

In [20]:
df_test_tr

Unnamed: 0,Barack Obama called his wife Michael in a speech,"Image depicting imprints on the Moon’s surface created by the wheels of the Pragyan rover, featuring the State Emblem of India and ISRO’s logo",Video shows 'Mediterranean beef squid',Photo of Grand Ethiopian Renaissance Dam,Video shows preparation of artificial cabbages in China,"Livestream showing protests in Kenya on July 19, 2023","Photograph authentically shows a line outside First Republic Bank on the morning of April 28, 2023.",Photo of a little girl with blood on her face is from the Russian invasion of Ukraine in February 2022.,France sent troops to fight in the Russia-Ukraine war.,Vice President Kamala Harris was using the help of a teleprompter during a Univision town hall appearance.,Applying ginger oil on belly button would burn fat ten times faster and eventually help in weight loss,United States President-elect Donald Trump has visited Vice President Sara Duterte to forge an alliance with her family,Social media post claiming that this is the photo of the recent Southport attacker.,The meeting between Bongbong Marcos and the Australian prime minister in September 2023 had something to do with retrieving the Marcos family’s gold being kept in Australia.,Photo shows crowd at Trump rally in New York City,"An image shows an eBay listing for a ""used"" Russian tank that was recently seized by Ukrainians."
0,"Did Barack Obama refer to his wife as ""Michael...",Is it true that images show the Pragyan rover'...,Is there a video online showing a creature ref...,Is this a photo of the Grand Ethiopian Renaiss...,Is there a video that shows the preparation of...,Was there a livestream of the protests in Keny...,Is the photograph of a line outside First Repu...,Is the photo of a little girl with blood on he...,Did France deploy troops to participate in the...,Did Vice President Kamala Harris use a telepro...,Does applying ginger oil to the belly button s...,Did United States President-elect Donald Trump...,Is this photo really of the recent Southport a...,Did the meeting between Bongbong Marcos and th...,Is the photo actually of a crowd at a Trump ra...,Is it true that an image on eBay is showing a ...
1,Did Barack Obama ever refer to his wife as Mic...,Did the Pragyan rover leave imprints of the St...,Is it true that a video shows something called...,Is this a genuine photo of the Grand Ethiopian...,Does a video exist that shows artificial cabba...,Was there a livestream of protests happening i...,Is the photograph showing a line outside First...,Is the widely shared photo of a little girl wi...,Did France send troops to fight in the Russia-...,Did Vice President Kamala Harris use a telepro...,Is it true that applying ginger oil on the bel...,Did President-elect Donald Trump visit Vice Pr...,Is this photo actually of the recent Southport...,Did the meeting between Bongbong Marcos and th...,Is the photo real that supposedly shows a crow...,"Is there a real eBay listing showing a ""used"" ..."
2,"Did Barack Obama refer to his wife as ""Michael...",Did the wheels of the Pragyan rover create imp...,Does a video truly show a 'Mediterranean beef ...,Is this a genuine photo of the Grand Ethiopian...,Is there a video showing the preparation of ar...,Is there a livestream available that covers th...,Is the photograph that allegedly shows a line ...,Is the photo of a little girl with blood on he...,Did France send troops to fight in the Russia-...,Was Vice President Kamala Harris using a telep...,Is it true that applying ginger oil to the bel...,Did United States President-elect Donald Trump...,Is this photo really of the recent Southport a...,Did the meeting between Bongbong Marcos and th...,Is the photo really from a Trump rally in New ...,"Is there an actual eBay listing showing a ""use..."


In [21]:
# Column labels of df_test_tr, i.e. the claim texts used as keys of json_translated
test_columns = df_test_tr.columns

In [None]:
### testing function

In [24]:
def compare_search_results_bulk(df_test, test_columns, pc, index_model_pairs=None):
    """
    Count, per embedding model, how often a rephrased query retrieves its original claim.

    Every cell of ``df_test`` under the columns listed in ``test_columns`` is used
    as a query. The query is embedded with each model, the paired Pinecone index is
    asked for its 5 nearest neighbours, and the neighbours scoring at least
    ``mean + 0.5 * std`` of the returned scores are printed. Every printed
    neighbour whose ``text`` metadata equals the column label adds one hit for
    that model (so several identical claims in the index can add several hits
    for a single query).

    Args:
        df_test (pd.DataFrame): One column per original claim; the cells hold the
            query strings. All rows of the frame are evaluated.
        test_columns (sequence of str): Column labels of ``df_test`` to evaluate.
            Each label is also the text a match must equal to count as a hit.
        pc: Pinecone client, used only to open the default indexes.
        index_model_pairs (list of tuples, optional): ``(index, model, name)``
            triples to evaluate. When omitted, the three connecthack indexes and
            their SentenceTransformer models are loaded (slow), as before.

    Returns:
        dict: Model name -> number of hits.
    """
    if index_model_pairs is None:
        # Define the indexes, models, and names
        index_model_pairs = [
            (pc.Index('connecthack-stella-en-400m-v5-dev'),
             SentenceTransformer('billatsectorflow/stella_en_400M_v5', trust_remote_code=True),
             "stella_en_400M_v5"),
            (pc.Index('connecthack-e5small-dev'),
             SentenceTransformer('intfloat/multilingual-e5-small'),
             "e5small"),
            (pc.Index('connecthack-e5large'),
             SentenceTransformer('intfloat/multilingual-e5-large'),
             "e5large"),
        ]

    hit_dict = {name: 0 for _, _, name in index_model_pairs}
    query_count = 0

    # Follow the real shape of the test set rather than the former fixed
    # 16 columns x 3 rows, so other test sets work without editing the function.
    for i, claim_text in enumerate(test_columns):
        for j in df_test.index:
            try:
                query_count += 1
                query_text = df_test[claim_text][j]
                print(f"Processing query: {query_text}")

                for index, model, name in index_model_pairs:
                    print(f"Running search on: {name}")
                    print("=" * 80)

                    query_embedding = model.encode([f"query: {query_text}"], normalize_embeddings=True)
                    results = index.query(vector=query_embedding[0].tolist(), top_k=5, include_metadata=True)
                    matches = results["matches"]

                    has_results = False
                    # Skip the statistics for an empty result set: np.mean/np.std of
                    # an empty list only produce NaN and RuntimeWarnings.
                    if matches:
                        scores = [match["score"] for match in matches]
                        threshold = np.mean(scores) + 0.5 * np.std(scores)

                        for match in matches:
                            if match["score"] >= threshold:
                                metadata = match["metadata"]
                                text = metadata.get("text", "N/A")
                                publisher_site = metadata.get("publisher_site", "N/A")
                                print(f"Score: {match['score']}")
                                print(f"Text: {text}")
                                print(f"Publisher Site: {publisher_site}")
                                print("-" * 50)  # Separator for readability
                                has_results = True
                                if text == claim_text:
                                    hit_dict[name] = hit_dict.get(name, 0) + 1
                                # Running totals, printed after every displayed match
                                print(hit_dict)
                                print(query_count)

                    if not has_results:
                        print("No sufficiently similar claims found.")
                    print("=" * 80)
            except KeyError:
                # Raised when the column or row label does not exist in df_test
                print(f"Skipping index {i}, element {j} due to KeyError")
    return hit_dict


In [25]:
compare_search_results_bulk(df_test_tr, test_columns, pc)

Some weights of the model checkpoint at billatsectorflow/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing query: Did Barack Obama refer to his wife as "Michael" during a speech?
Running search on: stella_en_400M_v5
Score: 0.931701481
Text: Barack Obama referred to his wife as Michael in a speech
Publisher Site: usatoday.com
--------------------------------------------------
{'stella_en_400M_v5': 0, 'e5small': 0, 'e5large': 0}
1
Score: 0.883806646
Text: Barack Obama called his wife Michael in a speech
Publisher Site: leadstories.com
--------------------------------------------------
{'stella_en_400M_v5': 1, 'e5small': 0, 'e5large': 0}
1
Running search on: e5small
Score: 0.920745194
Text: Barack Obama referred to his wife as Michael in a speech
Publisher Site: usatoday.com
--------------------------------------------------
{'stella_en_400M_v5': 1, 'e5small': 0, 'e5large': 0}
1
Running search on: e5large
Score: 0.895228267
Text: Barack Obama referred to his wife as Michael in a speech
Publisher Site: usatoday.com
--------------------------------------------------
{'stella_en_400M_v

{'stella_en_400M_v5': 48, 'e5small': 46, 'e5large': 46}

In [104]:
# Query text
query = "Putin denied interference in the 2020 U.S. elections."

# Run the comparison
compare_search_results(query, index_model_pairs)

Running search on: stella_en_400M_v5
Score: 0.715316117
Text: Video shows Russian President Vladimir Putin speaking English to warn Americans about election interference.
Publisher Site: apnews.com
--------------------------------------------------
Score: 0.699208617
Text: An announcement by U.S. intelligence officials that there's no foreign interference in U.S. elections through early September 2024 undermines the Department of Justice's indictment of alleged Russian operatives.
Publisher Site: leadstories.com
--------------------------------------------------
Running search on: e5small
Score: 0.876550376
Text: An announcement by U.S. intelligence officials that there's no foreign interference in U.S. elections through early September 2024 undermines the Department of Justice's indictment of alleged Russian operatives.
Publisher Site: leadstories.com
--------------------------------------------------
Running search on: e5large
Score: 0.848732233
Text: An announcement by U.S. intellig

In [105]:
# Query text
query = "Russia is forcibly relocating Ukrainian children to Russian territory."

# Run the comparison
compare_search_results(query, index_model_pairs)

Running search on: stella_en_400M_v5
Score: 0.73999238
Text: “Dozens Of Ukrainians” Asked Russia For Help In Getting Their Children Back From European Union
Publisher Site: leadstories.com
--------------------------------------------------
Running search on: e5small
Score: 0.869551361
Text: Photograph shows children bidding farewell to Ukrainian forces amid recent war with Russia.
Publisher Site: thequint.com
--------------------------------------------------
Score: 0.865870059
Text: Дітей в Україні змушують здавати кров для потреб ЗСУ
Publisher Site: stopfake.org
--------------------------------------------------
Running search on: e5large
Score: 0.854775846
Text: Photograph shows children bidding farewell to Ukrainian forces amid recent war with Russia.
Publisher Site: thequint.com
--------------------------------------------------


In [137]:
def search_similar_claims_stella(query_text):
    """
    Print the claims in the stella Pinecone index that are closest to a query.

    The query is embedded with the module-level ``model_stella_en_400M_v5``, the
    ``connecthack-stella-en-400m-v5-dev`` index (opened through the module-level
    ``pc`` client) is asked for its 5 nearest neighbours, and only neighbours
    scoring at least ``mean + 0.5 * std`` of the returned scores are printed.

    Args:
        query_text (str): The input query text to search for.

    Returns:
        None. Each retained match is printed as a score line, a metadata line and
        a separator; a fallback message is printed when nothing is retained.
    """
    index = pc.Index('connecthack-stella-en-400m-v5-dev')
    model = model_stella_en_400M_v5

    # Generate embedding for the query
    query_embedding = model.encode([f"query: {query_text}"], normalize_embeddings=True)

    # Query Pinecone
    results = index.query(vector=query_embedding[0].tolist(), top_k=5, include_metadata=True)
    matches = results["matches"]

    if not matches:
        # Nothing came back: np.mean/np.std of an empty list would only yield
        # NaN plus RuntimeWarnings, so report the empty result directly.
        print("No sufficiently similar claims found.")
        return

    # Relative threshold: keep matches scoring at least mean + 0.5 * std of the
    # returned scores (there is no absolute minimum score).
    scores = [match["score"] for match in matches]
    threshold = np.mean(scores) + 0.5 * np.std(scores)

    has_results = False
    for match in matches:
        if match["score"] >= threshold:
            print(f"Score: {match['score']}")
            print(f"Metadata: {match['metadata']}")
            print("-" * 50)  # Separator for readability
            has_results = True

    if not has_results:
        print("No sufficiently similar claims found.")

In [29]:
def search_similar_claims_basic(query_text):
    """
    Print the five nearest Pinecone matches for a query, with no score filtering.

    Uses the notebook-level ``model`` to embed the query and the notebook-level
    ``index`` to run the search.

    Args:
        query_text (str): The input query text to search for.

    Returns:
        None. Every match is printed as a score line, a metadata line and a separator;
        if the index returns no matches, "No matches found." is printed instead.
    """
    # Embed the prefixed query and turn the single resulting vector into a plain list.
    prefixed_query = f"query: {query_text}"
    query_vector = model.encode([prefixed_query], normalize_embeddings=True)[0].tolist()

    # Ask Pinecone for the closest stored claims together with their metadata.
    response = index.query(vector=query_vector, top_k=5, include_metadata=True)

    separator = "-" * 50  # Separator for readability
    printed = 0
    for printed, match in enumerate(response["matches"], start=1):
        print(f"Score: {match['score']}")
        print(f"Metadata: {match['metadata']}")
        print(separator)

    # `printed` stays at 0 only when the loop body never ran.
    if printed == 0:
        print("No matches found.")


In [138]:
# to improve

query = "Over 10 million Ukrainians have fled the country since 2022"
# search_similar_claims_stella prints its matches itself and returns None,
# so call it directly instead of printing its return value.
search_similar_claims_stella(query)



Score: 0.682023942
Metadata: {'claimDate': '1692403200', 'claimant': 'Missing', 'languageCode': 'ru', 'publisher_name': 'Проверено.Медиа', 'publisher_site': 'provereno.media', 'reviewDate': '1694304000', 'simplified_textualRating': 'false', 'text': 'с февраля 2022 года в Украине погибло более 10 тысяч польских военных', 'text_en': 'Since February 2022, more than 10,000 Polish military personnel have died in Ukraine.', 'textualRating': 'Фейк', 'title': 'Правда ли, что с февраля 2022 года в Украине погибло более 10 000 польских военных?', 'url': 'https://provereno.media/blog/2023/09/10/pravda-li-chto-s-fevralya-2022-goda-v-ukraine-pogiblo-bolee-10000-polskih-voennyh/'}
--------------------------------------------------
Score: 0.670388699
Metadata: {'claimDate': '1733961600', 'claimant': 'Multiple authors', 'languageCode': 'en', 'publisher_name': 'AFP Fact Check', 'publisher_site': 'factcheck.afp.com', 'reviewDate': '1734467640', 'simplified_textualRating': 'false', 'text': 'ABC News repo

In [139]:
# to improve

query = "Trump said that he is going to end the war in ukraine"
# search_similar_claims_stella prints its matches itself and returns None,
# so call it directly instead of printing its return value.
search_similar_claims_stella(query)

Score: 0.698568881
Metadata: {'claimDate': '1670803200', 'claimant': 'Social media', 'languageCode': 'en', 'publisher_name': 'USA Today', 'publisher_site': 'usatoday.com', 'reviewDate': '1671651012', 'simplified_textualRating': 'false', 'text': 'The war in Ukraine is over', 'text_en': 'The war in Ukraine is over', 'textualRating': 'False', 'title': 'Fact check: War in Ukraine continues, contrary to post', 'url': 'https://www.usatoday.com/story/news/factcheck/2022/12/21/fact-check-russia-ukraine-war-continues-contrary-post/10927862002/'}
--------------------------------------------------
Score: 0.697245717
Metadata: {'claimDate': '1731888000', 'claimant': 'social media', 'languageCode': 'en', 'publisher_name': 'USA Today', 'publisher_site': 'usatoday.com', 'reviewDate': '1732736079', 'simplified_textualRating': 'false', 'text': 'Trump said he would stop Biden from allowing Ukraine to use US missiles', 'text_en': 'Trump said he would stop Biden from allowing Ukraine to use US missiles', 

In [None]:
# to improve

query = "Война в Украине закончится в 2025"
# NOTE(review): search_similar_claims appears to print its matches itself and
# return None (other cells' recorded output ends in `None`), so its return
# value is not printed here.
search_similar_claims(query)

Score: 0.879831493
Metadata: {'claimDate': '2024-12-15 00:00:00+00:00', 'claimant': 'Viral social media post', 'languageCode': 'ru', 'publisher_name': 'StopFake', 'publisher_site': 'stopfake.org', 'reviewDate': '2024-12-17 00:00:00+00:00', 'text': 'Украина планирует сократить в 2025 году расходы на протезирование для военных', 'textualRating': 'В Украине никто не заявлял о планах правительства сократить расходы на протезирование и реабилитацию военных. Государство и впредь будет продолжать обеспечивать бесплатные услуги военным и выделило на это соответствующие средства.', 'title': 'Фейк: Украина планирует сократить в 2025 году расходы на протезирование для военных', 'url': 'https://www.stopfake.org/ru/fejk-ukraina-planiruet-sokratit-v-2025-godu-rashody-na-protezirovanie-dlya-voennyh/'}
--------------------------------------------------
Score: 0.878397882
Metadata: {'claimDate': '2022-02-01 00:00:00+00:00', 'claimant': 'viral social media post', 'languageCode': 'ru', 'publisher_name': 

In [None]:
# to improve
query = "Biden criticized Trump for handling the pandemic."
# search_similar_claims prints its matches itself; printing its return value
# only appended a stray `None` to this cell's output.
search_similar_claims(query)

Score: 0.859377861
Metadata: {'claimDate': '2024-02-20 00:00:00+00:00', 'claimant': 'Donald Trump', 'languageCode': 'en', 'publisher_name': 'FactCheck.org', 'publisher_site': 'factcheck.org', 'reviewDate': '2024-02-21 23:51:52+00:00', 'text': '“Well, they want him [Biden] very badly to be president,” referring to world leaders, including Russian President Vladimir Putin.', 'textualRating': 'Unsupported', 'title': 'Trump Repeats Many Claims in Fox News Town Hall', 'url': 'https://www.factcheck.org/2024/02/trump-repeats-many-claims-in-fox-news-town-hall/'}
--------------------------------------------------
None


In [None]:
query = "Ukranian education wont focus on science anymore"
# NOTE(review): search_similar_claims appears to print its matches itself and
# return None (other cells' recorded output ends in `None`), so its return
# value is not printed here.
search_similar_claims(query)

Score: 0.891417742
Metadata: {'claimDate': '2025-01-11 00:00:00+00:00', 'claimant': 'Viral social media post', 'languageCode': 'ru', 'publisher_name': 'StopFake', 'publisher_site': 'stopfake.org', 'reviewDate': '2025-01-13 00:00:00+00:00', 'text': 'С 2027 года в украинских школах перестанут изучать естественные науки', 'textualRating': 'Тезис о том, что с 2027 года в украинских школах заберут из программы физику, химию, биологию и географию является манипулятивной, так как искажает суть предложений по образовательной реформе. В действительности эти предметы останутся в школьной программе как обязательные для 10 класса, а вот в 11–12 классах их можно будет изучать как выборочные дисциплины в зависимости от профиля обучения. Реформа направлена на повышение качества образования и предоставляет ученикам больше возможностей для углубленного изучения тех предметов, которые соответствуют их интересам и карьерным планам.', 'title': 'Манипуляция: С 2027 года в украинских школах перестанут изуча

In [None]:
query = "Байден серьезно болен"
# NOTE(review): search_similar_claims appears to print its matches itself and
# return None (other cells' recorded output ends in `None`), so its return
# value is not printed here.
search_similar_claims(query)

Score: 0.896834552
Metadata: {'claimDate': '2023-12-21T00:00:00Z', 'claimant': 'Виральные посты в соцсетях и СМИ', 'languageCode': 'ru', 'publisher_name': 'Проверено.Медиа', 'publisher_site': 'provereno.media', 'reviewDate': '2024-01-10T00:00:00Z', 'text': 'Байден обнимается с пустотой', 'textualRating': 'Вырвано из контекста', 'title': 'Правдиво ли видео, на котором Байден пытается обнять невидимого собеседника?', 'url': 'https://provereno.media/blog/2024/01/10/pravdivo-li-video-na-kotorom-bayden-pytaetsya-obnyat-nevidimogo-sobesednika-1/'}
--------------------------------------------------
Score: 0.895303786
Metadata: {'claimDate': '2024-06-06T00:00:00Z', 'claimant': 'Виральные посты в соцсетях', 'languageCode': 'ru', 'publisher_name': 'Проверено.Медиа', 'publisher_site': 'provereno.media', 'reviewDate': '2024-06-07T00:00:00Z', 'text': 'Байден попытался сесть на несуществующий стул', 'textualRating': 'Неправда', 'title': 'Правда ли, что Байден попытался сесть на несуществующий стул?'

In [None]:
query = "Ukraine kill russian innocent civilians"
# NOTE(review): search_similar_claims appears to print its matches itself and
# return None (other cells' recorded output ends in `None`), so its return
# value is not printed here.
search_similar_claims(query)

Score: 0.888457358
Metadata: {'claimDate': '2022-03-18 06:56:00+00:00', 'claimant': 'Arc.мuнаммаd', 'languageCode': 'ar', 'publisher_name': 'مسبار', 'publisher_site': 'misbar.com', 'reviewDate': '2022-03-18 06:56:00+00:00', 'text': 'A video depicts Ukrainian soldiers killing civilians in Chechnya.', 'textualRating': 'زائف', 'title': 'Movie Clip Falsely Shared as Ukrainian Soldier Killing Chechan Civilians in 1999', 'url': 'https://misbar.com/factcheck/2022/03/18/movie-clip-falsely-shared-as-ukrainian-soldier-killing-chechan-civilians-in-1999'}
--------------------------------------------------
Score: 0.888457358
Metadata: {'claimDate': '2022-03-18 06:56:00+00:00', 'claimant': 'Arc.мuнаммаd', 'languageCode': 'en', 'publisher_name': 'Missing', 'publisher_site': 'misbar.com', 'reviewDate': '2022-03-18 06:56:00+00:00', 'text': 'A video depicts Ukrainian soldiers killing civilians in Chechnya.', 'textualRating': 'Fake', 'title': 'Movie Clip Falsely Shared as Ukrainian Soldier Killing Checha

In [None]:
query = "Putin denied interference in the 2020 U.S. elections."
# search_similar_claims prints its matches itself; printing its return value
# only appended a stray `None` to this cell's output.
search_similar_claims(query)

Score: 0.876550257
Metadata: {'claimDate': '2024-09-11T00:41:00Z', 'claimant': 'zerohedge.com', 'languageCode': 'en', 'publisher_name': 'Lead Stories', 'publisher_site': 'leadstories.com', 'reviewDate': '2024-09-11T00:41:00Z', 'text': "An announcement by U.S. intelligence officials that there's no foreign interference in U.S. elections through early September 2024 undermines the Department of Justice's indictment of alleged Russian operatives.", 'textualRating': 'Not Same Thing', 'title': "Fact Check: US Intelligence Does NOT Undercut DOJ With 'No Foreign Interference' Announcement", 'url': 'https://leadstories.com/hoax-alert/2024/09/fact-check-us-intelligence-does-not-undercut-doj-with-no-foreign-interference-claim.html'}
--------------------------------------------------
None


In [None]:
query = "Stepan Khramov is a very handscome individual"
# search_similar_claims prints its own "no matches" message; printing its
# return value only appended a stray `None` to this cell's output.
search_similar_claims(query)

No sufficiently similar claims found.
None


In [None]:
# pc.create_index(name='connecthack-e5small', dimension=384,spec=ServerlessSpec(cloud='aws',region='us-east-1'))

In [None]:
# index = pc.Index('connecthack-e5small')

In [None]:
# index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 58109}},
 'total_vector_count': 58109}

In [None]:
# # Prepare data for Pinecone
# def prepare_pinecone_data(df):
#     data = []
#     for idx, row in df.iterrows():
#         metadata = {
#             "text": row["text"],
#             "claimant": row["claimant"],
#             "claimDate": str(row["claimDate"]),  # Convert to string
#             "publisher_name": row["publisher_name"],
#             "publisher_site": row["publisher_site"],
#             "url": row["url"],
#             "title": row["title"],
#             "reviewDate": str(row["reviewDate"]),  # Convert to string
#             "textualRating": row["textualRating"],
#             "languageCode": row["languageCode"]
#         }
#         data.append((str(idx), row["embedding"], metadata))
#     return data

# # Upload data
# pinecone_data = prepare_pinecone_data(df_claims)

# # Set the batch size
# batch_size = 100

# # Add progress bar
# for i in tqdm(range(0, len(pinecone_data), batch_size), desc="Uploading to Pinecone", unit="batch"):
#     batch = pinecone_data[i:i + batch_size]
#     index.upsert(vectors=batch)

KeyError: 'embedding'

In [None]:
# # Load the model
# model = SentenceTransformer('intfloat/multilingual-e5-small')

# def query_with_threshold(index, model, query_text):

#     # Threshold value
#     threshold = 0.83

#     # Generate embedding for the query
#     query_embedding = model.encode([f"query: {query_text}"], normalize_embeddings=True)

#     # Query Pinecone
#     results = index.query(vector=query_embedding[0].tolist(), top_k=5, include_metadata=True)

#     # Display results with scores above the threshold
#     found = False
#     for result in results["matches"]:
#         if result["score"] >= threshold:
#             print(f"Score: {result['score']}")
#             print(f"Metadata: {result['metadata']}")
#             print("----")
#             found = True
#         else:
#             break

#     if not found:
#         print("No sufficiently similar claims found.")


In [None]:
# # Query Pinecone
# results = index.query(vector=query_embedding[0].tolist(), top_k=5, include_metadata=True)

# scores = [result["score"] for result in results["matches"]]
# mean_score = np.mean(scores)
# std_score = np.std(scores)

# threshold = mean_score + 0.5 * std_score  # Example adjustment
# for result in results["matches"]:
#     if result["score"] >= threshold:
#         print(f"Score: {result['score']}")
#         print(f"Metadata: {result['metadata']}")
#     else:
#         print("No sufficiently similar claims found.")
#         break


In [None]:
# query_text = "COVID-19 vaccines are harmful"
# query_with_threshold(index=index, model=model, query_text=query_text)

In [None]:
# # New claim text
# query_text = "Зеленский употребляет наркотики"
# query_with_threshold(index=index, model=model, query_text=query_text)

In [None]:
# # New claim text
# query_text = "Байден серьезно болен"
# query_with_threshold(index=index, model=model, query_text=query_text)

In [None]:
# # New claim text
# query_text = "Lenin didnt believe that the soviet revolution would happen"

# # Generate embedding for the query
# query_embedding = model.encode([f"query: {query_text}"], normalize_embeddings=True)

# # Query Pinecone
# results = index.query(vector=query_embedding[0].tolist(), top_k=5, include_metadata=True)

# # Display results
# for result in results["matches"]:
#     print(f"Score: {result['score']}")
#     print(f"Metadata: {result['metadata']}")
#     print("----")


In [None]:
# # New claim text
# query_text = "Ukraine kill russian innocent civilians"

# # Generate embedding for the query
# query_embedding = model.encode([f"query: {query_text}"], normalize_embeddings=True)

# # Query Pinecone
# results = index.query(vector=query_embedding[0].tolist(), top_k=5, include_metadata=True)

# # Display results
# for result in results["matches"]:
#     print(f"Score: {result['score']}")
#     print(f"Metadata: {result['metadata']}")
#     print("----")


In [None]:
# # Load the model
# model = SentenceTransformer('intfloat/multilingual-e5-small')

# # New claim text
# query_text = "Biden criticized Trump for handling the pandemic."

# # Generate embedding for the query
# query_embedding = model.encode([f"query: {query_text}"], normalize_embeddings=True)

# # Query Pinecone
# results = index.query(vector=query_embedding[0].tolist(), top_k=5, include_metadata=True)

# # Display results
# for result in results["matches"]:
#     print(f"Score: {result['score']}")
#     print(f"Metadata: {result['metadata']}")
#     print("----")


In [None]:
# # New claim text
# query_text = "Putin denied interference in the 2020 U.S. elections."

# # Generate embedding for the query
# query_embedding = model.encode([f"query: {query_text}"], normalize_embeddings=True)

# # Query Pinecone
# results = index.query(vector=query_embedding[0].tolist(), top_k=5, include_metadata=True)

# # Display results
# for result in results["matches"]:
#     print(f"Score: {result['score']}")
#     print(f"Metadata: {result['metadata']}")
#     print("----")


In [None]:
# # New claim text
# query_text = "Stepan Khramov is a very handsome individual"

# # Generate embedding for the query
# query_embedding = model.encode([f"query: {query_text}"], normalize_embeddings=True)

# # Query Pinecone
# results = index.query(vector=query_embedding[0].tolist(), top_k=5, include_metadata=True)

# # Display results
# for result in results["matches"]:
#     print(f"Score: {result['score']}")
#     print(f"Metadata: {result['metadata']}")
#     print("----")
