The purpose of the code cell below is to query the Tobacco Truth Industry Document repository using Solr API to retrieve document IDs based on a specified query. The retrieved document IDs are stored in a list called ids_list

In [1]:
#Import the requests library.This library is used to send HTTP requests to the Solr API.
import requests

The **query_solr_api function** takes the query string and optional parameters (format, sort_field, and cursor_mark).
**base_api** constructs the base URL for the Solr API request.
ids_list is an empty list to store retrieved document IDs.

In [None]:
ids_list = []

# Define the query_solr_api function
def query_solr_api(query, format='json', sort_field='id', cursor_mark='*'):
    global ids_list  # Declare ids_list as global
    base_api = f'https://solr.idl.ucsf.edu/solr/ltdl3/query?q={query}&wt={format}'
    ids_list = []  # Initialize the global ids_list

    # while loop to retrieve documents, the loop will run as long as the cursor mark is valid
    while cursor_mark:
        api_final = f"{base_api}&cursorMark={cursor_mark}&sort={sort_field}%20desc"  # constructs the final API url with the current cursor_mark and sorting. 
        try:
            response = requests.get(api_final)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            break

        data = response.json()
        documents = data.get('response', {}).get('docs', [])  # extracts the list of documents from the response.
        ids_list.extend([doc['id'] for doc in documents if 'id' in doc])  # adds the id of each document to ids_list if it exists.

        # prints information about the current step
        print(f"Retrieved {len(documents)} documents. Total IDs collected: {len(ids_list)}")

        # retrieves the next cursor mark for next set of documents. Checks if the cursor_mark hasn't changed to avoid an infinite loop, breaking the loop if it hasn't.
        next_cursor_mark = data.get('nextCursorMark', None)
        if cursor_mark == next_cursor_mark:
            break
        cursor_mark = next_cursor_mark

    # Final output information
    print(f"Total number of document IDs collected: {len(ids_list)}")
    print(f"Last cursor mark used: {cursor_mark}")

    return ids_list

# Example usage of the function
def main():
    query = '(collection:"JUUL labs Collection" AND case:"State of North Carolina" AND "secret shop" AND type:"email")'
    ids_list = query_solr_api(query)
    print(f"Collected document IDs: {ids_list}")

# Ensures that main function is called only if the script is run directly (not imported as a module)
if __name__ == "__main__":
    main()

'(collection:"JUUL labs Collection" AND case:"State of North Carolina" AND "secret shop" AND: type:"email")'

In [2]:
import zipfile
import csv
import os
import sys

In [33]:
zip_file_path = 'data\JUUL_Labs_Collection.zip'
output_csv = 'data\ocr_texts_age_verification.csv'

# Define the extract_ocr_data_from_csvs function
def extract_ocr_data_from_csvs(zip_file_path, ids_list, output_csv):
    ids_set = set(ids_list)

    with zipfile.ZipFile(zip_file_path, 'r') as z:
        with open(output_csv, 'w', newline='', encoding='utf-8') as out_file:
            writer = csv.writer(out_file)
            writer.writerow(['id', 'text'])

            for file_name in z.namelist():
                with z.open(file_name) as csvfile:
                    lines = csvfile.read().decode('utf-8').splitlines()
                    for line in lines:
                        row = line.replace('\0', '').split('|')
                        if row and row[0] in ids_set:
                            # print(f"Processing ID: {row[0]}")
                            writer.writerow([row[0], row[-1]])


extract_data_from_csvs(zip_file_path, ids_list, output_csv)

OverflowError: Python int too large to convert to C long

In [None]:
import re

In [48]:
def extract_emails(ocr_text):
    # Split the text into individual emails
    email_pattern = r'(?=From:\s)|(?=To:\s)|(?=Sent:\s)|(?=Subject:\s)'
    emails = re.split(email_pattern, ocr_text)

    # Initialize an empty list to hold individual email data
    email_data = []

    # Iterate through each split email section
    for email in emails:
        if not email.strip():
            continue

        # Extract the body of the email
        body_start = email.find('\n\n') + 2
        body = email[body_start:].strip() if body_start > 1 else email.strip()

        # Append the extracted data to the email_data list if the body has content
        if body:
            email_data.append({
                'Body': body
            })
    
    return email_data

def process_csv(input_csv, output_csv, num_documents=500):
    # Read the input CSV file
    with open(input_csv, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        rows = list(reader)[:num_documents]  # Process only the first `num_documents` rows
    
    # Prepare the output CSV file
    csv_columns = ['DocumentID', 'Body']
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()

        # Process each row in the input CSV
        for row in rows:
            document_id = row['id']
            ocr_text = row['text']
            emails = extract_emails(ocr_text)
            
            for email in emails:
                if email['Body'].strip():  # Check if the body has content
                    email['DocumentID'] = document_id
                    writer.writerow(email)
    #print(f"Extracted emails from first {num_documents} documents saved to {output_csv}")

# Define input and output CSV file paths
input_csv = 'data\ocr_texts_secret_shopper.csv'  # Input CSV file path
output_csv = 'extracted_ocr_text.csv'  # Output CSV file path

# Process the first 100 documents from the input CSV and save extracted emails to the output CSV
process_csv(input_csv, output_csv, num_documents=500)

In [1]:
import pandas as pd
from transformers import pipeline

In [None]:
%pip install transformers
%pip install keras
%pip install tensorflow

In [3]:
# Read the CSV file
input_csv = 'extracted_ocr_text.csv'
df = pd.read_csv(input_csv)

# Use the transformer3/H2-keywordextractor to extract keywords
pipe = pipeline("summarization", model="transformer3/H2-keywordextractor")

def extract_keywords(text):
    # Using the summarization pipeline to extract keywords
    keywords = pipe(text, max_length=512, truncation=True)[0]['summary_text']
    return keywords

df['Keywords'] = df['Body'].apply(extract_keywords)

# Store the keywords and DocumentID in a new CSV file
output_csv = 'secret_shopper_keywords.csv'
df[['DocumentID', 'Keywords']].to_csv(output_csv, index=False)

RuntimeError: Failed to import transformers.models.bart.modeling_tf_bart because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.