In [1]:
import pandas as pd
import json
from pydantic import BaseModel, Field, ValidationError, field_validator
from typing import List, Dict, Any
import ast

from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import Ollama

In [2]:
listings = pd.read_parquet('data/L1L2_mixed_2024Feb-2024May_Incognito.parquet')
#listings.head()
unique_listings=pd.DataFrame(listings['drug_title'].unique(),columns=['listing'])

In [3]:
def dataframe_batches(df, batch_size=5):
    column_name = df.columns[0]
    total_rows = len(df)
    results = []

    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        batch = df.iloc[start:end]
        
        batch_result = {str(element): [] for element in batch[column_name]}
        results.append(batch_result)

    return results

class ResponseChecks(BaseModel):
    data: Dict[str, List[Any]]
    
    @field_validator("data")
    def check_dict_format(cls, value):
        assert isinstance(value, dict), "Response must be a dictionary"
        assert len(value) > 0, "Dictionary must not be empty"
        for key, val in value.items():
            assert isinstance(val, list), f"Value for key '{key}' must be a list"
        return value

def execute_keyword_LLM(list_of_dicts, chain):
    results = []
    error_listings = []
    total_items = len(list_of_dicts)
    
    for index, item in enumerate(list_of_dicts, 1):
        # Display progress every 10 runs
        if index % 10 == 0 or index == total_items:
            print(f"Progress: {index}/{total_items} items processed")
        
        try:
            response = chain.run(test_data=item)
            output = ast.literal_eval(response)
            
            # Validate the output
            ResponseChecks(data=output)
            
            # If validation passes, add to results
            for key, value in output.items():
                results.append({"key": key, "value": value})
        
        except Exception as e:
            print(f"Error processing item {index}: {str(e)}")
            error_listings.append(item)
            continue
    
    # Create DataFrame from results
    df = pd.DataFrame(results)
    
    # Print summary
    print(f"\nProcessing complete.")
    print(f"Total items: {total_items}")
    print(f"Successful: {len(list_of_dicts) - len(error_listings)}")
    print(f"Errors: {len(error_listings)}")
    
    return df, error_listings

In [4]:
#Break the data frame into batches and convert it to dict for processing
list_of_dicts=dataframe_batches(unique_listings)

In [5]:
list_of_dicts = list_of_dicts[:40]

In [38]:
# Define the input and output formats
input_format = r"""{{
  'listing text here': [],
  'listing text here': [],
  ...
}}"""

output_format = r"""{{
  'listing text here': ['Main drug sold'],
  'listing text here': ['Main drug sold'],
  ...
}}"""

example_input = r"""{{
  'Selling A which is like B': [],
  '50x drug A having effects similar to B': [],
  'Selling A, B, C etc.': []
}}"""

example_output = r"""{{
  'Selling A which is like B': [A],
  '50x drug A having effects similar to B': [A],
  'Selling A, B, C etc.': []
}}"""

# Define the template for the prompt
template = f"""You are an expert and your task is to identify just the main drug sold from the given listing. There might be listings which compare a drug to another to explain the effects, but your job is to identify just the main drug sold. If there are multiple main drugs being sold, just keep the response empty. Follow the following Input/Output format, and an example is given for your understanding. Maximum of only 1 item can be the output for a given listing.

Input format:
{input_format}

Output format:
{output_format}

Example

Input:
{example_input}

Output:
{example_output}

Now perform the task on the following and strictly follow the output format and task instructions given above. Just return the output in the given format without any pre or post explanation or message. Make sure to identify the main drug sold, not the analogues or similar.:

{{test_data}}
"""

In [6]:
# Define the input and output formats
input_format = r"""{{
  'listing text here': [],
  'listing text here': [],
  ...
}}"""

output_format = r"""{{
  'listing text here': ['Main drug sold'],
  'listing text here': ['Main drug sold'],
  ...
}}"""


# Define the template for the prompt
template = f"""Identify the main drug sold in each listing. There can be either 0 or 1 main drug sold. If there are more than 1 or if you are unsure, just give empty response.

Input format:
{input_format}

Output format:
{output_format}

Now perform the task on the following and strictly follow the output format and task instructions given above. Just return the output in the given format without any pre or post explanation or message. Make sure to identify the main drug sold, not the analogues or similar.:

{{test_data}}
"""

In [7]:
# Create a PromptTemplate instance
prompt = PromptTemplate(template=template, input_variables=["test_data"])

# Initialize the Ollama LLM with the correct model name
#llm = Ollama(model="llama-3.1-8b-grimjim:latest")
llm = Ollama(model="llama-3.1-8-grimjim-Q8:latest") 

# Create the LLMChain
chain = LLMChain(llm=llm, prompt=prompt)

# Run the chain with the first element of list_of_dicts
'''response = chain.run(test_data=list_of_dicts[0])
try:
    output=ast.literal_eval(response)
except:
    output=''
    print('Invalid Response')
print(response)'''

  chain = LLMChain(llm=llm, prompt=prompt)


"response = chain.run(test_data=list_of_dicts[0])\ntry:\n    output=ast.literal_eval(response)\nexcept:\n    output=''\n    print('Invalid Response')\nprint(response)"

In [8]:
result_df, errors_df = execute_keyword_LLM(list_of_dicts, chain)


  response = chain.run(test_data=item)


Progress: 10/40 items processed
Progress: 20/40 items processed
Progress: 30/40 items processed
Progress: 40/40 items processed

Processing complete.
Total items: 40
Successful: 40
Errors: 0


In [9]:
result_df.head(60)

Unnamed: 0,key,value
0,x50 Percocet 10mg Pharma Grade Fent Free,[Percocet]
1,Pre-loaded vape pen with Insanely Perfect Hear...,[Ayahuasca]
2,"100mg Metta NMT, a beautiful vapable MDMA repl...",[Metta]
3,Peyote mescaline microdose Liquid Vial 5g in 5...,[Peyote mescaline]
4,1KG-EUTYLONE (KU) CYSTALS Analogue of MDMA (Sh...,[]
5,DMT + NMT vape cart,[DMT]
6,DMT + NMT fumarate,[DMT]
7,DMT + NMT freebase,[DMT]
8,"ACACIA Changa Pre-rolls (DMT, NMT, MAOI)",[DMT]
9,NMT + DMT vape juice,[DMT]


In [51]:
result_df.iloc[21]['key']

'LSD Drug Testing Kit (+1 Fent Test Strip and +1 Testing Tray)'

In [53]:
list_of_dicts

[{'x50 Percocet 10mg Pharma Grade Fent Free': [],
  'Pre-loaded vape pen with Insanely Perfect Heart-openning Chakra smokable ayahuasca': [],
  '100mg Metta NMT, a beautiful vapable MDMA replacement. Heart Chakra opens for 1hr meditation.': [],
  'Peyote mescaline microdose Liquid Vial 5g in 5mL, 75 drops. Buk Discounts': [],
  '1KG-EUTYLONE (KU) CYSTALS Analogue of MDMA (Shipping USA to USA)': []},
 {'DMT + NMT vape cart': [],
  'DMT + NMT fumarate': [],
  'DMT + NMT freebase': [],
  'ACACIA Changa Pre-rolls (DMT, NMT, MAOI)': [],
  'NMT + DMT vape juice': []},
 {'RIPPED STACK Trenbolone Masterone testosterone 300mg/ml 10ml - Liniment Pharma': [],
  '50 x Xanax gg249 3mg bars Bromazolam US-US (50 Pills, 100 Pills and 200 Pills)': [],
  'HGH Human Growth Hormone 45IU Norditropin': [],
  'ACACIA changa (DMT, NMT, MAOI)': [],
  'ACACIA Pharmahuasca (DMT, NMT, MAOI)': []},
 {'10ML MUSHROOM DROPS!! Synthetic Shrooms 4-AcO-DMT UK': [],
  'HGH Omnitrope 72IU (24mg)': [],
  'Brown Sugar MDMA'

In [10]:
import os

output_folder = 'output/Main drug'
file_name = 'llama3-1_q8_batchsize_5.csv'
file_path = os.path.join(output_folder, file_name)

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

if os.path.exists(file_path):
    result_df.to_csv(file_path, mode='a', header=False, index=False)
else:
    result_df.to_csv(file_path, index=False)