In [1]:
import pandas as pd
import json
import re
from pydantic import BaseModel, Field, ValidationError, field_validator
from typing import List, Dict, Any
import ast

from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import Ollama



In [2]:
listings = pd.read_csv('data/data_uncategorised_Oct21_to_Jan22.csv')
#listings.head()
unique_listings=pd.DataFrame(listings['drug_title'].unique(),columns=['listing'])

  listings = pd.read_csv('data/data_uncategorised_Oct21_to_Jan22.csv')


In [16]:
def dataframe_batches(df, batch_size=10):
    column_name = df.columns[0]
    total_rows = len(df)
    results = []

    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        batch = df.iloc[start:end]
        
        batch_result = {str(element): [] for element in batch[column_name]}
        results.append(batch_result)

    return results

class ResponseChecks(BaseModel):
    data: Dict[str, List[Any]]
    
    @field_validator("data")
    def check_dict_format(cls, value):
        assert isinstance(value, dict), "Response must be a dictionary"
        assert len(value) > 0, "Dictionary must not be empty"
        for key, val in value.items():
            assert isinstance(val, list), f"Value for key '{key}' must be a list"
        return value

def process_invalid_json_response(response_text):
    """
    Process an invalid JSON response that contains unwanted text before the JSON-like structure.
    """
    match = re.search(r'\{[\s\S]*\}', response_text)
    
    if match:
        json_like_str = match.group(0)
        json_str = json_like_str.replace("'", '"')
        
        try:
            data = json.loads(json_str)
            return data
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            return None
    else:
        print("No valid JSON-like structure found in the response.")
        return None

def execute_keyword_LLM(list_of_dicts, chain):
    results = []
    error_listings = []
    total_items = len(list_of_dicts)
    
    for index, item in enumerate(list_of_dicts, 1):
        if index % 10 == 0 or index == total_items:
            print(f"Progress: {index}/{total_items} items processed")
        
        try:
            response = chain.run(test_data=item)
            
            # First, try to process as a valid JSON response
            try:
                output = ast.literal_eval(response)
            except (ValueError, SyntaxError):
                # If ast.literal_eval fails, try processing as invalid JSON
                output = process_invalid_json_response(response)
                if output is None:
                    raise ValueError("Failed to process response as JSON")
            
            # Validate the output
            ResponseChecks(data=output)
            
            # If validation passes, add to results
            for key, value in output.items():
                results.append({"key": key, "value": value})
        
        except Exception as e:
            print(f"Error processing item {index}: {str(e)}")
            error_listings.append(item)
            continue
    
    # Create DataFrame from results
    df = pd.DataFrame(results)
    
    # Print summary
    print(f"\nProcessing complete.")
    print(f"Total items: {total_items}")
    print(f"Successful: {len(list_of_dicts) - len(error_listings)}")
    print(f"Errors: {len(error_listings)}")
    
    return df, error_listings

In [25]:
#Break the data frame into batches and convert it to dict for processing
list_of_dicts=dataframe_batches(unique_listings)

In [31]:
list_of_dicts = list_of_dicts[:20]

In [23]:
# Define the input and output formats
input_format = r"""{{
  'listing text': [],
  'listing text': [],
  ...
}}"""

output_format = r"""{{
  'listing text': ['keyword1', 'keyword2', ...],
  'listing text': ['keyword1', 'keyword2', ...],
  ...
}}"""

example_input = r"""{{
  '1 Blister Codein 50mg Codeinium': [],
  '65 sample - choose what u want, mix and match': [],
  'SPECIAL PROMO: 500x S903-4mg Hulk Bars': [],
  '7G L.S.D A+++ TOP PRICES IN UK!!! NDD-FREE POSTAGE': [],
  '3.5-56G GREEN CRACKA+++ TOP PRICES IN UK!!! NDD-FREE POSTAGE': [],
  'Pure Cocaine 98% purity, 5 grams available': [],
  'Acid tabs, blotters, LSD, 25mg each': [],
  '50 grams Cannabis buds - premium quality': [],
  'Xanax bars, 10mg - pharma grade': [],
  'Variety of pills and powders, 100% satisfaction guaranteed': [],
  'BD N BSM TEST BUNDLE 1OZ -indoor only- A++': []
}}"""

example_output = r"""{{
  '1 Blister Codein 50mg Codeinium': ['Codein', 'Codeinium'],
  '65 sample - choose what u want, mix and match': ['NA'],
  'SPECIAL PROMO: 500x S903-4mg Hulk Bars': ['S903', 'Hulk Bars'],
  '7G L.S.D A+++ TOP PRICES IN UK!!! NDD-FREE POSTAGE': ['LSD'],
  '3.5-56G GREEN CRACKA+++ TOP PRICES IN UK!!! NDD-FREE POSTAGE': ['green crack'],
  'Pure Cocaine 98% purity, 5 grams available': ['Cocaine'],
  'Acid tabs, blotters, LSD, 25mg each': ['Acid', 'LSD'],
  '50 grams Cannabis buds - premium quality': ['Cannabis'],
  'Xanax bars, 10mg - pharma grade': ['Xanax'],
  'Variety of pills and powders, 100% satisfaction guaranteed': ['NA'],
  'BD N BSM TEST BUNDLE 1OZ -indoor only- A++': ['BD', 'BSM']
}}"""

# Define the template for the prompt
template = f"""Act as an NLP specialist, and your task is to extract keywords from the given substance listings scraped from the dark web. Keywords can include the main item of the listing, and/or any chemical name or compound mentioned, and/or any slang term related to illicit substances. General or common words in the listings like ‘ounces,’ ‘variety,’ ‘after,’ etc., are not considered keywords unless they are part of a recognized slang term. There could be more than 1 keyword in a given listing. 

Input format:
{input_format}

Output format:
{output_format}

Example

Input:
{example_input}

Output:
{example_output}

Now perform the task on the following listings and strictly follow the output format and task instructions given above. Remember there can be more than one keywords as well for a given listing. Please don't hallucinate and don't extract imaginary keywords. Listings data:

{{test_data}}
"""

In [36]:
#Llama 3 prompt
# Define the input and output formats
input_format = r"""{{
  'listing text': [],
  'listing text': [],
  ...
}}"""

output_format = r"""{{
  'listing text': ['keyword1', 'keyword2', ...],
  'listing text': ['keyword1', 'keyword2', ...],
  ...
}}"""

example_input = r"""{{
  '1 Blister Codein 50mg Codeinium': [],
  '65 sample - choose what u want, mix and match': [],
  'SPECIAL PROMO: 500x S903-4mg Hulk Bars': [],
  '7G L.S.D A+++ TOP PRICES IN UK!!! NDD-FREE POSTAGE': [],
  '3.5-56G GREEN CRACKA+++ TOP PRICES IN UK!!! NDD-FREE POSTAGE': [],
  'Pure Cocaine 98% purity, 5 grams available': [],
  'Acid tabs, blotters, LSD, 25mg each': [],
  '50 grams Cannabis buds - premium quality': [],
  'Xanax bars, 10mg - pharma grade': [],
  'Variety of pills and powders, 100% satisfaction guaranteed': [],
  'BD N BSM TEST BUNDLE 1OZ -indoor only- A++': []
}}"""

example_output = r"""{{
  '1 Blister Codein 50mg Codeinium': ['Codein', 'Codeinium'],
  '65 sample - choose what u want, mix and match': ['NA'],
  'SPECIAL PROMO: 500x S903-4mg Hulk Bars': ['S903', 'Hulk Bars'],
  '7G L.S.D A+++ TOP PRICES IN UK!!! NDD-FREE POSTAGE': ['LSD'],
  '3.5-56G GREEN CRACKA+++ TOP PRICES IN UK!!! NDD-FREE POSTAGE': ['green crack'],
  'Pure Cocaine 98% purity, 5 grams available': ['Cocaine'],
  'Acid tabs, blotters, LSD, 25mg each': ['Acid', 'LSD'],
  '50 grams Cannabis buds - premium quality': ['Cannabis'],
  'Xanax bars, 10mg - pharma grade': ['Xanax'],
  'Variety of pills and powders, 100% satisfaction guaranteed': ['NA'],
  'BD N BSM TEST BUNDLE 1OZ -indoor only- A++': ['BD', 'BSM']
}}"""

# Define the template for the prompt
template = f"""Extract keywords from the given substance listings scraped from the dark web. Keywords can include the main item of the listing, and/or any chemical name or compound mentioned, and/or any slang term related to illicit substances. General or common words in the listings like ‘ounces,’ ‘variety,’ ‘after,’ etc., are not considered keywords unless they are part of a recognized slang term.  

Input format:
{input_format}

Output format:
{output_format}

Example

Input:
{example_input}

Output:
{example_output}

Now perform the task on the following listings and strictly follow the output format and task instructions given above. Please don't hallucinate and don't extract imaginary keywords. Listings data:

{{test_data}}
"""

In [43]:
# Create a PromptTemplate instance
prompt = PromptTemplate(template=template, input_variables=["test_data"])

# Initialize the Ollama LLM with the correct model name
llm = Ollama(model="dolphin-llama3")

# Create the LLMChain
chain = LLMChain(llm=llm, prompt=prompt)

# Run the chain with the first element of list_of_dicts
#response = chain.run(test_data=list_of_dicts[1])
#try:
#    output=ast.literal_eval(response)
#except:
#    output=''
#    print('Invalid Response')
#print(response)

In [44]:
from datetime import datetime
tstart = datetime.now()
result_df, errors_df = execute_keyword_LLM(list_of_dicts, chain)
print (datetime.now()-tstart)


Progress: 10/20 items processed
Error processing item 14: 1 validation error for ResponseChecks
data
  Input should be a valid dictionary [type=dict_type, input_value=({'3.5-56g Incredible Bul...['ENEMY OF THE STATE']}), input_type=tuple]
    For further information visit https://errors.pydantic.dev/2.8/v/dict_type
Error decoding JSON: Expecting ':' delimiter: line 6 column 92 (char 418)
Error processing item 19: Failed to process response as JSON
Progress: 20/20 items processed

Processing complete.
Total items: 20
Successful: 18
Errors: 2
0:01:30.067192


In [41]:
result_df.head(60)

Unnamed: 0,key,value
0,1 Blister Codein 50mg Codeinium,"[Codein, Codeinium]"
1,"65 sample - choose what u want, mix and match",[NA]
2,SPECIAL PROMO: 500x S903-4mg Hulk Bars,"[S903, Hulk Bars]"
3,50 B707 BLUE BARS,"[B707, Blue Bars]"
4,TERCIAN 30x25mg,[TERCIAN]
5,TERCIAN 10x25mg,[TERCIAN]
6,500 Pack Green S903 2.5 - 3.5Mg Per Bar Extras...,"[S903, Green Bars]"
7,250 Pack Green S903 2.5 - 3.5Mg Per Bar Extras...,"[S903, Green Bars]"
8,1000 Pack Green S903 2.5 - 3.5Mg Per Bar Extra...,"[S903, Green Bars]"
9,Eszopiclone (2 mg tabs). Generics 20 tabs,[Eszopticlon]


In [42]:
result_df.tail(60)

Unnamed: 0,key,value
114,Purple Panty Dropper - Ounces - OZ - Major bul...,[Ounce]
115,100 gr CARMELO,"[CarmeLo, gram]"
116,1 oz Macmosa,"[Macmosa, ounce]"
117,aus2aus - G13 28G- ozbabe,"[G13, ice wreck, ounces, ozbabe]"
118,aus2aus - ICE WRECK 14G- ozbabe,"[Ice Wreck, grams, ounces, ozbabe]"
119,500g Star Dawg 3250 GBP,"[Star Dawg, gram, pound]"
120,aus2aus - EL PRESIDENTE 14G- ozbabe,"[EL Presidente, ice wreck, ounces, ozbabe]"
121,Blue Satellite 224g Free Shipping!!!,"[grams, free shipping]"
122,1/4 lb ICE CREAM SOCIAL,"[quarter pounder, pound, ice cream social]"
123,1/4 lb Macmosa,"[Macmosa, quarter pounder]"


In [45]:
result_df.head(60)

Unnamed: 0,key,value
0,1 Blister Codein 50mg Codeinium,"[Codein, Codeinium]"
1,"65 sample - choose what u want, mix and match",[NA]
2,SPECIAL PROMO: 500x S903-4mg Hulk Bars,"[S903, Hulk Bars]"
3,50 B707 BLUE BARS,"[B707, BLUE BARS]"
4,TERCIAN 30x25mg,[Teracin]
5,TERCIAN 10x25mg,[Teracin]
6,500 Pack Green S903 2.5 - 3.5Mg Per Bar Extras...,[Green S903]
7,250 Pack Green S903 2.5 - 3.5Mg Per Bar Extras...,[Green S903]
8,1000 Pack Green S903 2.5 - 3.5Mg Per Bar Extra...,[Green S903]
9,100 Pack Green S903 2.5 - 3.5Mg Per Bar Extras...,[Green S903]


In [46]:
result_df.tail(60)

Unnamed: 0,key,value
119,ChemDogg CD,[ChemDogg]
120,aus2aus - MARMALADE 3.5G- ozbabe,[Marmalade]
121,Blue Satellite 56g Free Shipping!!!,[Blue Satellite]
122,1/4 LB CHERRYWANNA,[CherryWanna]
123,aus2aus - ICE WRECK 7G- ozbabe,[Ice Wreck]
124,14g Gelato 41 121.1 GBP,"[Gelato, 41]"
125,aus2aus - TROPIC TRUFFLE 14G- ozbabe,[Tropic Truffle]
126,1g MOONROCK,[Moonrock]
127,28g Green Crack 220 GBP,[Green Crack]
128,3 gr CARMELO,[Carmelo]


In [12]:
import os

output_folder = 'output'
file_name = 'gemma2_10.csv'
file_path = os.path.join(output_folder, file_name)

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

if os.path.exists(file_path):
    result_df.to_csv(file_path, mode='a', header=False, index=False)
else:
    result_df.to_csv(file_path, index=False)

In [13]:
list_of_dicts=dataframe_batches(unique_listings)
list_of_dicts = list_of_dicts[501:1300]
result_df, errors_df = execute_keyword_LLM(list_of_dicts, chain)

import os

output_folder = 'output'
file_name = 'gemma2_10.csv'
file_path = os.path.join(output_folder, file_name)

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

if os.path.exists(file_path):
    result_df.to_csv(file_path, mode='a', header=False, index=False)
else:
    result_df.to_csv(file_path, index=False)

Progress: 10/799 items processed
Progress: 20/799 items processed
Progress: 30/799 items processed
Progress: 40/799 items processed
Progress: 50/799 items processed
Progress: 60/799 items processed
Progress: 70/799 items processed
Progress: 80/799 items processed
Progress: 90/799 items processed
Progress: 100/799 items processed
Progress: 110/799 items processed
Progress: 120/799 items processed
Progress: 130/799 items processed
Progress: 140/799 items processed
Progress: 150/799 items processed
Progress: 160/799 items processed
Progress: 170/799 items processed
Progress: 180/799 items processed
Progress: 190/799 items processed
Progress: 200/799 items processed
Progress: 210/799 items processed
Progress: 220/799 items processed
Progress: 230/799 items processed
Progress: 240/799 items processed
Progress: 250/799 items processed
Progress: 260/799 items processed
Progress: 270/799 items processed
Progress: 280/799 items processed
Progress: 290/799 items processed
Error decoding JSON: Ex