In [1]:
import os
import re
import json
import time
import logging
import requests
from tqdm import tqdm 
from rich import print
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from datetime import datetime
load_dotenv()

# Connection string and API keys, read from the process environment
# (load_dotenv() above has already merged in any local .env file).
MONGO_URI = os.environ.get("MONGO_URI")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
SERPAPI_API_KEY = os.environ.get("SERPAPI_API_KEY")
G2_API_KEY = os.environ.get("G2_API_KEY")

# Route INFO-and-above log records to a file, one timestamped line per record.
logging.basicConfig(
    filename='processing_software_suggest.log',
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

In [2]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain_openai import ChatOpenAI
# Chat model used by the LLM helper functions; temperature is pinned to 0.
chat = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)

# Maps field names of the scraped JSON records to the field names stored in MongoDB.
# NOTE(review): 'price' is stored as 'rating' and the link key is spelled
# 'scarpedLink' — both are kept exactly as-is because they define the stored schema.
key_mapping = {
    "title": "productName",
    "description": "description",
    "price": "rating",
    "image_url": "photoUrl",
    "link": "scarpedLink",
    "additional_info": "additionalInfo",
    "website": "website",
    "reviews": "reviews",
}

# Closed set of category names the LLM is allowed to pick from.
category_list = [
    "Sales Tools",
    "Marketing",
    "Analytics Tools & Software",
    "Artificial Intelligence",
    "AR/VR",
    "B2B Marketplaces",
    "Business Services",
    "CAD & PLM",
    "Collaboration & Productivity",
    "Commerce",
    "Content Management",
    "Converged Infrastructure",
    "Customer Service",
    "Data Privacy",
    "Design",
    "Development",
    "Digital Advertising Tech",
    "Ecosystem Service Providers",
    "ERP",
    "Governance, Risk & Compliance",
    "Greentech",
    "Hosting",
    "HR",
    "IoT Management",
    "IT Infrastructure",
    "IT Management",
    "Marketing Services",
    "Marketplace Apps",
    "Office",
    "Other Services",
    "Professional Services",
    "Routers",
    "Security",
]

# MongoDB client pinned to Server API version 1, and the database used throughout.
client = MongoClient(MONGO_URI, server_api=ServerApi('1'))
db = client["g2"]

# Load the scraped product records and report how many there are.
with open("./software_suggest_products.json", encoding="utf-8") as product_chunk:
    data = json.load(product_chunk)
print(len(data))

  return rust_x509.load_der_x509_certificate(data)


In [3]:
def get_categories(transformed_data):
    """Ask the chat model which entries of ``category_list`` fit a product.

    Args:
        transformed_data: Product record (dict). Its ``productName`` and
            ``description`` entries are read; both must be present and
            non-empty for the model to be queried.

    Returns:
        list[str]: Categories named in the model's reply that also appear in
        the module-level ``category_list``. An empty list is returned when the
        name or description is missing, when the reply contains no
        ``['...']``-style list, or when none of the picks is a known category.
    """
    product_name = transformed_data.get('productName')
    description = transformed_data.get('description')
    if not (product_name and description):
        logging.error("productName or description missing; cannot pick categories")
        return []

    messages = [
        SystemMessage(
            content="You are an expert at identifying which software belongs to which categories"
        ),
        HumanMessage(
            content=f"""
                Given the following product : {product_name}
                and the product description : {description}
                and the following categories to pick from : {category_list}
                
                which category does the product belong to
                
                Only respond with a list of categories the product belongs to
                """
        ),
    ]
    response_content = chat.invoke(messages).content

    # Take the first Python-style list of single-quoted strings in the reply.
    list_match = re.search(r"\['(.*?)'\]", response_content, re.DOTALL)
    if list_match is None:
        logging.error("no category picks from the LLM")
        return []

    # Split on the quote-comma-quote separator, tolerating any whitespace
    # around the comma ("'A', 'B'" as well as "'A','B'"). Commas inside a
    # category name are not preceded/followed by quotes, so they stay intact.
    picked = re.split(r"'\s*,\s*'", list_match.group(1))

    # Keep only picks from the allowed set; always a list, never None.
    return [cat for cat in picked if cat in category_list]

def get_mode(transformed_data):
    """Ask the chat model whether a product is B2B, B2C, or both.

    Args:
        transformed_data: Product record (dict). Its ``productName`` and
            ``description`` entries are read; both must be present and
            non-empty for the model to be queried.

    Returns:
        list[str]: The entries of the first ``['...']``-style list found in
        the model's reply, unfiltered. An empty list is returned when the name
        or description is missing or when the reply contains no such list.
    """
    mode_list = ["B2B" , "B2C"]

    product_name = transformed_data.get('productName')
    description = transformed_data.get('description')
    if not (product_name and description):
        logging.error("productName or description missing; cannot pick business model")
        return []

    messages = [
        SystemMessage(
            content="You are an expert at identifying which software belongs to which type of business model"
        ),
        HumanMessage(
            content=f"""
                Given the following product : {product_name}
                and the product description : {description}
                and the following categories to pick from : {mode_list}
                
                which model does the product belong to
                
                Only respond with a list of categories the product belongs to
                
                such as  ['B2B'] or ['B2C'] or ['B2B','B2C'] pick one
                """
        ),
    ]
    response_content = chat.invoke(messages).content

    # Take the first Python-style list of single-quoted strings in the reply.
    list_match = re.search(r"\['(.*?)'\]", response_content, re.DOTALL)
    if list_match is None:
        logging.error("no category picks from the LLM")
        return []

    # The prompt's own example is "['B2B','B2C']" (no space after the comma),
    # so the separator must be matched with optional whitespace.
    return re.split(r"'\s*,\s*'", list_match.group(1))

def write_description(transformed_data):
    """Have the chat model rewrite a product's description for customers.

    Args:
        transformed_data: Product record (dict). ``productName`` and
            ``description`` must both be present and non-empty; only the
            description text is sent to the model.

    Returns:
        str | None: The model's reply text, or ``None`` when the name or
        description is missing (no model call is made in that case).
    """
    product_name = transformed_data.get('productName')
    description = transformed_data.get('description')
    if not (product_name and description):
        return None

    messages = [
        SystemMessage(
            content="You are an expert at software products"
        ),
        HumanMessage(
            content=f"""
                referring to this description of the software product {description}
                write a detailed description about it possible for customers to understand what it is about the software product not more than 150 words
                """
        ),
    ]
    return chat.invoke(messages).content

def g2_product_search(transformed_data):
    """Look a product up in the G2 API and cache any matches in MongoDB.

    Every G2 record returned for the product name that is not yet in the
    ``g2_products`` collection (matched on its ``id``) is inserted there,
    tagged with ``associatedProductName``.

    Args:
        transformed_data: Product record (dict); its ``productName`` is used
            as the G2 name filter.

    Returns:
        bool: ``True`` if G2 reports at least one matching record. ``False``
        if it reports none, if the product has no name, or if the request or
        the response payload could not be processed (the error is logged).
    """
    product_name = transformed_data.get('productName')
    if not product_name:
        logging.error("Cannot search G2: product has no productName")
        return False

    try:
        # Pass the name via `params` so characters such as '&', '#' or spaces
        # are URL-encoded instead of corrupting the query string.
        response = requests.get(
            "https://data.g2.com/api/v1/products",
            params={"filter[name]": product_name},
            headers={'Authorization': f'Bearer {G2_API_KEY}'},
            timeout=30,  # never let one stalled request hang the whole run
        )
        response.raise_for_status()
        response_data = response.json()
        record_count = response_data["meta"]["record_count"]
    except requests.RequestException as e:
        logging.error("Error making API request: %s", e)
        return False
    except (ValueError, KeyError, TypeError) as e:
        # Non-JSON body or a payload without meta.record_count.
        logging.error("Unexpected G2 response for %r: %s", product_name, e)
        return False

    if record_count <= 0:
        return False

    g2_collection = db['g2_products']
    for g2_product in response_data.get("data", []):
        # Insert only G2 records whose "id" is not stored yet.
        if not g2_collection.find_one({"id": g2_product["id"]}):
            g2_product["associatedProductName"] = product_name
            g2_collection.insert_one(g2_product)

    return True

In [6]:

def _call_with_retries(func, transformed_data, label, max_attempts=5, delay_seconds=3):
    """Call ``func(transformed_data)``, retrying on any exception.

    Returns the function's result, or an empty list once ``max_attempts``
    attempts have all failed, so a permanent error cannot stall the run.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            return func(transformed_data)
        except Exception as e:
            logging.error(f"Error getting {label}: {e}")
            if attempt < max_attempts:
                logging.info(f"Retrying to get {label}...")
                time.sleep(delay_seconds)
    logging.error(f"Giving up on {label} after {max_attempts} attempts")
    return []


def process_data(original_data):
    """Transform one scraped product record, enrich it, and store it in MongoDB.

    Steps:
      1. Rename fields according to ``key_mapping`` and add default fields.
      2. Reduce each review to its ``content``.
      3. Ask the LLM for categories (bounded retries).
      4. If the product is found in G2, store it in ``scraped_products_2``.
         Otherwise also ask the LLM for business models and a rewritten
         description, then store it in both ``scraped_products_2`` and
         ``filtered_products_2``.

    Args:
        original_data: One raw product record (dict) from the scraped JSON.

    Returns:
        None. Results are written to MongoDB and to the log file.
    """
    transformed_data = {
        new_key: original_data[original_key]
        for original_key, new_key in key_mapping.items()
        if original_key in original_data
    }

    # Fields that are not available from the scrape yet.
    transformed_data['similarProducts'] = []
    transformed_data['contactMail'] = None

    # Keep only the text of each review; missing or malformed reviews become [].
    try:
        transformed_data['reviews'] = [
            {'content': review['content']} for review in transformed_data['reviews']
        ]
    except (KeyError, TypeError):
        transformed_data['reviews'] = []

    transformed_data['category'] = _call_with_retries(
        get_categories, transformed_data, "categories"
    )

    # Values used in the summary log line.
    product_name = transformed_data.get('productName', 'Unknown Product')
    description = transformed_data.get('description', 'No description')
    categories = transformed_data.get('category', [])

    if g2_product_search(transformed_data):
        db['scraped_products_2'].insert_one(transformed_data)
        log_message = f"Processing Product: {product_name} | Description: {description} | Categories: {categories} | In G2: {True}"
    else:
        transformed_data['business_models'] = _call_with_retries(
            get_mode, transformed_data, "business models"
        )
        # Replaces any scraped additional_info with the LLM-written description.
        transformed_data["additionalInfo"] = write_description(transformed_data)
        logging.info(f"Product '{product_name}' not found in G2")
        # Products absent from G2 are stored in both collections.
        db['scraped_products_2'].insert_one(transformed_data)
        db['filtered_products_2'].insert_one(transformed_data)
        log_message = f"Processing Product: {product_name} | Description: {description} | Categories: {categories} | In G2: {False}"
    logging.info(log_message)

In [8]:
# #Process individual data from each product
# original_data = data[2]
# print(original_data)
# process_data(original_data)

# Index of the first record to process; earlier records are skipped.
# NOTE(review): presumably the first 10 were handled in an earlier run — confirm,
# and set this to 0 to process the whole file.
START_INDEX = 10

for product_data in tqdm(data[START_INDEX:], desc="Processing Products", unit="products"):
    process_data(product_data)

Processing Products:   0%|          | 16/16623 [00:48<13:40:32,  2.96s/products]

In [None]:
# Preview the records at indices 0-9 of the loaded product list.
print(data[:10])

In [None]:
# """Code to delete everything in the database"""

# collection_names = ['scraped_products', 'filtered_products', 'g2_products']

# # Loop through each collection and delete all documents
# for collection_name in collection_names:
#     collection = db[collection_name]
#     result = collection.delete_many({})  # Delete all documents in the collection
#     print(f"Deleted {result.deleted_count} documents from '{collection_name}' collection")


In [None]:
# from pymongo import MongoClient

# # List of collection names to process
# collection_names = ['scraped_products', 'filtered_products', 'g2_products']

# def convert_rating_to_float(collection_name):
#     collection = db[collection_name]

#     # Iterate over each document in the collection
#     for document in collection.find():
#         # Check if the document contains a 'rating' field
#         if 'rating' in document:
#             try:
#                 # Convert 'rating' from string to float
#                 if document["rating"] == None:
#                     rating_float = float(-1)
#                 else:
#                     rating_float = float(document['rating'])
                
#                 # Update the document with the converted rating
#                 collection.update_one(
#                     {'_id': document['_id']},
#                     {'$set': {'rating': rating_float}}
#                 )
#             except ValueError:
#                 # Handle conversion error if 'rating' is not a valid float
#                 print(f"Error converting rating to float for document {_id} in {collection_name}")

# # Process each collection
# for collection_name in collection_names:
#     convert_rating_to_float(collection_name)

# # Close the MongoDB client connection
# # client.close()


In [1]:
!pip install leptonai



In [10]:
import os
from leptonai.client import Client

api_token = os.environ.get('LEPTON_API_TOKEN')

# Use a dedicated name here: `client` is already bound to the MongoDB client
# earlier in the notebook, and rebinding it would silently shadow that object.
lepton_client = Client("e6a7hrns", "search", token=api_token)

result = lepton_client.query(
    query="what is the name of googles parent company",
    search_uuid="1214124",
)

# The query result is decoded from UTF-8 bytes to text before printing/parsing.
result_str = result.decode("utf-8")
print(result_str)

def parse_lepton_response(response_string):
    """Extract the LLM answer section from a Lepton search response string.

    The answer is the text after the ``__LLM_RESPONSE__`` marker, up to the
    next ``__RELATED_QUESTIONS__`` marker, with surrounding whitespace removed.

    Args:
        response_string: The decoded response text.

    Returns:
        str: The answer text. If the ``__RELATED_QUESTIONS__`` marker is
        absent, everything after ``__LLM_RESPONSE__`` is returned. If the
        ``__LLM_RESPONSE__`` marker is absent, an empty string is returned.
    """
    start_marker = "__LLM_RESPONSE__"
    end_marker = "__RELATED_QUESTIONS__"

    marker_pos = response_string.find(start_marker)
    if marker_pos == -1:
        # str.find signals "not found" with -1; slicing from that offset
        # would return an arbitrary piece of the response.
        return ""
    answer_start = marker_pos + len(start_marker)

    # Look for the end marker only after the answer begins.
    answer_end = response_string.find(end_marker, answer_start)
    if answer_end == -1:
        # No trailing section: keep everything to the end of the string
        # (a raw -1 slice bound would drop the last character).
        answer_end = len(response_string)

    return response_string[answer_start:answer_end].strip()
  
# Print only the LLM answer portion extracted from the decoded response text.
print(parse_lepton_response(result_str))

[{"id": "https://api.bing.microsoft.com/api/v7/#WebPages.0", "name": "7 Companies Owned by Google's Parent Company Alphabet (GOOGL),", "url": "https://www.investopedia.com/investing/companies-owned-by-google/", "datePublished": "2024-02-03T00:00:00.0000000", "datePublishedDisplayText": "Feb 3, 2024", "isFamilyFriendly": true, "displayUrl": "https://www.investopedia.com/investing/companies-owned-by-google", "snippet": "Alphabet, Google's parent company, is a tech giant with a $1.78 trillion market cap. ... In 2018, Google retired the DoubleClick brand name and folded it into Google's AdWords brand, making Google ...", "dateLastCrawled": "2024-04-10T15:20:00.0000000Z", "cachedPageUrl": "http://cc.bingj.com/cache.aspx?q=what+is+the+name+of+googles+parent+company&d=4819670951404599&mkt=en-US&setlang=en-US&w=p6NRPaSNG6uPu50QWRwF8A1MI3ahCPiX", "language": "en", "isNavigational": true, "noCache": false}, {"id": "https://api.bing.microsoft.com/api/v7/#WebPages.1", "contractualRules": [{"_type"

AttributeError: 'bytes' object has no attribute 'to_dict'