In [3]:
import os
import re
import json
import logging
import requests
from tqdm import tqdm 
from rich import print
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from datetime import datetime
load_dotenv()

# Connection string and API keys are read from the environment (load_dotenv()
# above pulls them in from a .env file); each is None when its variable is unset.
MONGO_URI = os.environ.get("MONGO_URI")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
SERPAPI_API_KEY = os.environ.get("SERPAPI_API_KEY")
G2_API_KEY = os.environ.get("G2_API_KEY")

# Route INFO-and-above log records to processing.log with a second-resolution timestamp.
logging.basicConfig(
    filename='processing.log',
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

In [4]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain_openai import ChatOpenAI
# Single chat-model client reused by get_categories() and write_description().
# No model name is passed, so the library default is used; temperature=0 is
# presumably chosen to keep responses as repeatable as possible.
chat = ChatOpenAI(temperature=0 , openai_api_key=OPENAI_API_KEY)

# Scraped-record key -> key used in the stored document. Only keys that are
# present in a scraped record are copied over (see process_data).
key_mapping = {
    "title": "productName",
    "description": "description",
    # NOTE(review): the scraped 'price' value is stored as 'rating' — confirm
    # that the scraper really puts the rating in its 'price' field.
    "price": "rating",
    "image_url": "photoUrl",
    # NOTE(review): 'scarpedLink' looks like a misspelling of 'scrapedLink'; it is
    # the stored field name, so renaming it would need a data migration.
    "link": "scarpedLink",
    "additional_info": "additionalInfo",
    "website": "website",
    "reviews": "reviews",
}

# Closed set of category names offered to the LLM when classifying a product.
category_list = [
    "Sales Tools",
    "Marketing",
    "Analytics Tools & Software",
    "Artificial Intelligence",
    "AR/VR",
    "B2B Marketplaces",
    "Business Services",
    "CAD & PLM",
    "Collaboration & Productivity",
    "Commerce",
    "Content Management",
    "Converged Infrastructure",
    "Customer Service",
    "Data Privacy",
    "Design",
    "Development",
    "Digital Advertising Tech",
    "Ecosystem Service Providers",
    "ERP",
    "Governance, Risk & Compliance",
    "Greentech",
    "Hosting",
    "HR",
    "IoT Management",
    "IT Infrastructure",
    "IT Management",
    "Marketing Services",
    "Marketplace Apps",
    "Office",
    "Other Services",
    "Professional Services",
    "Routers",
    "Security",
]

# Connect to MongoDB at MONGO_URI, requesting server API version '1'.
client = MongoClient(MONGO_URI, server_api=ServerApi('1'))
# Every collection accessed in the visible cells (g2_products, scraped_products,
# filtered_products) is reached through this "g2" database handle.
db = client["g2"]

# Load the chunk of scraped products that the processing loop iterates over.
# Assumes the JSON top level is a list of product dicts — TODO confirm.
with open("./products_chunk_1.json" , encoding="utf-8") as product_chunk:
    data = json.load(product_chunk)
# Show how many records were loaded.
print(len(data))

  return rust_x509.load_der_x509_certificate(data)


In [None]:
def _match_known_categories(response_text, allowed):
    """Return the names from `allowed` that appear in `response_text`, in order of first appearance."""
    if not response_text or not allowed:
        return []

    # If the reply contains a [...] list, look only inside it so surrounding
    # prose cannot contribute matches; otherwise scan the whole reply
    # (bulleted or comma-separated answers).
    bracketed = re.search(r"\[(.*?)\]", response_text, re.DOTALL)
    haystack = bracketed.group(1) if bracketed else response_text

    canonical = {name.lower(): name for name in allowed}
    # Longest names first, so "Marketing Services" is not reported as "Marketing".
    alternatives = "|".join(
        re.escape(name) for name in sorted(allowed, key=len, reverse=True)
    )
    # The look-arounds stop short names such as "HR" from matching inside other words.
    finder = re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)", re.IGNORECASE)

    found = (canonical.get(hit.lower()) for hit in finder.findall(haystack))
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(name for name in found if name))


def get_categories(transformed_data):
    """Ask the LLM which entries of `category_list` a product belongs to.

    Args:
        transformed_data: product dict; 'productName' and 'description' are
            read and both must be non-empty for the LLM to be called.

    Returns:
        A list of category names, each spelled exactly as in `category_list`,
        without duplicates. Empty when the name or description is missing, or
        when the reply names no known category.

    The reply is matched against the known category names rather than parsed
    as a Python literal, so single-quoted, double-quoted, multi-line and
    bulleted replies are all accepted and names outside `category_list` are
    dropped.
    """
    product_name = transformed_data.get('productName')
    description = transformed_data.get('description')
    if not (product_name and description):
        print("cannot generate category list: product name or description is missing")
        return []

    messages = [
        SystemMessage(
            content="You are an expert at identifying which software belongs to which categories"
        ),
        HumanMessage(
            content=f"""
                Given the following product : {product_name}
                and the product description : {description}
                and the following categories to pick from : {category_list}
                
                which category does the product belong to
                
                Only respond with a list of categories the product belongs to
                """
        ),
    ]

    response_content = chat.invoke(messages).content
    categories = _match_known_categories(response_content, category_list)
    if not categories:
        print("no category picks from the LLM")
    return categories
    
def write_description(transformed_data):
    """Ask the LLM to write a detailed, customer-facing product description.

    Args:
        transformed_data: product dict; 'productName', 'description' and
            'additionalInfo' are read and all three must be non-empty.

    Returns:
        The LLM's reply text, or None (without calling the LLM) when any of
        the three fields is missing or empty.
    """
    product_name = transformed_data.get('productName')
    description = transformed_data.get('description')
    additional_info = transformed_data.get('additionalInfo')
    if not (product_name and description and additional_info):
        return None

    messages = [
        SystemMessage(
            content="You are an expert at software products"
        ),
        HumanMessage(
            # The instruction sentence previously lacked its object
            # ("write a should be ..."), so the model was never told what to write.
            content=f"""
                referring to this description of the software product {description} and {additional_info}
                write a description that is as detailed as possible so customers can understand what the software product is about
                """
        ),
    ]
    return chat.invoke(messages).content
    
def g2_product_search(transformed_data, timeout=30):
    """Look a product up on G2 by name and cache any matches in MongoDB.

    Args:
        transformed_data: product dict; only 'productName' is read.
        timeout: seconds to wait for the G2 API before giving up.

    Returns:
        True when G2 reports at least one record for the name. Every returned
        record whose "id" is not yet in the 'g2_products' collection is
        inserted, tagged with 'associatedProductName'.
        False when there is no product name, G2 reports no records, or the
        request fails (connection error, timeout, HTTP error status, or a
        body that is not valid JSON).
    """
    product_name = transformed_data.get('productName')
    if not product_name:
        return False

    try:
        # Pass the name through `params` so characters such as '&', '#' or
        # spaces are URL-encoded instead of corrupting the query string.
        response = requests.get(
            "https://data.g2.com/api/v1/products",
            params={"filter[name]": product_name},
            headers={'Authorization': f'Bearer {G2_API_KEY}'},
            timeout=timeout,
        )
        # An error status carries no usable "meta" block; treat it as a failed request.
        response.raise_for_status()
        response_data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose decode
        # error is not a RequestException.
        print("Error making API request:", e)
        return False

    if not isinstance(response_data, dict):
        return False

    record_count = (response_data.get("meta") or {}).get("record_count") or 0
    if record_count <= 0:
        return False

    g2_collection = db['g2_products']
    for g2_product in response_data.get("data") or []:
        # Skip records that an earlier lookup has already stored.
        if not g2_collection.find_one({"id": g2_product["id"]}):
            g2_product["associatedProductName"] = product_name
            g2_collection.insert_one(g2_product)
    return True

In [None]:
def process_data(original_data):
    """Transform one scraped product, enrich it via the LLM and store it.

    Steps:
      1. Rename the scraped keys according to `key_mapping` (absent keys are skipped).
      2. Add the empty 'similarProducts' / 'contactMail' placeholders and
         reduce each review to its 'content'.
      3. Ask the LLM for categories and for a rewritten description.
      4. Insert the document into 'scraped_products'; when the product is not
         found on G2, also insert it into 'filtered_products'.
      5. Log the outcome.

    Args:
        original_data: one scraped product record (a dict).

    Returns:
        None.
    """
    transformed_data = {
        new_key: original_data[original_key]
        for original_key, new_key in key_mapping.items()
        if original_key in original_data
    }

    # Placeholders: no similar-product URLs or contact e-mail are available yet.
    transformed_data['similarProducts'] = []
    transformed_data['contactMail'] = None
    # Keep only the review text. A record without reviews, or a review without
    # 'content', is tolerated instead of raising KeyError.
    transformed_data['reviews'] = [
        {'content': review['content']}
        for review in transformed_data.get('reviews') or []
        if isinstance(review, dict) and 'content' in review
    ]

    transformed_data['category'] = get_categories(transformed_data)
    # write_description returns None when it lacks input; keep the scraped
    # additionalInfo in that case instead of overwriting it with None.
    generated_description = write_description(transformed_data)
    transformed_data['additionalInfo'] = (
        generated_description or transformed_data.get('additionalInfo')
    )

    product_name = transformed_data.get('productName', 'Unknown Product')
    description = transformed_data.get('description', 'No description')
    categories = transformed_data.get('category', [])

    in_g2 = bool(g2_product_search(transformed_data))
    if not in_g2:
        logging.info(f"Product '{product_name}' not found in G2")

    # NOTE(review): the same dict object is passed to both inserts; presumably
    # both stored documents then share one _id — confirm this is intended.
    db['scraped_products'].insert_one(transformed_data)
    if not in_g2:
        db['filtered_products'].insert_one(transformed_data)

    logging.info(
        f"Processing Product: {product_name} | Description: {description} | Categories: {categories} | In G2: {in_g2}"
    )

In [None]:
# Debug aid: uncomment the three lines below to run the pipeline on a single
# product before processing the whole file.
# original_data = data[2]
# print(original_data)
# process_data(original_data)

# Run the pipeline over every loaded product, with a tqdm progress bar.
# NOTE(review): process_data inserts into MongoDB on each call, so re-running
# this cell presumably stores the same products again — confirm before re-executing.
for product_data in tqdm(data, desc="Processing Products", unit="products"):
    process_data(product_data)

In [None]:
# """Code to delete everything in the database"""

# collection_names = ['scraped_products', 'filtered_products', 'g2_products']

# # Loop through each collection and delete all documents
# for collection_name in collection_names:
#     collection = db[collection_name]
#     result = collection.delete_many({})  # Delete all documents in the collection
#     print(f"Deleted {result.deleted_count} documents from '{collection_name}' collection")


In [6]:
# from pymongo import MongoClient

# # List of collection names to process
# collection_names = ['scraped_products', 'filtered_products', 'g2_products']

# def convert_rating_to_float(collection_name):
#     collection = db[collection_name]

#     # Iterate over each document in the collection
#     for document in collection.find():
#         # Check if the document contains a 'rating' field
#         if 'rating' in document:
#             try:
#                 # Convert 'rating' from string to float
#                 if document["rating"] == None:
#                     rating_float = float(-1)
#                 else:
#                     rating_float = float(document['rating'])
                
#                 # Update the document with the converted rating
#                 collection.update_one(
#                     {'_id': document['_id']},
#                     {'$set': {'rating': rating_float}}
#                 )
#             except ValueError:
#                 # Handle conversion error if 'rating' is not a valid float
#                 print(f"Error converting rating to float for document {_id} in {collection_name}")

# # Process each collection
# for collection_name in collection_names:
#     convert_rating_to_float(collection_name)

# # Close the MongoDB client connection
# # client.close()
