In [1]:
# Import relevant functionality
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.messages import HumanMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.prebuilt import create_react_agent
import os
import json

# Read the Gemini API key from the environment instead of embedding it in the
# notebook: a key stored in source is exposed to everyone who can read the file.
# NOTE(review): the key that used to be committed here should be revoked/rotated.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this notebook."
    )

# LangChain chat model: temperature 0, no explicit token or timeout limit,
# and at most two retries per request.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    api_key=api_key,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [2]:
import pandas as pd
import numpy as np

# Raise pandas' display limits (row count, column count, line width) for this session.
for option_name, limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, limit)

In [3]:
# os.listdir returns entries in arbitrary order; sorting keeps the slices taken
# from this list (sample loads, fixed-size batches) identical from run to run.
files_path = sorted(
    os.path.join('JobDetailsV2', file) for file in os.listdir("JobDetailsV2")
)
print("Dataset Count: {}".format(len(files_path)))

Dataset Count: 126


In [4]:
# Parse the first five job-detail files as a quick sample. Each file is opened
# in a `with` block so its handle is closed as soon as it has been read.
input_data = []
for file_path in files_path[:5]:
    with open(file_path, "r") as fh:
        input_data.append(json.load(fh))
# input_data[0]

In [5]:
# input_data[:5]

In [6]:
# with open('Details.txt', 'w') as f:
#     f.write(str(input_data))

In [5]:
from google import genai
from pydantic import BaseModel
from typing import List, Optional, Dict


# Schema for one extracted job-post record. `list[Person]` is passed as the
# `response_schema` of the `generate_content` request further down; the field
# meanings below mirror the descriptions given in that request's prompt.
# NOTE(review): documented with comments rather than a class docstring, since
# pydantic may copy a docstring into the generated schema -- confirm before adding one.
class Person(BaseModel):
    PersonName: str  # name of the person
    Email: str  # official email of the person; the prompt asks for 'Nan' when unavailable
    PostAgo: str  # time since the post, e.g. '6 days', '1 Week'
    Exp: str  # experience range, e.g. '0-2'
    Location: str  # geographic location
    # Phone number. Optional here (defaults to None), although the prompt asks
    # the model to use 'Nan' when it is missing.
    Phone: Optional[str] = None
    Roles: List[str]  # list of role names
    PersonCurrentCompany: str  # current company of the person who posted, or 'Nan'
    CurrentCompany: str  # described in the prompt as the current company URL, or 'Nan'
    URLs: List[str]  # list of URLs

In [6]:
# Number of job-detail file paths collected from the JobDetailsV2 directory.
len(files_path)

126

In [7]:
# Batch extraction: send the job-detail files to Gemini `window` files at a
# time and collect the structured person records the model returns.
FinalJSON = []  # one parsed response (a list of person records) per successful batch
file_count = len(files_path)
window = 10  # number of job-detail files sent to the model per request


def _load_batch(paths):
    """Parse every JSON file in `paths`, closing each file handle after reading."""
    batch = []
    for path in paths:
        with open(path, "r") as fh:
            batch.append(json.load(fh))
    return batch


def _build_prompt(batch):
    """Return the extraction prompt with `batch` embedded as JSON text.

    The batch is serialized with `json.dumps` (not the Python repr) so the
    model sees double-quoted JSON, and the example output below is itself
    valid JSON apart from the `...` placeholders.
    """
    return f"""
    Given the following input data, extract the details and return them in a standardized JSON format:
    InputData: {json.dumps(batch, ensure_ascii=False)}
    Expected output format:
    [
    {{
        "PersonName": "Name of the person",
        "Email": "Official email of person, or 'Nan' if not available ( @officialemail)",
        "PostAgo": "Time since post (e.g., '6 days', '1 Week')",
        "Exp": "Experience range (e.g., '0-2')",
        "Location": "Geographic location",
        "Phone": "Phone number or 'Nan' if not available",
        "Roles": ["Role 1", "Role 2", ...],
        "PersonCurrentCompany": "Current Company of Person Who posted or 'Nan' if not available",
        "CurrentCompany": "Current Company URL or 'Nan' if not available",
        "URLs": ["URL 1", "URL 2", ...]
    }},
    {{
        "PersonName": "Name of the person",
        "Email": "Official email of person, or 'Nan' if not available ( @officialemail)",
        "PostAgo": "Time since post (e.g., '6 days', '1 Week')",
        "Exp": "Experience range (e.g., '0-2')",
        "Location": "Geographic location",
        "Phone": "Phone number or 'Nan' if not available",
        "Roles": ["Role 1", "Role 2", ...],
        "PersonCurrentCompany": "Current Company of Person Who posted or 'Nan' if not available",
        "CurrentCompany": "Current Company URL or 'Nan' if not available",
        "URLs": ["URL 1", "URL 2", ...]
    }}
    ]

    Note: 
    - Maintain consistent formatting for all fields
    - Use "Nan" for missing values
    - Ensure all URLs are properly formatted
    - Handle special characters in names and other fields
    - Use double quotes for both keys and values in the JSON output
    """


# Build the client once and reuse it for every batch. The key comes from the
# `api_key` variable defined in the setup cell instead of a second literal here.
client = genai.Client(api_key=api_key)

for curr_idx in range(0, file_count, window):
    input_data = _load_batch(files_path[curr_idx: curr_idx + window])
    prompt = _build_prompt(input_data)

    response = client.models.generate_content(
        model='gemini-2.0-flash',
        contents=prompt,
        config={
            'response_mime_type': 'application/json',
            'response_schema': list[Person],
        },
    )

    try:
        Json_data = json.loads(response.text)
    except (json.JSONDecodeError, TypeError):
        # Unparseable (or missing) response text: keep the raw text on disk for
        # inspection and move on to the next batch. Only parsing errors are
        # caught, so Ctrl-C and genuine bugs still surface.
        with open(f"{curr_idx}_{curr_idx + window}.txt", "w") as f:
            f.write(response.text or "")
        print("Failed: Itr [{}:{}] : Text Data not Loaded as JSON".format(curr_idx, curr_idx+window))
        continue

    print("Success: Itr [{}:{}] : Text Data Loaded as JSON".format(curr_idx, curr_idx+window))
    # Appended as one list per batch; a later cell flattens FinalJSON.
    FinalJSON.append(Json_data)
    print("Success: Itr [{}:{}] : JSON Data Appended Into all Post".format(curr_idx, curr_idx+window))


Success: Itr [0:10] : Text Data Loaded as JSON
Success: Itr [0:10] : JSON Data Appended Into all Post
Success: Itr [10:20] : Text Data Loaded as JSON
Success: Itr [10:20] : JSON Data Appended Into all Post
Success: Itr [20:30] : Text Data Loaded as JSON
Success: Itr [20:30] : JSON Data Appended Into all Post
Success: Itr [30:40] : Text Data Loaded as JSON
Success: Itr [30:40] : JSON Data Appended Into all Post
Success: Itr [40:50] : Text Data Loaded as JSON
Success: Itr [40:50] : JSON Data Appended Into all Post
Success: Itr [50:60] : Text Data Loaded as JSON
Success: Itr [50:60] : JSON Data Appended Into all Post
Success: Itr [60:70] : Text Data Loaded as JSON
Success: Itr [60:70] : JSON Data Appended Into all Post
Success: Itr [70:80] : Text Data Loaded as JSON
Success: Itr [70:80] : JSON Data Appended Into all Post
Success: Itr [80:90] : Text Data Loaded as JSON
Success: Itr [80:90] : JSON Data Appended Into all Post
Success: Itr [90:100] : Text Data Loaded as JSON
Success: Itr [90:

In [11]:
# Flatten the per-batch result lists in FinalJSON into a single list of records.
# (The earlier identity `map` only made a redundant copy, so it is dropped.)
FlattenJSONList = [record for batch_records in FinalJSON for record in batch_records]
# FlattenJSONList

In [12]:
# One row per extracted record; the trailing expression shows (rows, columns).
df = pd.DataFrame(data=FlattenJSONList)
df.shape

(126, 10)

In [16]:
# Write every extracted record to CSV.
df.to_csv(path_or_buf="MayFistWeekv2.csv")

In [13]:
# Shape of the subset whose Email is not the 'Nan' placeholder.
df.loc[df['Email'].ne('Nan')].shape

(48, 10)

In [14]:
# df[df['Email'] != 'Nan']

In [15]:
# Write only the records whose Email is not the 'Nan' placeholder.
df.loc[df['Email'].ne('Nan')].to_csv("MayFistWeek.csv")

In [1]:
# Profile/role label. NOTE(review): its consumer is not visible in this part of
# the notebook, and the recorded output does not match this assignment -- the
# cell looks truncated; confirm what follows.
ProfileName = "Data Scientist"


'DataScientist'