In [1]:
# Step 1: LangChain and OpenAI Integration

# Import necessary libraries
import os
import json
from langchain_openai import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

# Initialize the chat model used by the chain built in the next cell.
# temperature=0 is passed to minimise sampling randomness in the responses.
# NOTE(review): ChatOpenAI is imported from `langchain.chat_models` above; the
# `warn_deprecated(` line in this cell's output presumably comes from that
# import path -- consider importing it from `langchain_openai` instead (confirm
# against the installed LangChain version).
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

  warn_deprecated(


In [9]:
# Fields the model is asked to return for every name.
response_schemas = [
    ResponseSchema(
        name="nationality_origin",
        description="The likely nationality origin of the name",
    ),
    ResponseSchema(
        name="meaning",
        description="The meaning or definition of the name. Include all key concepts of the meaning (but NOT the history), and at least a short paragraph of meaning explanation",
    ),
]

# Parser built from the schemas; it also supplies the format instructions
# that are injected into the prompt below.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Prompt with a single runtime variable ({name}); {format_instructions} is
# pre-filled from the parser so callers only pass the name.
prompt = PromptTemplate(
    template="""Provide information about the baby name {name}. Include:
1) Likely nationality origin
2) Name meaning or definition

{format_instructions}""",
    input_variables=["name"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()},
)

# Chain that renders the prompt and sends it to the chat model.
name_info_chain = LLMChain(prompt=prompt, llm=llm)

# Look up one name through the chain and return the parsed answer as JSON text
def get_name_info(name):
    """Query the name-info chain for ``name`` and return the result as a JSON string.

    Args:
        name: The baby name to look up; passed as the chain's single input.

    Returns:
        str: The parsed response serialised with ``json.dumps(..., indent=2)``.

    Note:
        Errors raised by the chain call or by ``output_parser.parse`` are not
        caught here and propagate to the caller.
    """
    raw_response = name_info_chain.run(name)
    return json.dumps(output_parser.parse(raw_response), indent=2)

# Test the chain on a single name and print the JSON it returns.
# get_name_info runs the chain, so executing this cell queries the model.
test_name = "Emma"
result = get_name_info(test_name)
print(result)

{
  "nationality_origin": "English, German",
  "meaning": "The name Emma is derived from the Germanic word 'ermen,' which means 'whole' or 'universal.' It signifies completeness and is often associated with strength and beauty. In modern usage, Emma is a popular name that conveys a sense of elegance and charm. The name has been widely embraced across various cultures and languages, making it a timeless choice for many parents."
}


In [11]:
import pandas as pd

def process_baby_names_data(input_file='./data/all-names.csv', output_file='./data/all-names-enriched.csv'):
    """Copy the names CSV to a new file with blank ``nationality`` and ``meaning`` columns.

    Args:
        input_file: Path of the CSV to read.
        output_file: Path of the CSV to write (header included, no index column).

    Returns:
        dict: ``{'total_names': <number of rows read>}``.
    """
    names = pd.read_csv(input_file)

    # Placeholder columns, written out as empty fields
    for placeholder in ('nationality', 'meaning'):
        names[placeholder] = ''

    # Clear any name on the column index before writing the header row
    names.columns.name = None

    names.to_csv(output_file, index=False)
    print(f"Processed data written to {output_file}")

    return {'total_names': len(names)}

# Example usage: build the enriched CSV from the default input path and
# show the row count returned by the function.
summary = process_baby_names_data()
print("Data Summary:", summary)

Processed data written to ./data/all-names-enriched.csv
Data Summary: {'total_names': 97697}


In [15]:
import pandas as pd

def filter_rare_names(file_path='./data/all-names-enriched.csv', min_occurrences=10, write_back=False):
    """Count (and optionally drop) names whose ``n_sum`` does not exceed a threshold.

    Args:
        file_path: CSV file to read; it must contain an ``n_sum`` column.
        min_occurrences: Rows are kept only when ``n_sum`` is strictly greater
            than this value, i.e. names with ``min_occurrences`` or fewer
            occurrences are treated as rare.
        write_back: When True, overwrite ``file_path`` with the filtered rows.
            Defaults to False (dry run), so the file is left untouched unless
            the caller opts in.

    Returns:
        dict: ``original_count``, ``filtered_count``, ``removed_count`` and
        ``removal_percentage`` (0.0 when the file has no rows).
    """
    df = pd.read_csv(file_path)
    original_count = len(df)

    # Strict comparison: a name with exactly `min_occurrences` is removed
    df_filtered = df[df['n_sum'] > min_occurrences]
    filtered_count = len(df_filtered)
    removed_count = original_count - filtered_count

    # Only claim the file was rewritten when it actually was
    if write_back:
        df_filtered.to_csv(file_path, index=False)
        print(f"Filtered data written back to {file_path}")
    else:
        print(f"Dry run: {file_path} was not modified (pass write_back=True to overwrite it)")
    print(f"Names removed: {removed_count}")
    print(f"Names remaining: {filtered_count}")

    # Guard against an empty input file, which would otherwise divide by zero
    if original_count:
        removal_percentage = (removed_count / original_count) * 100
    else:
        removal_percentage = 0.0

    return {
        'original_count': original_count,
        'filtered_count': filtered_count,
        'removed_count': removed_count,
        'removal_percentage': removal_percentage,
    }

# Example usage: run the filter with its default path and threshold, then
# print every field of the returned summary on its own line.
summary = filter_rare_names()
print("\nOperation Summary:")
for key, value in summary.items():
    print(f"{key}: {value}")

Filtered data written back to ./data/all-names-enriched.csv
Names removed: 23730
Names remaining: 73967

Operation Summary:
original_count: 97697
filtered_count: 73967
removed_count: 23730
removal_percentage: 24.289384525625145


In [3]:
# Non-Debug Printing Version

import pandas as pd
from tqdm import tqdm
from langchain_openai import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

def enrich_baby_names_data(file='./data/all-names-enriched.csv', limit=1000, min_occurrences=10):
    """Fill the ``nationality`` and ``meaning`` columns of a names CSV using the chat model.

    Rows are selected when ``meaning`` is missing and ``n_sum`` is strictly
    greater than ``min_occurrences``. Each distinct name is looked up once and
    the answer is written to every row carrying that name. The CSV is
    rewritten after each successful lookup so an interrupted run can be
    resumed by calling the function again.

    Args:
        file: CSV to read and overwrite in place. Needs ``name`` and ``n_sum``
            columns; ``nationality``/``meaning`` are created if absent.
        limit: Maximum number of distinct names to look up in this call.
        min_occurrences: Names with ``n_sum`` at or below this value are skipped.

    Returns:
        dict: ``total_names`` (rows in the file), ``processed_names`` (rows
        with a non-missing ``meaning``) and ``unique_nationalities``.
    """
    # Load the data
    df = pd.read_csv(file)
    print(f"Loaded {len(df)} names from {file}")

    # Columns that are entirely empty are read back from CSV as float64 NaN.
    # Cast them to object so strings can be stored without pandas'
    # incompatible-dtype warning (an error in newer pandas versions).
    for column in ('nationality', 'meaning'):
        if column not in df.columns:
            df[column] = None
    df = df.astype({'nationality': object, 'meaning': object})

    # Initialize the OpenAI model
    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

    # Define the response schemas
    response_schemas = [
        ResponseSchema(name="nationality", description="The likely nationality origin of the name"),
        ResponseSchema(name="meaning", description="The meaning or definition of the name. Include all key definition concepts of the meaning (but NOT the history), and at least a short paragraph of meaning explanation"),
    ]

    # Create the output parser
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Create a prompt template (the template text is indented on purpose: it
    # is sent to the model exactly as written)
    prompt = PromptTemplate(
        input_variables=["name"],
        partial_variables={"format_instructions": output_parser.get_format_instructions()},
        template="""Provide information about the baby name {name}. Include:
    1) Likely nationality origin
    2) The meaning or definition of the name. Include all key definition concepts of the meaning (but NOT the history), and at least a short paragraph of meaning explanation
    {format_instructions}
    I will tip you $1000 if you follow these instructions.
    """
    )

    # Create an LLMChain
    name_info_chain = LLMChain(llm=llm, prompt=prompt)

    def get_name_info(name):
        """Return the parsed model answer for ``name``, or None on any failure."""
        try:
            # The request itself is inside the try so that a single failed
            # call skips this name instead of aborting the whole batch.
            result = name_info_chain.run(name)
            return output_parser.parse(result)
        except Exception as e:
            print(f"Error processing {name}: {e}")
            return None

    # Names still lacking a meaning, de-duplicated so a name that appears on
    # several rows costs only one request; missing names are skipped because
    # they could never match a row in the update below.
    pending = df['meaning'].isna() & (df['n_sum'] > min_occurrences)
    names_to_process = df.loc[pending, 'name'].dropna().drop_duplicates().head(limit)

    failed = 0
    for name in tqdm(names_to_process, desc="Processing names"):
        new_data = get_name_info(name)
        if not new_data:
            failed += 1
            continue
        df.loc[df['name'] == name, ['nationality', 'meaning']] = [new_data['nationality'], new_data['meaning']]
        # Checkpoint: save the entire DataFrame after updating this name
        df.to_csv(file, index=False)

    if failed:
        print(f"{failed} name(s) could not be processed and remain empty")
    print(f"Finished processing. Final data written to {file}")

    # Return a summary of the data (plain ints so the dict prints cleanly)
    summary = {
        'total_names': len(df),
        'processed_names': int(df['meaning'].notna().sum()),
        'unique_nationalities': int(df['nationality'].nunique()),
    }
    print("Data Summary:", summary)
    return summary

# Example usage: limit=100000 exceeds the 17574 rows loaded in the recorded
# run below, so every name still missing a meaning is processed.
summary = enrich_baby_names_data(limit=100000)

  warn_deprecated(


Loaded 17574 names from ./data/all-names-enriched.csv


  warn_deprecated(
Processing names:   3%|████▎                                                                                                                                  | 54/1669 [01:31<41:57,  1.56s/it]Failed to batch ingest runs: LangSmithRateLimitError('Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError(\'429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Monthly unique traces usage limit exceeded"}\')')
Processing names:   7%|████████▉                                                                                                                             | 111/1669 [03:02<36:08,  1.39s/it]Failed to batch ingest runs: LangSmithRateLimitError('Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError(\'429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Monthly unique traces usage limit exceeded"}\')')
Processing names:   7

Finished processing. Final data written to ./data/all-names-enriched.csv
Data Summary: {'total_names': 17574, 'processed_names': 17574, 'unique_nationalities': 708}



