# Assignment of NAICS codes to Kickstarter projects

In [4]:
import json
import pandas as pd
import time
import os
import re
import anthropic

In [3]:
# Create the Anthropic client.
# The API key is read from the ANTHROPIC_API_KEY environment variable so that
# no secret is ever stored in (or committed with) the notebook.
client = anthropic.Anthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY"),
)

In [3]:
# Show full cell contents and every row when a dataframe is displayed
for option_name in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(option_name, None)

In [4]:
# File path for input data file
input_path = '../data/csv_files/bestm_cleaned.csv'

# Output path to store results in csv file format
output_path = '../data/processed/claude'

# Create the output directory (and any missing parents).
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race between a separate existence check and the creation.
os.makedirs(output_path, exist_ok=True)

In [5]:
# Extract the input file's base name without its extension; it is used as the
# prefix of every output file. os.path handles the platform's path separator
# and strips only the final extension, so 'data.v2.csv' becomes 'data.v2'
# rather than 'data'.
filename_with_extension = os.path.basename(input_path)
filename = os.path.splitext(filename_with_extension)[0]

In [6]:
# Set up function for API call
def get_completion(prompt, model="claude-3-opus-20240229", max_tokens=4096, temperature=0.0):
    """Send a single-turn user prompt to the model and return the reply text.

    Parameters
    ----------
    prompt : str
        Text sent as the only user message.
    model : str, optional
        Model identifier passed to the API.
    max_tokens : int, optional
        Upper bound on the number of tokens in the reply.
    temperature : float, optional
        Sampling temperature; the default of 0.0 is used to make repeated
        runs as consistent as possible.

    Returns
    -------
    str
        Text of the first content block of the reply.

    Raises
    ------
    ValueError
        If the reply contains no content blocks.
    """
    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    # Fail with a clear message instead of an opaque IndexError
    if not message.content:
        raise ValueError('The model returned a reply with no content blocks')
    return message.content[0].text

# Assign NAICS codes

In [7]:
# Instruction text placed at the start of every prompt sent to Claude.
# It asks for a 4-digit 2017 NAICS code returned as a JSON object with the
# single key "naics_code"; the response parsing further down relies on that key.
instructions = '''
You are an industry expert with deep and extensive knowledge 
and understanding of the North American Industry Classification System
(NAICS). 

Your job is to find the most appropriate 2017 NAICS code for a business idea,
given the business' name, description, business category, 
and business subcategory. 

Find the closest 4-digit 2017 NAICS code
for a business with the given characteristics.

Return the result in valid JSON object format,
with the single key "naics_code".

Return only the JSON object containing the NAICS code, 
and no other text, code, or explanation.
'''

In [8]:
# Read contents of input file into dataframe.
# The human rater's NAICS code is read as text: if the column contained any
# blank cell pandas would otherwise parse it as float, and the later string
# conversion would produce values such as '5112.0' that never match the
# codes returned by the model.
df = pd.read_csv(input_path, dtype={'CODE1': str})
# Replace missing values by empty strings so they render cleanly in the prompt
df = df.fillna('')
# Ensure the human rater NAICS code is a plain string
df['CODE1'] = df['CODE1'].astype(str)

In [9]:
# Preview the input columns used in this notebook:
# project ID, name, blurb, category, subcategory, and the human rater's code
display_columns = ['id', 'name', 'blurb', 'category', 'subcategory', 'CODE1']
df[display_columns]

Unnamed: 0,id,name,blurb,category,subcategory,CODE1
0,1038000179,Tube Shoot,Invade your favorite TV shows with guns blazing in this highly original shooter game for your mobile device.,Games,Video games,5112
1,1042414789,"Adam & Paul Save the Whole, Entire Apartment Complex","Two pop culture geeks set out to save their apartment complex during the zombie apocalypse, because saving the world is too much work!",Film & video,Webseries,5121
2,1060166680,Inspired,"Inspired by a lifetime of collaboration, I am finally recording an album of my own with the help of great musicians & friends.",Music,Jazz,7111
3,1070560167,More Business of Being Born - Ricki Lake and Abby Epstein,Filmmakers Ricki Lake and Abby Epstein take a deeper look into the controversies and choices surrounding modern childbirth.,Film & video,Documentary,5121
4,1073620864,Rooted in Alameda,"A multimedia project lifting up the history of African Americans in Alameda, California.",Journalism,Audio,5191
5,1098244109,Ghost Vinyl: Insert Meme Here,"Just some kids from butthole, Arizona, looking to become a great indie band...in the eyes of an angel, we need you.",Music,Indie rock,7111
6,1106106046,SnapPower ConnectLight,"Linkable, motion activated pathway lighting...that installs in seconds. Transform the way you light your home at night!",Technology,,3351
7,1111970353,Swing Lab - Modular Porch Swing,A modular porch swing that allows the user to configure multiple combinations of seating arrangements.,Crafts,Woodworking,3371
8,1137873695,Hugo the Happy Starfish: THE LAST BULLY,Hugo the Happy Starfish is an educational children’s book series by Suzy Liebermann featuring THE LAST BULLY and the Anti-Bully Club,Publishing,Children's books,5111
9,1139821552,The Funklet,Graphic notations of twenty classic funk beats.,Music,,7111


In [10]:
# Lists to store 4-digit 2017 NAICS codes and their descriptions.
# NOTE(review): neither list is referenced again in the visible notebook —
# confirm whether they are still needed.
naics_4_digit_codes = []
naics_4_digit_descriptions = []

In [11]:
%%time

# Number of runs
num_runs = 3

# List to store result dataframes
list_df = []

for run in range(num_runs):

    print(f'Starting run {run} ...')
    
    # List to collect json objects
    list_json = []
    
    for index, row in df.iterrows():
        
        if index%10==0:
            print(f'Processing row {index} ...')
        
        name = row['name']
        blurb = row['blurb']
        category = row['category']
        subcategory = row['subcategory']

        # Gather relevant business characteristics from input data
        business_characteristics = f'''
            Name: {name}
            Description: {blurb}
            Business category: {category}
            Business subcategory: {subcategory}
            '''

        # Create prompt
        prompt = instructions + ' Business characteristics: ' + business_characteristics

        # Call the API
        response = get_completion(prompt)

        # Post-processing
        result = re.search(r'\{.*\}', response, re.DOTALL)
        json_data_string = result.group()

        # Read data into JSON object 
        data = json.loads(json_data_string)

        # Append to list of JSON objects
        list_json.append(data)
        
    # Output dataframe from this run
    df_run = pd.DataFrame(list_json)

    # Save dataframe to csv file
    df_run.to_csv(f'{output_path}/{filename}_run_{run}.csv', index=False)

    # Append to list of dataframes
    list_df.append(df_run)
    
    print(f'Finished run {run}.\n')

Starting run 0 ...
Processing row 0 ...
Processing row 10 ...
Processing row 20 ...
Processing row 30 ...
Processing row 40 ...
Processing row 50 ...
Processing row 60 ...
Processing row 70 ...
Processing row 80 ...
Processing row 90 ...
Processing row 100 ...
Processing row 110 ...
Processing row 120 ...
Processing row 130 ...
Processing row 140 ...
Finished run 0.

Starting run 1 ...
Processing row 0 ...
Processing row 10 ...
Processing row 20 ...
Processing row 30 ...
Processing row 40 ...
Processing row 50 ...
Processing row 60 ...
Processing row 70 ...
Processing row 80 ...
Processing row 90 ...
Processing row 100 ...
Processing row 110 ...
Processing row 120 ...
Processing row 130 ...
Processing row 140 ...
Finished run 1.

Starting run 2 ...
Processing row 0 ...
Processing row 10 ...
Processing row 20 ...
Processing row 30 ...
Processing row 40 ...
Processing row 50 ...
Processing row 60 ...
Processing row 70 ...
Processing row 80 ...
Processing row 90 ...
Processing row 100 ...

In [12]:
# Load NAICS code file
naics_file_path = '../data/naics_2017_codes.csv'  

# Read file into dataframe.
# Codes are read as text because the description lookup is keyed by the
# string form of a code; letting pandas infer an integer column would make
# every lookup miss.
df_naics = pd.read_csv(naics_file_path, dtype={'code': str})

In [13]:
# Display the NAICS reference table (columns: code, description)
df_naics

Unnamed: 0,code,description
0,11,"Agriculture, Forestry, Fishing and Hunting"
1,111,Crop Production
2,1111,Oilseed and Grain Farming
3,11111,Soybean Farming
4,111110,Soybean Farming
5,11112,Oilseed (except Soybean) Farming
6,111120,Oilseed (except Soybean) Farming
7,11113,Dry Pea and Bean Farming
8,111130,Dry Pea and Bean Farming
9,11114,Wheat Farming


### Find NAICS code descriptions corresponding to numerical codes

In [14]:
# Create dictionary for NAICS code description lookup.
# Keys are forced to stripped strings so that the lookup, which queries with
# the string form of a code, works regardless of the dtype pandas inferred
# for the 'code' column.
naics_dict = dict(zip(df_naics['code'].astype(str).str.strip(), df_naics['description']))

# Function to map a NAICS code to its description
def map_naics_to_description(code):
    """Return the description of a NAICS code, or 'Code not found'.

    Parameters
    ----------
    code : str, int or float
        NAICS code to look up. Whole-number floats (e.g. 5112.0) and
        surrounding whitespace are tolerated.

    Returns
    -------
    str
        The description stored in `naics_dict` for the code, or the literal
        'Code not found' when the code is not a key of the dictionary.
    """
    # A code that went through a float column would stringify as '5112.0'
    if isinstance(code, float) and code.is_integer():
        code = int(code)
    return naics_dict.get(str(code).strip(), 'Code not found')

### Create output dataframe

In [15]:
# Select columns from input dataframe
df_orig = df[['id', 'name', 'blurb', 'category', 'subcategory', 'CODE1']].rename(columns={'CODE1': 'human coder code'})

# Column names for the generative AI runs, numbered from 1.
# They are derived from the runs actually collected, so the cell works for
# any number of runs instead of assuming exactly three.
genai_code_columns = [f'genai run {run_number} code' for run_number in range(1, len(list_df) + 1)]

# Select the NAICS code column from every run and give it its run-specific name
df_runs = [
    run_result[['naics_code']].rename(columns={'naics_code': column_name})
    for run_result, column_name in zip(list_df, genai_code_columns)
]

# Concatenate these columns into a single dataframe (rows are matched by index)
df_concat = pd.concat([df_orig, *df_runs], axis=1)

# Columns containing NAICS codes
naics_code_columns = ['human coder code'] + genai_code_columns

# Add columns containing descriptions of NAICS codes
for col in naics_code_columns:
    df_concat[f'{col} description'] = df_concat[col].map(map_naics_to_description)

In [16]:
# Write the combined human / generative AI table to the output directory
genai_csv_path = f'{output_path}/{filename}_genai.csv'
df_concat.to_csv(genai_csv_path, index=False)

# Display NAICS codes

In [17]:
# Show the combined table: input columns, the human and generative AI codes,
# and the description looked up for each code
df_concat

Unnamed: 0,id,name,blurb,category,subcategory,human coder code,genai run 1 code,genai run 2 code,genai run 3 code,human coder code description,genai run 1 code description,genai run 2 code description,genai run 3 code description
0,1038000179,Tube Shoot,Invade your favorite TV shows with guns blazing in this highly original shooter game for your mobile device.,Games,Video games,5112,5112,5112,5112,Software Publishers,Software Publishers,Software Publishers,Software Publishers
1,1042414789,"Adam & Paul Save the Whole, Entire Apartment Complex","Two pop culture geeks set out to save their apartment complex during the zombie apocalypse, because saving the world is too much work!",Film & video,Webseries,5121,5121,5121,5121,Motion Picture and Video Industries,Motion Picture and Video Industries,Motion Picture and Video Industries,Motion Picture and Video Industries
2,1060166680,Inspired,"Inspired by a lifetime of collaboration, I am finally recording an album of my own with the help of great musicians & friends.",Music,Jazz,7111,5122,5122,5122,Performing Arts Companies,Sound Recording Industries,Sound Recording Industries,Sound Recording Industries
3,1070560167,More Business of Being Born - Ricki Lake and Abby Epstein,Filmmakers Ricki Lake and Abby Epstein take a deeper look into the controversies and choices surrounding modern childbirth.,Film & video,Documentary,5121,5121,5121,5121,Motion Picture and Video Industries,Motion Picture and Video Industries,Motion Picture and Video Industries,Motion Picture and Video Industries
4,1073620864,Rooted in Alameda,"A multimedia project lifting up the history of African Americans in Alameda, California.",Journalism,Audio,5191,5122,5122,5122,Other Information Services,Sound Recording Industries,Sound Recording Industries,Sound Recording Industries
5,1098244109,Ghost Vinyl: Insert Meme Here,"Just some kids from butthole, Arizona, looking to become a great indie band...in the eyes of an angel, we need you.",Music,Indie rock,7111,5122,5122,5122,Performing Arts Companies,Sound Recording Industries,Sound Recording Industries,Sound Recording Industries
6,1106106046,SnapPower ConnectLight,"Linkable, motion activated pathway lighting...that installs in seconds. Transform the way you light your home at night!",Technology,,3351,3359,3359,3359,Electric Lighting Equipment Manufacturing,Other Electrical Equipment and Component Manufacturing,Other Electrical Equipment and Component Manufacturing,Other Electrical Equipment and Component Manufacturing
7,1111970353,Swing Lab - Modular Porch Swing,A modular porch swing that allows the user to configure multiple combinations of seating arrangements.,Crafts,Woodworking,3371,3371,3371,3371,Household and Institutional Furniture and Kitchen Cabinet Manufacturing,Household and Institutional Furniture and Kitchen Cabinet Manufacturing,Household and Institutional Furniture and Kitchen Cabinet Manufacturing,Household and Institutional Furniture and Kitchen Cabinet Manufacturing
8,1137873695,Hugo the Happy Starfish: THE LAST BULLY,Hugo the Happy Starfish is an educational children’s book series by Suzy Liebermann featuring THE LAST BULLY and the Anti-Bully Club,Publishing,Children's books,5111,5111,5111,5111,"Newspaper, Periodical, Book, and Directory Publishers","Newspaper, Periodical, Book, and Directory Publishers","Newspaper, Periodical, Book, and Directory Publishers","Newspaper, Periodical, Book, and Directory Publishers"
9,1139821552,The Funklet,Graphic notations of twenty classic funk beats.,Music,,7111,5192,5192,5192,Performing Arts Companies,Code not found,Code not found,Code not found


# Check consistency of outputs from generative AI

In [18]:
# Share of rows on which run 1 and run 2 returned the same code
run_1_vs_run_2 = df_concat['genai run 1 code'].eq(df_concat['genai run 2 code'])
run_1_vs_run_2.sum() / len(run_1_vs_run_2)

1.0

In [19]:
# Share of rows on which run 2 and run 3 returned the same code
run_2_vs_run_3 = df_concat['genai run 2 code'].eq(df_concat['genai run 3 code'])
run_2_vs_run_3.sum() / len(run_2_vs_run_3)

1.0

In [20]:
# Share of rows on which run 3 and run 1 returned the same code
run_3_vs_run_1 = df_concat['genai run 3 code'].eq(df_concat['genai run 1 code'])
run_3_vs_run_1.sum() / len(run_3_vs_run_1)

1.0

# Check match between human and generative AI

In [21]:
# Share of rows on which generative AI run 1 agrees with the human coder
run_1_vs_human = df_concat['genai run 1 code'].eq(df_concat['human coder code'])
run_1_vs_human.sum() / len(run_1_vs_human)

0.5310344827586206

In [22]:
# Share of rows on which generative AI run 2 agrees with the human coder
run_2_vs_human = df_concat['genai run 2 code'].eq(df_concat['human coder code'])
run_2_vs_human.sum() / len(run_2_vs_human)

0.5310344827586206

In [23]:
# Share of rows on which generative AI run 3 agrees with the human coder
run_3_vs_human = df_concat['genai run 3 code'].eq(df_concat['human coder code'])
run_3_vs_human.sum() / len(run_3_vs_human)

0.5310344827586206