### Setup


In [1]:
# GPT library
import openai
import tiktoken
# data libraries
import time
import random
import pandas as pd
import numpy as np
# other libraries
import os
from tqdm.notebook import tqdm # progress bar
import configparser
import pprint

# Silence pandas' SettingWithCopyWarning (option default is 'warn'); later cells rely on this.
pd.options.mode.chained_assignment = None

# Load the OpenAI API key from a local ini file so the secret stays out of the notebook.
# ConfigParser.read() silently skips files it cannot find, so check what was actually
# read and fail with a clear message instead of an opaque NoSectionError further down.
config = configparser.ConfigParser()
if not config.read('api_key.ini'):
    raise FileNotFoundError("api_key.ini not found; it must provide the key 'tim_key' in section [openai]")
openai.api_key = config.get('openai', 'tim_key')

### Data and Model loading

We want to work with the best possible input descriptions for text generation. Therefore, we take the raw data of the 'corrupted' dataset and replace its industry labels with the cleaned ones from the 'label_correction' dataset.

In [2]:
# Read both raw exports: the 'corrupted' dataset (its own industry column is
# discarded right away) and the 'label_correction' dataset with the cleaned labels.
df_corr = pd.read_csv("../data/corrupted/raw_data.csv").drop(columns=['industry'])
df_ind = pd.read_csv("../data/label_correction/raw_data.csv")

# Attach the cleaned industry label to every corrupted row, matched on startup_ID.
df = df_corr.merge(
    df_ind[['startup_ID', 'industry']],
    how='left',
    on='startup_ID',
)

# Single description column: use the startupdetector text where present,
# otherwise fall back to startup_description.
df['description'] = df['description_startupdetector'].fillna(df['startup_description'])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3969 entries, 0 to 3968
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   startup_ID                   3969 non-null   int64 
 1   description_startupdetector  607 non-null    object
 2   startup_description          3832 non-null   object
 3   industry                     3969 non-null   object
 4   description                  3969 non-null   object
dtypes: int64(1), object(4)
memory usage: 155.2+ KB


### Data preprocessing

In [3]:
# split the descriptions of one industry into consecutive batches
def generate_batches_for_industry(df, industry: str, batchsize: int):
    """Return the descriptions of `industry` as a list of batches.

    Rows of `df` whose 'industry' column equals `industry` are selected and their
    'description' values are cut, in row order, into lists of `batchsize` items.
    The last batch holds the remainder and may be shorter.
    """
    is_target = df['industry'] == industry
    industry_descriptions = df.loc[is_target, 'description'].tolist()

    # ceiling division: number of batches needed to hold every description
    batch_count = (len(industry_descriptions) + batchsize - 1) // batchsize

    batches = []
    for batch_idx in range(batch_count):
        start = batch_idx * batchsize
        batches.append(industry_descriptions[start:start + batchsize])

    return batches

# count how many tokens are in one request
def num_tokens_from_string(string: str, model_name: str) -> int:
    """Count the tokens `string` encodes to with the tiktoken encoding for `model_name`."""
    tokenizer = tiktoken.encoding_for_model(model_name)
    return len(tokenizer.encode(string))

### Generating Text

In [4]:
# generating input function
def generate_input(descriptions: list, industry: str, keywords: list, excluded_ind: str) -> str:
  """Build the ChatGPT prompt asking for new startup descriptions of one industry.

  Args:
    descriptions: example startup descriptions shown to the model.
    industry: name of the target industry.
    keywords: keywords that define the industry; joined with ", " in the prompt.
    excluded_ind: text naming the sectors the generated examples must not belong to.

  Returns:
    The prompt text. It requests twice as many descriptions as examples given,
    asks for csv output, and ends with the numbered examples, each followed by '##'.
  """

  def _add_examples(examples: list) -> str:
    """Render the examples as a list numbered from 1, each entry terminated by '##'."""
    numbered = [f'\nExample {i}: {example}\n##' for i, example in enumerate(examples, start=1)]
    return 'Examples:' + ''.join(numbered)

  keyword_list = ", ".join(keywords)
  # ask for twice as many outputs as examples shown (no need to build a doubled list to count)
  n_requested = 2 * len(descriptions)

  # named `prompt` rather than `input` so the built-in input() is not shadowed
  prompt = f'''we define the {industry} industry by these keywords:

"""{keyword_list}"""

Generate {n_requested} examples for startup descriptions in the {industry} industry. Descriptions should be in the same domain as the examples provided below. Examples can not be in the {excluded_ind} sector. The output should contain the descriptions in csv format.

{_add_examples(descriptions)}'''

  return prompt


# Sample inputs for a dry run of the prompt builder: two example descriptions plus
# the target industry, the keywords defining it and the sectors to exclude.
descriptions = ['he customer borrows the shopping bag instead of buying it. He just pays a small fee and uses our product with full service and on demand. The customer can return the bag when it is no longer needed or dirty. The joeybags stay in the system and are used over and over again.',
                'The idea of Urban Hochbeet germinated in the summer of 2020- in the midst of the Corona pandemic. The project is a complete package that offers a raised bed, fresh plants, suitable soil, fertilizer and care instructions.']
industry = 'Sustainability and GreenTech'
keywords = ['Sustainability', 'Recycling', 'AgrarTech', 'Sharing economy', 'Water management', 'CleanTech', 'Forest economy']
excluded_ind = 'mobility, energy, supply chain and construction'


# Disabled API call, kept for reference only; this cell just prints the prompt
# that would be sent as the user message.
# completion = openai.ChatCompletion.create(
#   model="gpt-3.5-turbo",
#   messages=[
#     {"role": "user", "content": generate_input(descriptions, industry, keywords, excluded_ind)}
#   ]
# )
print(generate_input(descriptions, industry, keywords, excluded_ind))

we define the Sustainability and GreenTech industry by these keywords:

"""Sustainability, Recycling, AgrarTech, Sharing economy, Water management, CleanTech, Forest economy"""

Generate 4 examples for startup descriptions in the Sustainability and GreenTech industry. Descriptions should be in the same domain as the examples provided below. Examples can not be in the mobility, energy, supply chain and construction sector. The output should contain the descriptions in csv format.

Examples:
Example 1: he customer borrows the shopping bag instead of buying it. He just pays a small fee and uses our product with full service and on demand. The customer can return the bag when it is no longer needed or dirty. The joeybags stay in the system and are used over and over again.
##
Example 2: The idea of Urban Hochbeet germinated in the summer of 2020- in the midst of the Corona pandemic. The project is a complete package that offers a raised bed, fresh plants, suitable soil, fertilizer and care

In [5]:
# Check the prompt size: print the token count of the prompt built from every
# batch of 10 'AR & VR' descriptions.
industry = 'AR & VR'
batches = generate_batches_for_industry(df, industry, 10)

# Prompt inputs are the same for every batch, so they are defined once, outside the loop.
# NOTE(review): these are the Sustainability keywords, not AR & VR ones. They are kept
# unchanged so the counts stay comparable; swap in AR & VR keywords if exact numbers matter.
keywords = ['Sustainability', 'Recycling', 'AgrarTech', 'Sharing economy', 'Water management', 'CleanTech', 'Forest economy']
excluded_ind = 'health, medicine, production, education, retail, mobility, construction, supply chain and finance'

for batch in batches:
    prompt = generate_input(descriptions=batch, industry=industry, keywords=keywords, excluded_ind=excluded_ind)
    print(num_tokens_from_string(prompt, "gpt-3.5-turbo"))

619
657
664
721
402


In [14]:
industry = 'Mobility & Transportation'

# Small batches of 5 example descriptions per request.
batches = generate_batches_for_industry(df, industry, 5)

keywords = ['Automotive', 'Smart Mobility', 'Aviation', 'Autonomous driving', 'Micromobility', 'Charging station', 'SpaceTech', 'Rail transport', 'Mobility / Transport']
excluded_ind = 'supplychain, energy, education, retail, logistics and construction'

MAX_ATTEMPTS = 4      # API attempts per batch before the batch is given up
RETRY_DELAY_SEC = 20  # pause between two attempts after an API error

# Collected raw response texts, one entry per successfully processed batch.
responses = []

for batch_no, batch in enumerate(tqdm(batches)):
    # The prompt does not change between retries, so build it once per batch.
    promt = generate_input(batch, industry, keywords, excluded_ind)

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = openai.ChatCompletion.create(model="gpt-3.5-turbo",
                                                    messages=[{"role": "user", "content": promt}],
                                                    temperature=1.25,
                                                    max_tokens=1024,
                                                    top_p=1,
                                                    frequency_penalty=0,
                                                    presence_penalty=0
                                                    )
        except openai.OpenAIError as e:
            if attempt == MAX_ATTEMPTS:
                # Report the dropped batch instead of skipping it silently,
                # and do not sleep when no further attempt follows.
                print(f"Error: {e}. Giving up on batch {batch_no} after {MAX_ATTEMPTS} attempts.")
                break
            print(f"Error: {e}. Retrying in {RETRY_DELAY_SEC} seconds.")
            time.sleep(RETRY_DELAY_SEC)
        else:
            response_text = response.choices[0].message.content
            responses.append(response_text)
            break

  0%|          | 0/4 [00:00<?, ?it/s]

### Saving

In [12]:
# Write all generated responses to a text file, each followed by a newline.
# open(..., 'w') does not create missing folders, so make sure the target exists.
os.makedirs('./generated', exist_ok=True)
# `with` closes the file even if a write fails; utf-8 is set explicitly because the
# generated text may contain characters the platform default encoding cannot store.
with open(f'./generated/{industry}_responses_2.txt', 'w', encoding='utf-8') as file:
    for response in responses:
        file.write(response + "\n")