# ADSP 32018: Final Project
## Synthetic Training Data Creation

Peyton Nash

### Setup

In [5]:
# Import libraries
import re, math, gc, itertools, warnings, os, random
from dotenv import load_dotenv

import numpy as np
import pandas as pd

from collections import Counter
import openai

In [6]:
load_dotenv()

True

In [None]:
# Load the data
df = pd.read_parquet('output_data2/df_industry2.parquet').reset_index()

### Create Training Data

In [10]:
# Prepare text to be labeled
random.seed(42)
df_to_sample = df[df.industry != None].sample(2000)
#to_label = [{'idx': idx, 'text': text, 'industry':industry} for idx, text, industry in zip(idx_to_label, df[['text_clean', 'industry']][idx_to_label])]
to_label = [{'idx': idx, 'text': row['text_clean'], 'industry':row['industry']} for idx , row in df_to_sample.iterrows()]

print(f'Number of documents to train: {len(to_label)}')
print(f'\nNumber of training documents by industry:\n\n{Counter([item["industry"] for item in to_label])}')

Number of documents to train: 2000

Number of training documents by industry:

Counter({None: 1434, 'Government': 108, 'Finance and insurance': 94, 'Arts, entertainment, and recreation': 84, 'Professional, scientific, and technical services': 53, 'Software and data': 50, 'Retail trade': 44, 'Health care and social assistance': 36, 'Educational services': 23, 'Transportation and warehousing': 23, 'Information (media, telecom, publishing)': 21, 'Accommodation and food services': 9, 'Utilities': 6, 'Management of companies and enterprises': 5, 'Agriculture, forestry, fishing, and hunting': 4, 'Wholesale trade': 3, 'Manufacturing': 3})


In [11]:
# Label data using GPT 3.5
openai.api_key = os.environ['OPENAI_API_KEY']
client = openai.OpenAI()
gpt_model = 'gpt-3.5-turbo'

# Create function to label articles
def label_text(text: str):
    prompt = (
        f'''You are a sentiment annotator. Your task is to label a news article based on the overall sentiment it expresses toward the implementation of artificial intelligence. 
Focus only on the sentiment expressed about AI implementation.
Do not label based on unrelated topics in the article.

POSITIVE: The article suggests that AI adoption is likely to be successful
NEGATIVE: The article suggests that AI adoption is unlikely to be successful
NEUTRAL: The article has neither a strong positive or negative sentiment.

Return only one of the following labels: POSITIVE, NEGATIVE, or NEUTRAL.

Text to classify:
'''
        )
   
    response = client.chat.completions.create(
        model=gpt_model,
        messages=[
            {"role":"system", "content":prompt},
            {"role": "user", "content": text}],
        temperature=1,
        max_completion_tokens = 5
    )

    return response.choices[0].message.content

In [None]:
# Create counters
n_pos = 0
n_neg = 0
n_net = 0
n_format = 0
n_label = 0

# Create empty containers
labels = {}
misformat = []

for i, obs in enumerate(to_label):
    if i % 50 == 0:
        print(f'{i} observations completed')
        print(n_pos, ', ', n_neg, ', ', n_net)

    label = re.sub(r'[^A-Z]', '', label_text(obs['text']).upper().strip())

    # Add to label dictionary if the output is formatted correctly
    if re.search(r'POSITIVE|NEGATIVE|NEUTRAL', label):

        labels[obs['idx']] = label
        n_label += 1

        # Add to class counters
        if label == 'POSITIVE':
            n_pos += 1
        if label == 'NEGATIVE':
            n_neg += 1
        if label == 'NEUTRAL':
            n_net += 1

        # End if there are more than 150 observations in each class
        if n_net > 200 and n_neg > 200 and n_net > 200:
            print('Counts met')
            break
    
    # Add id to list of mislabelled
    else:
        n_format += 1
        misformat.append(obs['idx'])

    # End if more than 10% of observations are not returning the correct format
    if n_label > 20 and n_format/n_label > .1:
        print('Too many misformats')
        break

0 observations completed
638 ,  144 ,  408
50 observations completed
662 ,  152 ,  426
100 observations completed
685 ,  163 ,  442
150 observations completed
719 ,  170 ,  451
200 observations completed
750 ,  177 ,  463
250 observations completed
779 ,  183 ,  478
300 observations completed
805 ,  187 ,  498
350 observations completed
830 ,  193 ,  517
400 observations completed
856 ,  195 ,  538


In [22]:
# Create labeled list
labeled = [{**item, 'sentiment':labels.get(item['idx'], None)} for item in to_label]

In [24]:
pd.DataFrame(labeled).to_parquet('output_data2/labeled2.parquet')