# ADSP 32018: Final Project
## Synthetic Training Data Creation

Peyton Nash

### Setup

In [5]:
# Import libraries
import re, math, gc, itertools, warnings, os, random
from dotenv import load_dotenv

import numpy as np
import pandas as pd

from collections import Counter
import openai

In [37]:
load_dotenv()

True

In [8]:
# Load the data
df = pd.read_parquet('output_data/df_industry.parquet').reset_index()

### Create Training Data

In [26]:
# Prepare text to be labeled
random.seed(42)
df_to_sample = df[df.industry != None].sample(1000)
#to_label = [{'idx': idx, 'text': text, 'industry':industry} for idx, text, industry in zip(idx_to_label, df[['text_clean', 'industry']][idx_to_label])]
to_label = [{'idx': idx, 'text': row['text_clean'], 'industry':row['industry']} for idx , row in df_to_sample.iterrows()]

print(f'Number of documents to train: {len(to_label)}')
print(f'\nNumber of training documents by industry:\n\n{Counter([item["industry"] for item in to_label])}')

Number of documents to train: 1000

Number of training documents by industry:

Counter({None: 690, 'Arts, entertainment, and recreation': 60, 'Finance and insurance': 49, 'Software and data': 49, 'Health care and social assistance': 34, 'Government': 29, 'Professional, scientific, and technical services': 15, 'Transportation and warehousing': 14, 'Information (media, telecom, publishing)': 13, 'Retail trade': 12, 'Educational services': 11, 'Accommodation and food services': 6, 'Retail Trade': 5, 'Utilities': 4, 'Real estate and rental and leasing': 3, 'Wholesale trade': 3, 'Management of companies and enterprises': 2, 'Agriculture, forestry, fishing, and hunting': 1})


In [59]:
# Label data using GPT 3.5
openai.api_key = os.environ['OPENAI_API_KEY']
client = openai.OpenAI()
gpt_model = 'gpt-3.5-turbo'

# Create function to label articles
def label_text(text: str):
    prompt = (
        f'''You are a sentiment annotator. Your task is to label a news article based on the overall sentiment it expresses toward the implementation of artificial intelligence. 
Focus only on the sentiment expressed about AI implementation.
Do not label based on unrelated topics in the article.

POSITIVE: The article suggests that AI adoption is likely to be successful
NEGATIVE: The article suggests that AI adoption is unlikely to be successful
NEUTRAL: The article has neither a strong positive or negative sentiment.

Return only one of the following labels: POSITIVE, NEGATIVE, or NEUTRAL.

Text to classify:
'''
        )
   
    response = client.chat.completions.create(
        model=gpt_model,
        messages=[
            {"role":"system", "content":prompt},
            {"role": "user", "content": text}],
        temperature=1,
        max_completion_tokens = 5
    )

    return response.choices[0].message.content

In [None]:
# Create counters
n_pos = 0
n_neg = 0
n_net = 0
n_format = 0
n_label = 0

# Create empty containers
labels = {}
misformat = []

for i, obs in enumerate(to_label[803:]):
    if i % 100 == 0:
        print(f'{i} observations completed')

    label = re.sub(r'[^A-Z]', '', label_text(obs['text']).upper().strip())

    # Add to label dictionary if the output is formatted correctly
    if re.search(r'POSITIVE|NEGATIVE|NEUTRAL', label):

        labels[obs['idx']] = label
        n_label += 1

        # Add to class counters
        if label == 'POSITIVE':
            n_pos += 1
        if label == 'NEGATIVE':
            n_neg += 1
        if label == 'NEUTRAL':
            n_net += 1

        # End if there are more than 150 observations in each class
        if n_net > 150 & n_neg > 150 & n_net > 150:
            break
    
    # Add id to list of mislabelled
    else:
        n_format = 0
        misformat.append(obs['idx'])

    # End if more than 10% of observations are not returning the correct format
    if n_label > 20 and n_format/n_label > .1:
        print('Too many misformats')
        break

In [None]:
# Create labeled list
labeled = [{**item, 'sentiment':labels.get(item['idx'], None)} for item in to_label]

In [93]:
pd.DataFrame(labeled).to_parquet('output_data/labeled.parquet')