### Construct DataFrame

In [1]:
import pandas as pd
import json

def dataframe_construct(input_file_path, output_file_path):
    """Build a (Story_ID, Body_text) table from a JSON list of stories and save it as CSV.

    The JSON file is expected to hold a list of strings, one per story.
    Stories are numbered from 1 in file order, leading/trailing double quotes
    are stripped from each text, and the table is written to
    ``output_file_path`` with a header row and no index column. The CSV is
    then read back and the first 10 whitespace-separated words of every story
    are printed as a quick sanity check.

    Args:
        input_file_path: Path of the UTF-8 encoded JSON file to read.
        output_file_path: Path of the CSV file to write.

    Returns:
        None. The results are the CSV file and the printed preview.
    """
    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Remove leading/trailing quotes from every story
    body_texts = [story.strip('"') for story in data]

    # Story IDs are the 1-based positions of the stories in the JSON list
    df = pd.DataFrame({
        'Story_ID': list(range(1, len(body_texts) + 1)),
        'Body_text': body_texts,
    })

    # Save DataFrame to CSV with headers and without index column
    df.to_csv(output_file_path, index=False, header=True)

    # Read the file back so the preview reflects what was actually written.
    # keep_default_na=False plus a str dtype keeps every body as text: with the
    # pandas defaults an empty story (or one that reads "NA"/"null") comes back
    # as float NaN and the .split() below raises AttributeError.
    df_csv = pd.read_csv(
        output_file_path,
        keep_default_na=False,
        dtype={'Body_text': str},
    )

    print("Printing first 10 words of each story:")
    for story_id, body_text in zip(df_csv['Story_ID'], df_csv['Body_text']):
        first_10_words = ' '.join(body_text.split()[:10])
        print(f"Story {story_id}, {first_10_words}...")
        print()  # Separate each story

# Example usage: build the story CSV from the JSON input file
input_file_path, output_file_path = 'tokenizedFine_500.json', 'df_construct_500.csv'
dataframe_construct(input_file_path=input_file_path, output_file_path=output_file_path)


Printing first 10 words of each story:
Story 1, bengaluru infosys solid foundation grow back large deal bagged fiscal...

Story 2, december staff american writer artist institute membership organization copywriter realized...

Story 3, bengaluru genz prioritizes professional growth competitive pay workplace consideration continuing...

Story 4, new technology upend many online business company figure work boon...

Story 5, share u tech giant buzzing fund scorching performance chart mirae...

Story 6, every tuesday friday ezra klein invite conversation something matter today...

Story 7, door shut andrew return mark andrew raven star tight end...

Story 8, doha mekki one president biden key antitrust enforcer talk justice...

Story 9, general election campaign begin earnest expect disinformation attack target voter...

Story 10, reddit social medium news aggregation platform million active daily user...

Story 11, tech giant partner openai accused infringing copyright train technology o

### Construct the Variables

In [7]:
import pandas as pd
import re

def count_occurrences(text, phrase):
    """Count whole-word, case-insensitive occurrences of ``phrase`` in ``text``.

    The phrase is matched literally: regex metacharacters in it (``.``, ``+``,
    ``(`` ...) have no special meaning. A match must not be directly preceded
    or followed by a word character, so ``risk`` is not counted inside
    ``risky``.

    Args:
        text: The string to search.
        phrase: The literal word or phrase to look for.

    Returns:
        The number of non-overlapping matches; 0 when ``phrase`` is empty.
    """
    if not phrase:
        # An empty pattern would match at every boundary position
        return 0

    # Lookarounds instead of \b: for phrases that start and end with a word
    # character they are equivalent, and they also work when the phrase starts
    # or ends with punctuation (e.g. "c++"), where \b would never match.
    pattern = rf'(?<!\w){re.escape(phrase)}(?!\w)'
    return len(re.findall(pattern, text, flags=re.IGNORECASE))

def variable_construct(input_file_path, output_file_path, phrases):
    """Count phrase occurrences per story and save the counts as a CSV.

    Reads a CSV that has a ``Body_text`` column, counts in every row how often
    each configured phrase occurs (via ``count_occurrences``), stores the
    counts in the columns ``risk``, ``privacy``, ``job`` and ``wage``, drops
    ``Body_text`` and writes the remaining columns to ``output_file_path``
    without the index.

    Args:
        input_file_path: Path of the CSV file to read.
        output_file_path: Path of the CSV file to write.
        phrases: Mapping with the keys ``'risk'``, ``'privacy'``, ``'job'`` and
            ``'wage'``; each value is the phrase counted for that column.

    Returns:
        None. The result is the CSV written to ``output_file_path``.

    Raises:
        KeyError: If ``phrases`` lacks one of the four keys, or the input CSV
            has no ``Body_text`` column.
    """
    # keep_default_na=False plus a str dtype keeps every body as text; with the
    # pandas defaults an empty story is read as float NaN, which the regex
    # search cannot handle.
    df = pd.read_csv(
        input_file_path,
        keep_default_na=False,
        dtype={'Body_text': str},
    )

    # One column of per-row counts for each variable. Every column is built
    # from its own list, so no row can pick up another row's count.
    for column in ('risk', 'privacy', 'job', 'wage'):
        phrase = phrases[column]
        df[column] = [count_occurrences(body_text, phrase) for body_text in df['Body_text']]

    # The raw text is not part of the output table
    df = df.drop(columns=['Body_text'])

    df.to_csv(output_file_path, index=False)

# Example usage
# NOTE(review): the DataFrame-construction cell writes 'df_construct_500.csv',
# while this cell reads 'df_construct_300.csv' — confirm that file exists and
# is the intended input.
input_file_path, output_file_path = 'df_construct_300.csv', 'df_variables_300.csv'

# Phrases to count; each output column is named after the word it counts
phrases = {word: word for word in ('risk', 'privacy', 'job', 'wage')}

variable_construct(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
    phrases=phrases,
)
