# Data Preprocessing

### Imports

In [1]:
import pandas as pd
import glob

### Preprocessing

In [2]:
# Load All Data Content
# glob returns paths in a filesystem-dependent order; sort them so the row
# order of the combined frame is reproducible from run to run.
ds_list = sorted(glob.glob('../data/raw/*.csv'))
if not ds_list:
    # Fail with a clear message instead of pd.concat's opaque
    # "No objects to concatenate" error on an empty list.
    raise FileNotFoundError("No CSV files found matching '../data/raw/*.csv'")

# One DataFrame per raw CSV file
all_ds = [pd.read_csv(ds) for ds in ds_list]

# Stack every file and drop rows that are exact duplicates across all columns
combine_df = pd.concat(all_ds).drop_duplicates()

In [3]:
# Load WSJ Metadata
meta_df = pd.read_csv('../data/meta/wsj_meta.csv')

# Lower-cased copy of the title, used for case-insensitive company matching.
# The vectorized .str accessor replaces a per-row lambda, and fillna('') keeps
# a missing title from raising AttributeError (NaN is a float with no .lower());
# such rows get an empty string and therefore match no company pattern.
meta_df['Title_small'] = meta_df['Title'].fillna('').str.lower()

In [4]:
# Lower-case regex patterns for the companies to look for in article titles.
# The last two are wrapped in \b so they only match as whole words.
COMPANIES = [
    'jpm',
    'goldman',
    'mastercard',
    'stanley',
    'microsoft',
    'apple',
    'alphabet',
    'google',
    'salesforce',
    'tesla',
    'boeing',
    r'\bgm\b',
    r'\bford\b',
]

# Build one filtered metadata frame per pattern, in COMPANIES order
all_filter = [
    meta_df.loc[meta_df['Title_small'].str.contains(pattern)]
    for pattern in COMPANIES
]

# Stack the per-company matches. A title matching several patterns shows up
# once per pattern, so keep only the first row for each GOID.
wsj_df = pd.concat(all_filter).drop_duplicates(subset='GOID')

In [5]:
# Row count of the filtered metadata (one row per GOID after de-duplication)
wsj_df.shape[0]

4030

In [6]:
# Join the filtered metadata to the downloaded content on GOID
# (default inner join: only GOIDs present in both frames survive)
articles_with_content = wsj_df.merge(combine_df, on='GOID')

# Metadata rows whose GOID has no content yet still need to be downloaded;
# write them to the intermediate folder
has_content = wsj_df['GOID'].isin(articles_with_content['GOID'])
wsj_df[~has_content].to_csv(
    '../data/intermediate/articles_that_need_to_be_downloaded.csv',
    index=False,
)

# Save the merged articles (selected columns only) to the final folder.
# NOTE(review): 'artilces' in the filename is misspelled, but the JSON export
# cell reads this exact path -- rename both together if it is ever corrected.
output_columns = ['GOID', 'Title', 'Date', 'Content']
articles_with_content[output_columns].to_csv(
    '../data/final/artilces_with_content.csv',
    index=False,
)

# Export as JSON for MongoDB

In [1]:
import csv
import json


# NOTE(review): 'artilces' is misspelled but matches the filename written by
# the preprocessing cell; rename both together if it is ever corrected.
csv_file_path = '../data/final/artilces_with_content.csv'  # Update this to your CSV file path
json_file_path = '../data/final/articles_with_content.json'  # Update this to your desired JSON file path

# Read every CSV row into a list of {column name: value} dicts (DictReader
# yields all values as strings).
# newline='' is required by the csv module so that line breaks embedded inside
# quoted fields are parsed as part of the field rather than as row boundaries.
with open(csv_file_path, mode='r', encoding='utf-8', newline='') as csvfile:
    data = list(csv.DictReader(csvfile))

# Write the list of row dicts as a single JSON array
with open(json_file_path, mode='w', encoding='utf-8') as jsonfile:
    json.dump(data, jsonfile, indent=4)  # `indent=4` for pretty printing
