In [53]:
import pandas as pd
import os
import json
from datetime import datetime
import re

# Root folder: one sub-directory per company, one JSON file per article.
directory = "Articles"


def _load_article(file_path, company):
    """Read one article JSON file and return it as a DataFrame row.

    Parameters
    ----------
    file_path : str
        Path of a JSON file holding the keys "_id", "title", "text" and
        "published".
    company : str
        Company label stored with the article.

    Returns
    -------
    list
        ``[_id, company, title, cleaned_text, timestamp]`` where
        ``cleaned_text`` is "text" without ``<p>``/``</p>`` tags and
        ``timestamp`` is "published" parsed into a ``datetime``.

    Raises
    ------
    KeyError
        If one of the expected keys is missing from the JSON object.
    ValueError
        If the file is not valid JSON, or "published" does not match the
        ``%Y-%m-%dT%H:%M:%S.%fZ`` layout.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() uses the platform
    # locale, so the same files could fail or decode differently elsewhere.
    with open(file_path, "r", encoding="utf-8") as f:
        json_obj = json.load(f)

    _id = json_obj["_id"]
    title = json_obj["title"]
    # Only the literal <p> and </p> tags are removed; other markup is kept.
    cleaned_text = re.sub(r'<p>|</p>', '', json_obj["text"])
    # The trailing "Z" is matched as a plain character, so the parsed value is
    # a naive datetime (no tzinfo attached).
    timestamp = datetime.strptime(json_obj["published"], '%Y-%m-%dT%H:%M:%S.%fZ')
    return [_id, company, title, cleaned_text, timestamp]


# One row per article: [id, company, title, text, published].
data = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# (and therefore everything derived from it) reproducible between runs.
for subdir in sorted(os.listdir(directory)):
    subdir_path = os.path.join(directory, subdir)
    if not os.path.isdir(subdir_path):
        continue
    for filename in sorted(os.listdir(subdir_path)):
        file_path = os.path.join(subdir_path, filename)
        if os.path.isfile(file_path) and filename.endswith(".json"):
            # The directory name doubles as the company name.
            data.append(_load_article(file_path, subdir))

df = pd.DataFrame(data, columns=["id", "company", "title", "text", "published"])

In [54]:
# Order the rows by company and then by publication time, and keep only the
# first row for every distinct article body.
# NOTE(review): duplicates are detected on 'text' alone, so an article stored
# under several companies survives only under the first company in sort
# order -- TODO confirm this is intended rather than per-company de-duplication.
df = (
    df
    .sort_values(by=['company', 'published'])
    .drop_duplicates(subset=['text'], keep='first')
)

In [55]:
# Renumber the rows 0..n-1 (the old index is discarded) and remove the
# article 'id' column.
df = df.reset_index(drop=True).drop(columns=['id'])

In [56]:
# Preview the article frame as it stands after the sorting, text-based
# de-duplication and 'id' removal done in the preceding cells.
# NOTE(review): the original note here read "delete duplicates"; rows that look
# identical in the truncated preview presumably differ somewhere in their full
# text, since exact-text duplicates were already dropped -- TODO confirm.
df

Unnamed: 0,company,title,text,published
0,3M_Company,Investors look past the storming of US Capitol,"For cold-blooded markets, the big political ev...",2021-01-09 05:00:26.214
1,3M_Company,EXCHANGE --- Business News: Firms Team Up on V...,The agreement is part of the federal governmen...,2021-01-16 00:00:00.000
2,3M_Company,EXCHANGE --- Business News: Firms Team Up o...,Medical-gear makers are sharing production ...,2021-01-16 07:07:00.000
3,3M_Company,"Pandemic price rises still rampant on Amazon, ...",Hundreds of the essential products that have c...,2021-01-22 04:00:11.785
4,3M_Company,"Vietnam congress, Fed meeting, tech reports, D...",<em>This week brings two big gatherings with t...,2021-01-24 17:16:44.025
...,...,...,...,...
74404,Walmart_Inc,A New Year's Resolution for Stemming Retail's ...,Retailers can expect costs including customer ...,2022-12-26 12:00:00.000
74405,Walmart_Inc,A New Year's Resolution for Stemming Retail's ...,Retailers can expect costs including customer ...,2022-12-26 12:00:00.000
74406,Walmart_Inc,George Santos Loses Some GOP Support on Long I...,Nassau County Republican Chairman Joe Cairo sa...,2022-12-30 22:50:00.000
74407,Walmart_Inc,U.S. News: Republicans in Home District Seek D...,Nassau County Republican Chairman Joe Cairo sa...,2022-12-31 00:00:00.000


In [58]:
# Lookup from company name (as used in the 'company' column) to its stock
# ticker symbol.
company_to_ticker = dict([
    ('3M_Company', 'MMM'),
    ('American_Express_co', 'AXP'),
    ('Amgen_Inc', 'AMGN'),
    ('Apple_Inc', 'AAPL'),
    ('Boeing_Co', 'BA'),
    ('Caterpillar_Inc', 'CAT'),
    ('Chevron_Corporation', 'CVX'),
    ('Cisco_Systems_Inc', 'CSCO'),
    ('Coca_Cola_Co', 'KO'),
    ('Dow_Inc', 'DOW'),
    ('Goldman_Sachs_Group_Inc', 'GS'),
    ('Home_Depot_Inc', 'HD'),
    ('Honeywell_International_Inc', 'HON'),
    ('Intel_Corporation', 'INTC'),
    ('International_Business_Machines_Corporation', 'IBM'),
    ('JPMorgan_Chase_Co', 'JPM'),
    ('Johnson_Johnson', 'JNJ'),
    ('McDonald_s_Corporation', 'MCD'),
    ('Merck_Co_Inc', 'MRK'),
    ('Microsoft_Corporation', 'MSFT'),
    ('Nike_Inc', 'NKE'),
    ('Procter_Gamble_Co', 'PG'),
    ('Salesforce_Inc', 'CRM'),
    ('The_Walt_Disney_Company', 'DIS'),
    ('Travelers_Companies_Inc', 'TRV'),
    ('Unitedhealth_Group_Incorporated', 'UNH'),
    ('Verizon_communications_Inc', 'VZ'),
    ('Visa_Inc', 'V'),
    ('Walgreens_Boots_Alliance_Inc', 'WBA'),
    ('Walmart_Inc', 'WMT'),
])

# Swap the company names for their tickers; a name with no entry in the
# mapping is left unchanged.
df['company'] = df['company'].replace(to_replace=company_to_ticker)

['3M_Company' 'American_Express_co' 'Amgen_Inc' 'Apple_Inc' 'Boeing_Co'
 'Caterpillar_Inc' 'Chevron_Corporation' 'Cisco_Systems_Inc'
 'Coca_Cola_Co' 'Dow_Inc' 'Goldman_Sachs_Group_Inc' 'Home_Depot_Inc'
 'Honeywell_International_Inc' 'Intel_Corporation'
 'International_Business_Machines_Corporation' 'JPMorgan_Chase_Co'
 'Johnson_Johnson' 'McDonald_s_Corporation' 'Merck_Co_Inc'
 'Microsoft_Corporation' 'Nike_Inc' 'Procter_Gamble_Co' 'Salesforce_Inc'
 'The_Walt_Disney_Company' 'Travelers_Companies_Inc'
 'Unitedhealth_Group_Incorporated' 'Verizon_communications_Inc' 'Visa_Inc'
 'Walgreens_Boots_Alliance_Inc' 'Walmart_Inc']


In [60]:
# Independent (deep) copy of the article frame used for the date summary.
df_date = df.copy(deep=True)

# Earliest and latest 'published' value for every company.
grouped = (
    df_date
    .groupby(by='company')['published']
    .agg(['min', 'max'])
)

In [61]:
# Display the per-company earliest ('min') and latest ('max') publication
# timestamps.
grouped

Unnamed: 0_level_0,min,max
company,Unnamed: 1_level_1,Unnamed: 2_level_1
AAPL,2021-01-01 00:03:33.567,2022-12-31 20:15:00.000
AMGN,2021-02-11 00:00:00.000,2022-12-31 00:00:00.000
AXP,2021-01-02 00:00:00.000,2022-12-29 21:40:00.000
BA,2021-01-01 21:00:23.506,2022-12-31 00:00:00.000
CAT,2021-01-06 21:26:48.288,2022-12-18 07:00:32.090
CRM,2021-01-03 05:00:19.540,2022-12-30 13:00:00.000
CSCO,2021-01-02 00:00:00.000,2022-12-17 07:01:00.000
CVX,2021-01-01 05:00:52.021,2022-12-31 11:25:00.000
DIS,2021-01-02 00:00:00.000,2022-12-31 00:00:00.000
DOW,2021-01-15 23:08:00.000,2022-12-30 00:59:00.000
