# AS12
## Tianyi Zhu
#### Citation: Code structures and functions are get help from Chatgpt

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
import json
import requests
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import zipfile
import os
import re
from tqdm import tqdm

In [3]:
current_directory = os.getcwd()
print(current_directory)

/Users/zhutianyi/Desktop/OneDrive - Georgia Institute of Technology/24 Fall/AI/AS12


In [4]:
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x16aa8ab90>

In [5]:
company_file = "sample_company.csv"
description_zip = "business_description_10K.zip"
msf_file = "MSF_1996_2023.csv"

# Step 1
## Data Processing

In [None]:
import shutil
import os

# Define the path of the zip file and the extraction directory
zip_file = "business_description_10K.zip"
extraction_path = "business_description_10K"

# Create the extraction directory if it doesn't exist
os.makedirs(extraction_path, exist_ok=True)

try:
    shutil.unpack_archive(zip_file, extraction_path)
    print("Extraction successful!")

    # List the first few files to verify extraction
    extracted_files = os.listdir(extraction_path)
    print("Extracted Files:", extracted_files[:5])
except Exception as e:
    print("Extraction failed:", e)

Extraction successful!
Extracted Files: ['10K_item1_PERMNO']


In [6]:
# Load sample_company.csv
sample_df = pd.read_csv(company_file)
print("Sample Company Data:")
print(sample_df.head())

# Load MSF data
msf_df = pd.read_csv(msf_file)
print("MSF Data:")
print(msf_df.head())

Sample Company Data:
   label  year                                             PERMNO
0      0  1996  [11614, 76862, 77056, 82225, 20598, 16468, 762...
1      0  1997  [12006, 67619, 54981, 16468, 84422, 85213, 116...
2      0  1998  [20598, 76950, 77730, 76264, 85982, 84422, 117...
3      0  1999  [77730, 85037, 85982, 85951, 77099, 51190, 839...
4      0  2000  [85213, 85982, 87021, 12282, 77099, 11614, 164...
MSF Data:
   PERMNO        date  SHRCD SICCD TICKER           COMNAM  PERMCO     CUSIP  \
0   10001  1996-01-31     11  4920   EWST  ENERGY WEST INC    7953  36720410   
1   10001  1996-02-29     11  4920   EWST  ENERGY WEST INC    7953  36720410   
2   10001  1996-03-29     11  4920   EWST  ENERGY WEST INC    7953  36720410   
3   10001  1996-04-30     11  4920   EWST  ENERGY WEST INC    7953  36720410   
4   10001  1996-05-31     11  4920   EWST  ENERGY WEST INC    7953  36720410   

   BIDLO  ASKHI      PRC    VOL        RET    BID   ASK  SHROUT       RETX  \
0  8.750  9.50

In [None]:
# Initialize an empty list to store data
business_data = []

json_dir = "business_description_10K/10K_item1_PERMNO"

# Loop through each JSON file in the directory
for file_name in tqdm(os.listdir(json_dir), desc="Reading JSON files"):
    if file_name.endswith('.json'):
        file_path = os.path.join(json_dir, file_name)
        
        with open(file_path, 'r') as file:
            try:
                data = json.load(file)
                # Extract the relevant information
                permno = data.get("PERMNO", None)
                filing_date = data.get("filing_date", None)
                description = data.get("item_1", "") 
                
                # Append the data to the list
                business_data.append({
                    "PERMNO": permno,
                    "filing_date": filing_date,
                    "description": description
                })
            except json.JSONDecodeError:
                print(f"Error reading {file_name}. Skipping.")

# Convert the list of dictionaries to a DataFrame
business_df = pd.DataFrame(business_data)

Reading JSON files: 100%|██████████| 110294/110294 [00:39<00:00, 2795.58it/s]


In [11]:
# Function to clean the business description text
def clean_description(text):
    if not isinstance(text, str):
        return ""
    
    # Remove the "ITEM 1. BUSINESS" header (case insensitive)
    text = re.sub(r'item\s*1[\.:]?\s*business', '', text, flags=re.IGNORECASE)
    
    # Remove extra new lines and whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply the cleaning function to the description column
business_df['description'] = business_df['description'].apply(clean_description)

# Display the cleaned DataFrame
print("Cleaned Business Data:")
print(business_df.head())

Cleaned Business Data:
   PERMNO filing_date                                        description
0   89526  2003-03-28  A. General Development of Business First Natio...
1   82587  2020-02-12  Overview Intevac’s business consists of two re...
2   12380  2009-09-24  Summary of Business Park City Group, Inc. (the...
3   90024  2006-04-14  . General We are a national, mall-based, speci...
4   76732  2021-03-02  Dine Brands Global, Inc.SM, together with its ...


In [12]:
business_df

Unnamed: 0,PERMNO,filing_date,description
0,89526,2003-03-28,A. General Development of Business First Natio...
1,82587,2020-02-12,Overview Intevac’s business consists of two re...
2,12380,2009-09-24,"Summary of Business Park City Group, Inc. (the..."
3,90024,2006-04-14,". General We are a national, mall-based, speci..."
4,76732,2021-03-02,"Dine Brands Global, Inc.SM, together with its ..."
...,...,...,...
110289,87236,2000-03-22,. OVERVIEW InterNAP Network Services Corporati...
110290,84331,2014-03-18,BUSINESS GENERAL River Valley Bancorp (the “Ho...
110291,77652,1996-09-17,". The Rival Company, the registrant, together ..."
110292,86248,2008-09-29,"Overview In this Annual Report on Form 10-K, w..."


In [13]:
business_df.to_csv('business_des.csv')

In [15]:
business_df['year'] = pd.to_datetime(business_df['filing_date']).dt.year

In [28]:
sample_company_path = 'sample_company.csv'
sample_df = pd.read_csv(sample_company_path)
sample_df

Unnamed: 0,label,year,PERMNO
0,0,1996,"[11614, 76862, 77056, 82225, 20598, 16468, 762..."
1,0,1997,"[12006, 67619, 54981, 16468, 84422, 85213, 116..."
2,0,1998,"[20598, 76950, 77730, 76264, 85982, 84422, 117..."
3,0,1999,"[77730, 85037, 85982, 85951, 77099, 51190, 839..."
4,0,2000,"[85213, 85982, 87021, 12282, 77099, 11614, 164..."
...,...,...,...
275,9,2019,"[14675, 16181, 14312, 19024, 15902, 14792, 187..."
276,9,2020,"[17868, 17034, 15272, 17958, 17122, 16308, 137..."
277,9,2021,"[20344, 14422, 21708, 20614, 15417, 16974, 155..."
278,9,2022,"[20640, 18205, 14451, 20341, 21842, 19135, 184..."


In [None]:
import ast

sample_df['PERMNO'] = sample_df['PERMNO'].apply(ast.literal_eval)

sample_df = sample_df.explode('PERMNO')
sample_df['PERMNO'] = pd.to_numeric(sample_df['PERMNO'], errors='coerce')

sample_df.dropna(subset=['PERMNO', 'year'], inplace=True)
sample_df['PERMNO'] = sample_df['PERMNO'].astype(int)
sample_df['year'] = sample_df['year'].astype(int)

In [31]:
merged_df = pd.merge(business_df, sample_df[['PERMNO', 'year', 'label']], on=['PERMNO', 'year'], how='inner')
merged_df

Unnamed: 0,PERMNO,filing_date,description,year,label
0,75517,2003-03-28,General Development of Business Marten Transpo...,2003,4
1,79824,2021-02-22,"General First Financial Bankshares, Inc., a Te...",2021,7
2,82746,1999-03-25,Overview Synaptic Pharmaceutical Corporation (...,1999,9
3,33420,1999-03-29,GENERAL DEVELOPMENT OF BUSINESS Equity Oil Com...,1999,1
4,76950,2014-02-26,"Company Background We design, market, and dist...",2014,0
...,...,...,...,...,...
1936,11955,1998-03-31,". GENERAL USA Waste Services, Inc. (""USA Waste...",1998,9
1937,20598,2014-07-28,"Our Business Cal-Maine Foods, Inc. (“we,” “us,...",2014,0
1938,78963,2015-03-09,"Overview Chico’s FAS, Inc.1, is a cultivator o...",2015,6
1939,90238,2013-03-15,Overview We are a leading company using propri...,2013,8


## Tokenize business description

In [None]:
from transformers import AutoTokenizer

# Initialize the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Define tokenizer settings
tokenizer_settings = {
    'truncation': True,
    'padding': 'max_length',
    'max_length': 512,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'return_tensors': 'pt'
}

# Tokenize the business descriptions
descriptions = merged_df['description'].tolist()
tokenized_data = tokenizer(descriptions, **tokenizer_settings)

# Display the tokenized output example
print("Tokenized data keys:", tokenized_data.keys())
print("Example input_ids:", tokenized_data['input_ids'][:1])
print("Example attention_mask:", tokenized_data['attention_mask'][:1])

Tokenized data keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
Example input_ids: tensor([[  101,  2236,  2458,  1997,  2449, 20481,  2368,  3665,  1010,  5183,
          1012,  2003,  1037,  2146,  1011, 14655,  4744, 11066,  6839,  4346,
          9474,  2326,  5193,  1998,  2051,  1011,  7591,  5193,  1012,  1523,
          9474,  2326,  5193,  1524,  2965,  4860,  4758,  2030, 16021,  8898,
          9118,  1997,  4860,  1011,  7591,  4475,  1998,  2236, 21955,  1012,
          2057,  2031,  4082,  3691,  1010,  2119,  3206,  1998,  2691,  1010,
          4379,  2011,  1996,  7553,  6236,  3222,  1010,  2030, 16461,  1010,
          1998,  2024,  2747, 12222,  2011,  1996,  2142,  2163,  2533,  1997,
          5193,  1010,  2030, 11089,  1010,  1998,  1996,  2976,  3307,  3447,
          1010,  2030,  1042, 18663,  1012,  2004,  1997,  2285,  2861,  1010,
          2526,  1010,  2057,  3498,  1037,  4170,  1997,  1016,  1010,  5718,
          2620, 28292,  1998, 

# Split dataset into train, validation and test

In [33]:
from sklearn.model_selection import train_test_split

# Split the merged DataFrame into train, validation, and test sets (80%, 10%, 10%)
train_df, temp_df = train_test_split(merged_df, test_size=0.2, random_state=42, stratify=merged_df['label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

# Display the sizes of each split
print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

Train set size: 1552
Validation set size: 194
Test set size: 195


# Step 2: Fine-tunning BERT

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=10, 
    output_hidden_states=True  
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
