In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import yfinance as yf
from tqdm import tqdm

In [3]:
# Directory that holds every input CSV used by this notebook.
path = 'Input'

# Read the raw classification table (first CSV column is the row index),
# give the three remaining columns fixed names, and keep stock codes as
# strings so they can be concatenated and joined on later.
data = (
    pd.read_csv(f'{path}/Company Classification Original.csv', index_col=0)
    .set_axis(['Code', 'Name', 'Index'], axis=1)
    .astype({'Code': str})
)
data.head()

Unnamed: 0,Code,Name,Index
0,1101,台泥,TEJN01 傳產- 水泥
1,1102,亞泥,TEJN01 傳產- 水泥
2,1103,嘉泥,TEJN01 傳產- 水泥
3,1104,環泥,TEJN01 傳產- 水泥
4,1108,幸福,TEJN01 傳產- 水泥


In [4]:
# Break the combined 'Index' label (e.g. 'TEJN01 傳產- 水泥') into its two
# sector parts and then discard the combined column.
#
# The column is addressed by name rather than by position, and the whole step
# is skipped once 'Index' is gone, so re-running this cell leaves `data`
# unchanged instead of splitting whichever column happens to sit at position 2.
if 'Index' in data.columns:
    # Whitespace-separated pieces: [0] index code, [1] broad sector (with a
    # trailing '-'), [2] sub-sector.
    split_data = data['Index'].str.strip().str.split(r'\s+', expand=True)
    # Keep only the text before the '-' for the broad sector.
    data['Sector1'] = split_data[1].str.split('-', expand=True)[0]
    data['Sector2'] = split_data[2]
    data = data.drop(columns=['Index'])
data.head()

Unnamed: 0,Code,Name,Sector1,Sector2
0,1101,台泥,傳產,水泥
1,1102,亞泥,傳產,水泥
2,1103,嘉泥,傳產,水泥
3,1104,環泥,傳產,水泥
4,1108,幸福,傳產,水泥


In [4]:
# Look up the Yahoo Finance sector for every company and store it in
# data['Sector']. Each code is queried with the '.TW' suffix appended.
for index, row in tqdm(data.iterrows(), total=data.shape[0]):
    code = row['Code'] + '.TW'
    # Deliberately not named `ticker`: that name is the `matplotlib.ticker`
    # module alias imported at the top of the notebook, and rebinding it here
    # would break any later use of the module.
    stock = yf.Ticker(code)
    try:
        sector = stock.info['sector']
    except KeyError:
        # The info mapping has no 'sector' entry for this symbol.
        sector = 'Unknown'
    except Exception as e:
        # Best-effort lookup: report the failure and continue with the
        # placeholder so one bad symbol does not abort the whole run.
        print(f"Error processing {code}: {e}")
        sector = 'Unknown'
    data.at[index, 'Sector'] = sector

  0%|          | 0/997 [00:00<?, ?it/s]

100%|██████████| 997/997 [02:59<00:00,  5.55it/s]


In [23]:
# Load the delisting table, indexed by its parsed 'Date' column. Codes are
# converted to strings so they are the same type as data['Code'] when the two
# tables are joined.
delist = pd.read_csv(
    f'{path}/Company Delisting.csv',
    index_col='Date',
    parse_dates=True,
).astype({'Code': str})
delist.head()

Unnamed: 0_level_0,Company,Code,Sector
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-12-15,亞太電,3682,Communication Services
2023-06-26,誠創,3536,Technology
2022-12-21,泰昇-KY,8480,Financial Services
2022-09-26,新世紀,3383,Technology
2022-09-26,華上,6289,Technology


In [32]:
# Combine the looked-up listings with the delisting table. The outer join keeps
# codes that appear in only one of the two tables; the overlapping 'Sector'
# columns come back as 'Sector_x' (from `data`) and 'Sector_y' (from `delist`).
merge = pd.merge(data, delist, on='Code', how='outer')

listed_sector = merge['Sector_x']
delisted_sector = merge['Sector_y']
# 'Unknown' is the placeholder the sector lookup writes when it finds nothing.
# A plain fillna() would treat it as real data and ignore the delisting table,
# so let 'Unknown' defer to the delisting sector when one exists (and stay
# 'Unknown' otherwise). Genuinely missing values are then filled as before.
combined_sector = listed_sector.where(
    listed_sector.ne('Unknown'),
    delisted_sector.fillna('Unknown'),
).fillna(delisted_sector)

df = pd.DataFrame({
    'Code': merge['Code'],
    'Sector': combined_sector,
    'Name': merge['Name'].fillna(merge['Company']),
})
# Shorten two of the sector labels.
df['Sector'] = df['Sector'].replace({'Technology': 'Electronic', 'Financial Services': 'Financial'})
# Persist the final classification next to the input files.
df.to_csv(f'{path}/Company Classification.csv', index=False)
df.head()

Unnamed: 0,Code,Sector,Name
0,1101,Basic Materials,台泥
1,1102,Basic Materials,亞泥
2,1103,Basic Materials,嘉泥
3,1104,Basic Materials,環泥
4,1108,Basic Materials,幸福


In [37]:
# Number of companies per sector, most common first.
df.loc[:, 'Sector'].value_counts()

Sector
Electronic                462
Industrials               203
Consumer Cyclical         177
Basic Materials           105
Financial                  60
Healthcare                 46
Consumer Defensive         43
Real Estate                42
Communication Services     11
Utilities                   7
Energy                      6
Name: count, dtype: int64