In [1]:
# Synthetic users whose SDG preferences are drawn from normal distributions;
# the mean of each draw depends on the user's profile and on whether the SDG
# belongs to the E, S or G group.
# Not used by the current pipeline.

import numpy as np
import pandas as pd

# ---------- Settings ----------
n_users = 2000
sigma = 0.25
rng = np.random.default_rng(42)

# 0-based SDG column indices for each group
E = [5, 6, 10, 11, 12, 13, 14]   # SDG 6, 7, 11, 12, 13, 14, 15
S = [0, 1, 2, 3, 4, 7, 9, 16]    # SDG 1, 2, 3, 4, 5, 8, 10, 17
G = [8, 15]                      # SDG 9, 16

# ---------- 1. Profile of every user (E-heavy, S-heavy, Balanced, G-heavy) ----------
user_types = rng.choice(
    ["E", "S", "B", "G"],
    size=n_users,
    p=[0.5, 0.3, 0.15, 0.05],
)

# ---------- 2. Full SDG preference matrix (values in 0..1) ----------
# Mean preference for the (E, S, G) groups, keyed by user profile.
profile_means = {
    "E": (0.8, 0.6, 0.4),
    "S": (0.6, 0.8, 0.4),
    "B": (0.65, 0.65, 0.65),
    "G": (0.5, 0.5, 0.8),
}

prefs = np.zeros((n_users, 17))

for user_pos, profile in enumerate(user_types):
    mean_e, mean_s, mean_g = profile_means[profile]

    raw = np.zeros(17)

    # Draw E, then S, then G: the order fixes which random numbers each
    # group receives from the seeded generator.
    raw[E] = rng.normal(mean_e, sigma, len(E))
    raw[S] = rng.normal(mean_s, sigma, len(S))
    raw[G] = rng.normal(mean_g, sigma, len(G))

    # Clip to [0, 1], snap to a 1..5 Likert score, then rescale to
    # {0, 0.25, 0.5, 0.75, 1.0}; e.g. raw < 0.125 -> Likert 1 -> 0.0.
    likert = np.round(np.clip(raw, 0, 1) * 4 + 1)
    prefs[user_pos] = (likert - 1) / 4

# ---------- 3. Risk tag (0 / 1 / 2 = risk-seeking / neutral / conservative) ----------
risk = rng.choice(
    [0, 1, 2],
    size=n_users,
    p=[0.3, 0.4, 0.3],  # 30% risk-seeking, 40% neutral, 30% conservative
)

# ---------- 4. Assemble the DataFrame ----------
columns = [*(f"sdg {k}" for k in range(1, 18)), "risk tag"]
df_users_1 = pd.DataFrame(
    np.hstack([prefs, risk.reshape(-1, 1)]),
    index=[f"user{k}" for k in range(1, n_users + 1)],  # row labels user1 .. user2000
    columns=columns,
)

print(df_users_1.head())


       sdg 1  sdg 2  sdg 3  sdg 4  sdg 5  sdg 6  sdg 7  sdg 8  sdg 9  sdg 10  \
user1   0.75   0.75   0.50   1.00   0.75   0.25   0.75   0.75   0.25    0.75   
user2   0.75   0.75   0.50   0.50   0.50   0.75   1.00   0.50   0.25    0.50   
user3   0.75   0.75   0.50   1.00   0.50   0.25   0.75   0.50   0.25    0.75   
user4   0.75   1.00   0.75   0.75   0.75   0.50   0.25   1.00   0.25    1.00   
user5   1.00   0.50   0.75   0.50   0.75   0.75   0.50   0.75   0.25    0.50   

       sdg 11  sdg 12  sdg 13  sdg 14  sdg 15  sdg 16  sdg 17  risk tag  
user1    0.25    0.50    0.50    0.75    0.50    0.25    1.00       0.0  
user2    0.50    1.00    0.25    0.75    1.00    0.25    0.75       1.0  
user3    0.75    0.75    1.00    0.75    0.75    1.00    0.75       1.0  
user4    1.00    0.75    0.25    1.00    0.75    0.50    0.75       1.0  
user5    0.50    1.00    0.25    0.75    0.75    0.50    0.75       1.0  


In [2]:
# Synthetic users sampled uniformly at random, ignoring any E/S/G structure.
# This is the user table currently in use.

import numpy as np
import pandas as pd

n_users = 2000
rng = np.random.default_rng(42)

# 1) The five Likert answers rescaled to 0..1: 0, 0.25, 0.5, 0.75, 1
likert_values = np.arange(5) / 4

# 2) One uniformly chosen Likert value per user and SDG (17 SDGs)
prefs = rng.choice(likert_values, size=(n_users, 17))

# 3) Risk tag chosen uniformly from 0, 1, 2
risk = rng.choice([0, 1, 2], size=n_users, p=[1/3] * 3)

# 4) Assemble the DataFrame, labelling rows user1 .. user{n_users}
columns = [*(f"sdg {k}" for k in range(1, 18)), "risk tag"]
df_users = pd.DataFrame(
    np.hstack([prefs, risk.reshape(-1, 1)]),
    index=[f"user{k}" for k in range(1, n_users + 1)],
    columns=columns,
)

df_users.head()


Unnamed: 0,sdg 1,sdg 2,sdg 3,sdg 4,sdg 5,sdg 6,sdg 7,sdg 8,sdg 9,sdg 10,sdg 11,sdg 12,sdg 13,sdg 14,sdg 15,sdg 16,sdg 17,risk tag
user1,0.0,0.75,0.75,0.5,0.5,1.0,0.0,0.75,0.25,0.0,0.5,1.0,0.75,0.75,0.75,0.75,0.5,1.0
user2,0.0,1.0,0.5,0.5,0.25,0.0,1.0,0.75,0.75,0.5,1.0,0.5,0.5,0.5,0.25,0.0,0.5,0.0
user3,1.0,0.0,1.0,1.0,0.25,0.75,0.0,0.75,0.75,0.25,0.0,1.0,0.5,1.0,0.75,0.75,0.75,0.0
user4,0.0,0.25,0.5,0.5,0.0,0.5,0.0,0.75,0.75,1.0,0.75,0.25,1.0,0.5,0.25,1.0,0.25,2.0
user5,0.0,0.5,0.75,0.0,0.5,0.0,0.75,0.5,0.25,0.25,0.5,0.75,1.0,0.5,0.0,1.0,0.75,1.0


In [3]:
# Load the company dataset and work on a copy so `cor` keeps the raw file contents.
cor = pd.read_csv("C:/Users/tkdgj/251113_dataset_with_tag.csv")
df_company = cor.copy()

# 1) One score per SDG: mentions per 1k tokens multiplied by mean sentiment.
for sdg_no in range(1, 18):
    prefix = f"G{sdg_no:02d}"
    df_company[f"SDG{sdg_no}_score"] = df_company[f"{prefix}_mentions_per_1k_tokens"].mul(
        df_company[f"{prefix}_sent_mean"]
    )

# 2) Encode the textual risk tag as a number (labels are stripped of
#    surrounding whitespace first; labels missing from risk_map become NaN).
risk_map = {"안전": 2, "중립": 1, "위험": 0}

risk_labels = df_company["Risk_Tag"].astype(str).str.strip()
df_company["Risk_Tag"] = risk_labels.map(risk_map)

In [4]:
# Column selections: company name + 17 SDG scores + risk tag, with and without corp_code.
sdg_cols = [f"SDG{n}_score" for n in range(1, 18)]
keep_cols = ["company_name", *sdg_cols, "Risk_Tag"]
keep_cols2 = ["company_name", "corp_code", *sdg_cols, "Risk_Tag"]

df_company_clean = df_company.loc[:, keep_cols].copy()

# corp_code is rendered as text and left-padded with zeros to 6 characters.
df_company_clean2 = df_company.loc[:, keep_cols2].assign(
    corp_code=lambda frame: frame["corp_code"].astype(str).str.zfill(6)
)

In [5]:
# Listed-company table: keep a zero-padded 6-character corp_code built from
# the '종목코드' column together with the '업종' (industry) column.
corp_indust = (
    pd.read_csv("C:/Users/tkdgj/Downloads/상장법인목록.csv", encoding='cp949')
    .assign(corp_code=lambda frame: frame['종목코드'].astype(str).str.zfill(6))
    .loc[:, ['corp_code', '업종']]
)

# Attach the industry label to every company row (left join on corp_code).
industry = pd.merge(df_company_clean2, corp_indust, how='left', on='corp_code')

In [6]:
# Industry labels grouped by the coarse sector they are assigned to.
sector6_groups = {
    # Manufacturing
    "제조업": [
        "기타 식품 제조업", "의약품 제조업", "합성고무 및 플라스틱 물질 제조업",
        "화학섬유 제조업", "일차전지 및 이차전지 제조업", "1차 철강 제조업",
        "알코올음료 제조업", "기타 화학제품 제조업", "고무제품 제조업",
        "봉제의복 제조업", "일반 목적용 기계 제조업", "시멘트, 석회, 플라스터 및 그 제품 제조업",
        "기초 화학물질 제조업", "전자부품 제조업", "항공기,우주선 및 부품 제조업",
        "철도장비 제조업", "자동차 신품 부품 제조업", "플라스틱제품 제조업",
        "특수 목적용 기계 제조업", "전동기, 발전기 및 전기 변환 · 공급 · 제어 장치 제조업",
        "펄프, 종이 및 판지 제조업", "기초 의약물질 제조업", "통신 및 방송 장비 제조업",
        "선박 및 보트 건조업", "담배 제조업", "반도체 제조업", "1차 비철금속 제조업",
    ],
    # Finance
    "금융업": [
        "기타 금융업", "금융 지원 서비스업", "은행 및 저축기관", "보험업",
    ],
    # Information & communication
    "정보통신": [
        "소프트웨어 개발 및 공급업", "컴퓨터 프로그래밍, 시스템 통합 및 관리업",
        "자료처리, 호스팅, 포털 및 기타 인터넷 정보매개 서비스업", "전기 통신업",
        "자연과학 및 공학 연구개발업",
    ],
    # Services & distribution
    "서비스·유통": [
        "종합 소매업", "무점포 소매업", "가전제품 및 정보통신장비 소매업",
        "상품 종합 도매업", "기타 전문 도매업", "생활용품 도매업",
        "여행사 및 기타 여행보조 서비스업", "항공 여객 운송업",
        "기타 운송관련 서비스업", "운송장비 임대업",
    ],
    # Construction & real estate
    "건설·부동산": [
        "토목 건설업", "건물 건설업",
        "건축기술, 엔지니어링 및 관련 기술 서비스업",
        "전기 및 통신 공사업", "부동산 임대 및 공급업",
    ],
    # Energy & utilities
    "에너지·유틸리티": [
        "연료용 가스 제조 및 배관공급업", "석유 정제품 제조업", "전기업",
    ],
}

# Flat lookup: industry label -> sector.
sector6_map = {
    industry_label: sector
    for sector, industry_labels in sector6_groups.items()
    for industry_label in industry_labels
}

# Industries absent from the lookup are labelled "기타/미분류".
industry['sector6'] = industry['업종'].map(sector6_map).fillna("기타/미분류")
industry['sector6'].value_counts()

sector6
제조업         70
금융업         32
서비스·유통      15
정보통신        11
건설·부동산       7
에너지·유틸리티     3
기타/미분류       3
Name: count, dtype: int64

In [7]:
# Manual sector assignments for individual companies, applied by company name.
sector6_overrides = {
    '고려아연': '제조업',
    '지역난방공사': '에너지·유틸리티',
    '롯데웰푸드': '서비스·유통',
}
for firm, sector in sector6_overrides.items():
    industry.loc[industry['company_name'] == firm, 'sector6'] = sector

industry['sector6'].value_counts()


sector6
제조업         71
금융업         32
서비스·유통      16
정보통신        11
건설·부동산       7
에너지·유틸리티     4
Name: count, dtype: int64

In [8]:
def merge_infra(x):
    """Collapse the two infrastructure-like sectors into "인프라".

    Returns "인프라" when ``x`` is "건설·부동산" or "에너지·유틸리티";
    any other value is returned unchanged.
    """
    return "인프라" if x in ("건설·부동산", "에너지·유틸리티") else x

# Derive the 5-sector label from sector6 by passing every value through merge_infra.
industry["sector5"] = industry["sector6"].map(merge_infra)
industry["sector5"].value_counts()


sector5
제조업       71
금융업       32
서비스·유통    16
인프라       11
정보통신      11
Name: count, dtype: int64

In [9]:
# Index the frame by company name and drop the helper columns.
# The guards make the cell safe to re-run: once company_name is the index and
# the helper columns are gone, an unguarded set_index/drop raises KeyError.
if "company_name" in industry.columns:
    industry = industry.set_index("company_name")
elif industry.index.name != "company_name":
    # Neither a column nor the index: keep failing loudly, as set_index would.
    raise KeyError("company_name")
industry = industry.drop(columns=["업종", "sector6"], errors="ignore")
sdg_cols = [f"SDG{i}_score" for i in range(1, 18)]
import numpy as np

# Z-score of every SDG score within its sector5 group.
# ddof=0 uses the population standard deviation; the 1e-8 term keeps the
# division finite when a group has zero spread.
z_df = industry.groupby("sector5")[sdg_cols].transform(
    lambda x: (x - x.mean()) / (x.std(ddof=0) + 1e-8)
)

# Attach the z-scores to the frame as SDG1_score_z, ..., SDG17_score_z.
for col in sdg_cols:
    industry[col + "_z"] = z_df[col]

# Min-max scale every z-score column to [0, 1]. The min and max are taken
# over all companies, not per sector.
for col in sdg_cols:
    zcol = col + "_z"
    ncol = col + "_norm"

    col_min = industry[zcol].min()
    col_max = industry[zcol].max()

    # Avoid dividing by zero when every value in the column is identical.
    if col_max - col_min == 0:
        industry[ncol] = 0.5   # constant column -> fixed 0.5 (0 would be an alternative)
    else:
        industry[ncol] = (industry[zcol] - col_min) / (col_max - col_min)



In [10]:
# Move the sector5 column to the far right, keeping every other column in place.
col = "sector5"
new_order = [name for name in industry.columns if name != col]
new_order.append(col)
industry = industry.loc[:, new_order]
industry

Unnamed: 0_level_0,corp_code,SDG1_score,SDG2_score,SDG3_score,SDG4_score,SDG5_score,SDG6_score,SDG7_score,SDG8_score,SDG9_score,...,SDG9_score_norm,SDG10_score_norm,SDG11_score_norm,SDG12_score_norm,SDG13_score_norm,SDG14_score_norm,SDG15_score_norm,SDG16_score_norm,SDG17_score_norm,sector5
company_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
한국가스공사,036460,0.797814,0.010089,1.399328,0.128466,1.507148,0.857655,3.068082,0.766744,0.840807,...,0.315392,0.591365,0.376869,0.364195,0.563998,0.284765,0.487982,0.639305,0.229542,인프라
한국금융지주,071050,0.311616,0.032116,0.886480,0.135005,1.608665,0.136996,0.971525,0.258694,0.620390,...,0.358542,0.135564,0.409117,0.380391,0.216753,0.128644,0.299428,0.263599,0.152076,금융업
한국콜마,161890,0.320636,0.000000,1.295125,0.310995,1.926821,1.219584,2.059211,0.058638,0.244534,...,0.186344,0.127637,0.129641,0.586400,0.120650,0.202169,0.426859,0.333669,0.276350,제조업
한국타이어앤테크놀로지,161390,0.236174,0.076586,1.241747,0.806515,1.529499,0.607143,1.627691,0.073288,0.663464,...,0.409778,0.196710,0.470265,0.444835,0.148316,0.170834,0.402643,0.220573,0.176533,제조업
한독,002390,0.430277,0.058681,2.956586,0.542881,0.494303,1.104698,1.961131,0.058489,0.224071,...,0.175430,0.101079,0.307653,0.377247,0.167861,0.164448,0.309215,0.196520,0.348976,제조업
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
삼성화재,000810,0.870678,0.169495,1.518013,0.632016,0.822519,0.159941,1.572296,0.000906,0.613331,...,0.353765,0.462214,0.348549,0.394318,0.226133,0.579437,0.401595,0.255038,0.224198,금융업
삼양사,145990,0.343394,0.703633,0.819859,0.699965,0.591429,0.841591,1.567877,0.158236,0.282307,...,0.206490,0.167509,0.160413,0.466994,0.109594,0.196617,0.402641,0.141153,0.307289,제조업
삼양패키징,272550,0.397969,0.057223,1.192438,0.516899,0.577164,0.818137,2.025573,0.070682,0.309044,...,0.220750,0.150159,0.228326,0.744511,0.129176,0.171103,0.356174,0.207711,0.399363,제조업
삼양홀딩스,000070,0.383698,0.060432,1.817251,0.770838,0.493227,0.741822,1.315264,0.049989,0.252138,...,0.109364,0.268820,0.101283,0.754796,0.112849,0.291301,0.596206,0.188433,0.225419,금융업


In [11]:
# Index the company table by company name.
# Guarded so the cell can be re-run: once company_name is already the index,
# an unguarded set_index("company_name") raises KeyError.
if "company_name" in df_company_clean.columns:
    df_company_clean = df_company_clean.set_index("company_name")
elif df_company_clean.index.name != "company_name":
    # Neither a column nor the index: keep failing loudly, as set_index would.
    raise KeyError("company_name")
df_company_clean

Unnamed: 0_level_0,SDG1_score,SDG2_score,SDG3_score,SDG4_score,SDG5_score,SDG6_score,SDG7_score,SDG8_score,SDG9_score,SDG10_score,SDG11_score,SDG12_score,SDG13_score,SDG14_score,SDG15_score,SDG16_score,SDG17_score,Risk_Tag
company_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
한국가스공사,0.797814,0.010089,1.399328,0.128466,1.507148,0.857655,3.068082,0.766744,0.840807,0.877712,0.840780,1.240961,0.508511,1.911231,2.620563,5.282609,0.858825,0
한국금융지주,0.311616,0.032116,0.886480,0.135005,1.608665,0.136996,0.971525,0.258694,0.620390,0.228968,0.716199,0.307028,0.137265,0.345481,0.904003,1.640982,0.556785,1
한국콜마,0.320636,0.000000,1.295125,0.310995,1.926821,1.219584,2.059211,0.058638,0.244534,0.181525,0.067580,3.892290,0.020705,1.693374,2.497123,2.533604,0.851512,1
한국타이어앤테크놀로지,0.236174,0.076586,1.241747,0.806515,1.529499,0.607143,1.627691,0.073288,0.663464,0.293184,0.496314,2.972307,0.068558,1.247945,2.355460,1.687849,0.511464,2
한독,0.430277,0.058681,2.956586,0.542881,0.494303,1.104698,1.961131,0.058489,0.224071,0.138593,0.291639,2.533078,0.102364,1.157161,1.808905,1.507978,1.098929,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
삼성화재,0.870678,0.169495,1.518013,0.632016,0.822519,0.159941,1.572296,0.000906,0.613331,0.780681,0.601411,0.409946,0.143206,3.669336,1.253199,1.574602,0.819595,1
삼양사,0.343394,0.703633,0.819859,0.699965,0.591429,0.841591,1.567877,0.158236,0.282307,0.245981,0.106312,3.116314,0.001582,1.614454,2.355448,1.093937,0.956914,2
삼양패키징,0.397969,0.057223,1.192438,0.516899,0.577164,0.818137,2.025573,0.070682,0.309044,0.217932,0.191792,4.919797,0.035452,1.251770,2.083621,1.591667,1.270585,2
삼양홀딩스,0.383698,0.060432,1.817251,0.770838,0.493227,0.741822,1.315264,0.049989,0.252138,0.454037,0.132794,3.073727,0.071465,1.544803,1.918357,1.058138,0.824043,2


In [12]:

# --------------------------------------
# 1) Columns that make up the user and company vectors
# --------------------------------------
user_cols = [*(f"sdg {k}" for k in range(1, 18)), "risk tag"]
corp_cols = [*(f"SDG{k}_score" for k in range(1, 18)), "Risk_Tag"]

# --------------------------------------
# 2) Plain float arrays: one row per user / per company, one column per
#    entry of user_cols / corp_cols
# --------------------------------------
U = df_users.loc[:, user_cols].to_numpy(dtype=float)
C = df_company_clean.loc[:, corp_cols].to_numpy(dtype=float)

# --------------------------------------
# 3) Cosine similarity: scale every row to unit length, then take dot products
# --------------------------------------
U_norm = U / np.linalg.norm(U, axis=1, keepdims=True)
C_norm = C / np.linalg.norm(C, axis=1, keepdims=True)

sim_matrix = np.matmul(U_norm, C_norm.T)   # rows: users, columns: companies

# --------------------------------------
# 4) Indices of the 3 most similar (P) and 3 least similar (N) companies
# --------------------------------------
top_k = 3
n_users, n_companies = sim_matrix.shape
company_names = df_company_clean.index.to_numpy()

# Sorting the negated matrix ascending ranks companies by descending similarity.
pos_idx = np.argsort(-sim_matrix, axis=1)[:, :top_k]
# Ascending sort puts the least similar companies first.
neg_idx = np.argsort(sim_matrix, axis=1)[:, :top_k]

# Translate positions into company names, one row per user.
P_companies = np.take(company_names, pos_idx)
N_companies = np.take(company_names, neg_idx)

# --------------------------------------
# 5) Append P1, N1, P2, N2, P3, N3 to df_users
# --------------------------------------
for rank in range(1, top_k + 1):
    df_users[f"P{rank}"] = P_companies[:, rank - 1]
    df_users[f"N{rank}"] = N_companies[:, rank - 1]

df_users.head()


Unnamed: 0,sdg 1,sdg 2,sdg 3,sdg 4,sdg 5,sdg 6,sdg 7,sdg 8,sdg 9,sdg 10,...,sdg 15,sdg 16,sdg 17,risk tag,P1,N1,P2,N2,P3,N3
user1,0.0,0.75,0.75,0.5,0.5,1.0,0.0,0.75,0.25,0.0,...,0.75,0.75,0.5,1.0,코오롱ENP,애경산업,롯데웰푸드,한화손해보험,우리금융지주,현대엘리베이터
user2,0.0,1.0,0.5,0.5,0.25,0.0,1.0,0.75,0.75,0.5,...,0.25,0.0,0.5,0.0,미래에셋증권,애경산업,삼성생명,현대엘리베이터,NH투자증권,HL홀딩스
user3,1.0,0.0,1.0,1.0,0.25,0.75,0.0,0.75,0.75,0.25,...,0.75,0.75,0.75,0.0,현대오토에버,애경산업,현대로템,현대엘리베이터,KCC,SK디앤디
user4,0.0,0.25,0.5,0.5,0.0,0.5,0.0,0.75,0.75,1.0,...,0.25,1.0,0.25,2.0,삼성증권,한화손해보험,현대엘리베이터,두산퓨얼셀,우리금융지주,HD현대중공업
user5,0.0,0.5,0.75,0.0,0.5,0.0,0.75,0.5,0.25,0.25,...,0.0,1.0,0.75,1.0,제주항공,HD현대중공업,메리츠금융지주,애경산업,SK텔레콤,한화손해보험


In [13]:

# ======================================
# 1) Columns that make up the user and company vectors
#    (company side: sector-adjusted, min-max scaled SDG scores + risk tag)
# ======================================
user_cols = [f"sdg {i}" for i in range(1, 18)] + ["risk tag"]
corp_cols = [f"SDG{i}_score_norm" for i in range(1, 18)] + ["Risk_Tag"]

# ======================================
# 2) Convert to float numpy arrays
# ======================================
U = df_users[user_cols].to_numpy(float)   # one row per user
C = industry[corp_cols].to_numpy(float)   # one row per row of `industry`

# ======================================
# 3) Row-normalise and compute cosine similarity
# ======================================
U_norm = U / np.linalg.norm(U, axis=1, keepdims=True)
C_norm = C / np.linalg.norm(C, axis=1, keepdims=True)

sim_matrix = U_norm @ C_norm.T   # rows: users, columns: rows of `industry`

# ======================================
# 4) Select the Top-3 (P) / Bottom-3 (N) indices
# ======================================
top_k = 3
n_users, n_companies = sim_matrix.shape
# The similarity columns follow the row order of `industry`, so the names must
# be read from `industry` as well. Taking them from df_company_clean only gives
# the right names while both frames happen to share the same rows in the same
# order, which a left merge does not guarantee (duplicate keys on the right
# side add rows).
company_names = industry.index.to_numpy()

# Highest similarity first (ascending sort of the negated matrix)
pos_idx = np.argsort(-sim_matrix, axis=1)[:, :top_k]
# Lowest similarity first
neg_idx = np.argsort(sim_matrix, axis=1)[:, :top_k]

# Map positions to company names
P_companies = company_names[pos_idx]   # one row per user, top_k columns
N_companies = company_names[neg_idx]   # one row per user, top_k columns

# ======================================
# 5) Append P1..P3 / N1..N3 (suffix "_adjusted") to df_users
# ======================================
for i in range(top_k):
    df_users[f"P{i+1}_adjusted"] = P_companies[:, i]
    df_users[f"N{i+1}_adjusted"] = N_companies[:, i]

df_users.head()


Unnamed: 0,sdg 1,sdg 2,sdg 3,sdg 4,sdg 5,sdg 6,sdg 7,sdg 8,sdg 9,sdg 10,...,P2,N2,P3,N3,P1_adjusted,N1_adjusted,P2_adjusted,N2_adjusted,P3_adjusted,N3_adjusted
user1,0.0,0.75,0.75,0.5,0.5,1.0,0.0,0.75,0.25,0.0,...,롯데웰푸드,한화손해보험,우리금융지주,현대엘리베이터,동원산업,애경산업,세아베스틸지주,현대엘리베이터,롯데이노베이트,세방전지
user2,0.0,1.0,0.5,0.5,0.25,0.0,1.0,0.75,0.75,0.5,...,삼성생명,현대엘리베이터,NH투자증권,HL홀딩스,두산에너빌리티,애경산업,한화,현대엘리베이터,미래에셋증권,하나투어
user3,1.0,0.0,1.0,1.0,0.25,0.75,0.0,0.75,0.75,0.25,...,현대로템,현대엘리베이터,KCC,SK디앤디,삼성중공업,애경산업,LG이노텍,현대엘리베이터,SKC,SK디앤디
user4,0.0,0.25,0.5,0.5,0.0,0.5,0.0,0.75,0.75,1.0,...,현대엘리베이터,두산퓨얼셀,우리금융지주,HD현대중공업,포스코홀딩스,OCI 홀딩스,JB금융지주,한화손해보험,삼성증권,두산퓨얼셀
user5,0.0,0.5,0.75,0.0,0.5,0.0,0.75,0.5,0.25,0.25,...,메리츠금융지주,애경산업,SK텔레콤,한화손해보험,코오롱인더,현대오토에버,한세실업,애경케미칼,LG디스플레이,OCI 홀딩스


In [14]:
# Number of distinct companies recommended at each rank without the sector adjustment
count1, count2, count3 = (
    len(np.unique(df_users[f"P{rank}"])) for rank in (1, 2, 3)
)
print(count1, count2, count3)

90 97 94


In [15]:
# Number of distinct companies recommended at each rank with the sector adjustment
count1, count2, count3 = (
    len(np.unique(df_users[f"P{rank}_adjusted"])) for rank in (1, 2, 3)
)
print(count1, count2, count3)

126 132 133


In [16]:
# How many users have each company in the P1_adjusted column (most frequent first).
df_users['P1_adjusted'].value_counts()

P1_adjusted
롯데이노베이트    222
삼성화재       111
KB금융        70
세아베스틸지주     54
한화생명        48
          ... 
넥센타이어        1
LG생활건강       1
삼양홀딩스        1
LG유플러스       1
한샘           1
Name: count, Length: 126, dtype: int64

In [17]:
# How many users have each company in the P1 column (most frequent first).
df_users['P1'].value_counts()

P1
삼성증권          396
NH투자증권        192
우리금융지주        153
현대오토에버        143
SK텔레콤         108
             ... 
SK네트웍스          1
LS ELCTRIC      1
한세실업            1
한일단조            1
SKC             1
Name: count, Length: 90, dtype: int64

In [18]:
# Write df_users (including the P*/N* recommendation columns) to CSV.
# NOTE(review): to_csv returns None when a path is given, so `a` is always None.
a=df_users.to_csv("C:/Users/tkdgj/user_vector_with_PN_adjusted.csv")

In [19]:
# Write the company frame `industry` to CSV.
# NOTE(review): to_csv returns None when a path is given, so `b` is always None.
b=industry.to_csv("C:/Users/tkdgj/corp_vector_industry_adjusted.csv")