In [3]:
import numpy as np
import pandas as pd

# ---------- 설정 ----------
n_users = 2000
sigma = 0.25           
rng = np.random.default_rng(42)

# SDG를 E/S/G 그룹으로 나눈 인덱스 (0-based)
# E: 6,7,11,12,13,14,15
E = [5, 6, 10, 11, 12, 13, 14]
# S: 1,2,3,4,5,8,10,17
S = [0, 1, 2, 3, 4, 7, 9, 16]
# G: 9,16
G = [8, 15]

# ---------- 1. 사용자 유형 분포 (E-heavy, S-heavy, Balanced, G-heavy) ----------
user_types = rng.choice(
    ["E", "S", "B", "G"],
    size=n_users,
    p=[0.5, 0.3, 0.15, 0.05]
)

# ---------- 2. full SDG 선호도(0~1) 생성 ----------
prefs = np.zeros((n_users, 17))

for i, t in enumerate(user_types):
    # 그룹별 평균 설정
    if t == "E":
        mu = {"E": 0.8,  "S": 0.6,  "G": 0.4}
    elif t == "S":
        mu = {"E": 0.6,  "S": 0.8,  "G": 0.4}
    elif t == "B":
        mu = {"E": 0.65, "S": 0.65, "G": 0.65}
    else:  # G-heavy
        mu = {"E": 0.5,  "S": 0.5, "G": 0.8}

    row = np.zeros(17)

    # 각 그룹별 정규분포에서 샘플링
    row[E] = rng.normal(mu["E"], sigma, len(E))
    row[S] = rng.normal(mu["S"], sigma, len(S))
    row[G] = rng.normal(mu["G"], sigma, len(G))

    # [0,1]로 클리핑
    row = np.clip(row, 0, 1)

    # 1~5 Likert로 매핑 후 다시 0~1로 변환
    #   raw ∈ [0,0.125] → Likert 1 → 최종 0
    row = np.round(row * 4 + 1)   # 1~5 점수
    row = (row - 1) / 4           # 0,0.25,0.5,0.75,1.0

    prefs[i] = row

# ---------- 3. 위험 성향 태그 생성 (0/1/2 = 위험/중립/안정 예시) ----------
risk = rng.choice(
    [0, 1, 2],
    size=n_users,
    p=[0.3, 0.4, 0.3]  # 위험형 30%, 중립 40%, 안정형 30%
)

# ---------- 4. DataFrame 생성 ----------
columns = [f"sdg {i}" for i in range(1, 18)] + ["risk tag"]
df_users = pd.DataFrame(
    np.column_stack([prefs, risk]),
    columns=columns
)

# raw명 = user1 ~ user2000
df_users.index = [f"user{i}" for i in range(1, n_users + 1)]

print(df_users.head())


       sdg 1  sdg 2  sdg 3  sdg 4  sdg 5  sdg 6  sdg 7  sdg 8  sdg 9  sdg 10  \
user1   0.75   0.75   0.50   1.00   0.75   0.25   0.75   0.75   0.25    0.75   
user2   0.75   0.75   0.50   0.50   0.50   0.75   1.00   0.50   0.25    0.50   
user3   0.75   0.75   0.50   1.00   0.50   0.25   0.75   0.50   0.25    0.75   
user4   0.75   1.00   0.75   0.75   0.75   0.50   0.25   1.00   0.25    1.00   
user5   1.00   0.50   0.75   0.50   0.75   0.75   0.50   0.75   0.25    0.50   

       sdg 11  sdg 12  sdg 13  sdg 14  sdg 15  sdg 16  sdg 17  risk tag  
user1    0.25    0.50    0.50    0.75    0.50    0.25    1.00       0.0  
user2    0.50    1.00    0.25    0.75    1.00    0.25    0.75       1.0  
user3    0.75    0.75    1.00    0.75    0.75    1.00    0.75       1.0  
user4    1.00    0.75    0.25    1.00    0.75    0.50    0.75       1.0  
user5    0.50    1.00    0.25    0.75    0.75    0.50    0.75       1.0  


In [6]:
a=df_users.to_csv("C:/Users/tkdgj/user_vectors2.csv")

PermissionError: [Errno 13] Permission denied: 'C:/Users/tkdgj/user_vectors2.csv'

In [7]:
cor = pd.read_csv("C:/Users/tkdgj/251113_dataset_with_tag.csv")

In [8]:
cor

Unnamed: 0,company_name,corp_code,G01_mentions_per_1k_tokens,G02_mentions_per_1k_tokens,G03_mentions_per_1k_tokens,G04_mentions_per_1k_tokens,G05_mentions_per_1k_tokens,G06_mentions_per_1k_tokens,G07_mentions_per_1k_tokens,G08_mentions_per_1k_tokens,...,G09_reference_sentence,G10_reference_sentence,G11_reference_sentence,G12_reference_sentence,G13_reference_sentence,G14_reference_sentence,G15_reference_sentence,G16_reference_sentence,G17_reference_sentence,Risk_Tag
0,한국가스공사,36460,1.016065,0.054922,1.894824,0.192228,1.867362,1.290677,3.460113,0.823836,...,[TABLE p17 #1] / 대응목표 및 성과관리 온실가스 배출 저감 활동 목표 ...,[TABLE p45 #4] / 분야: 1. 인권경영체제의 구축 2. 고용상의 비차별...,[TABLE p58 #2] 가스 공급인프라의 무결성 / : 가스 공급인프라의 무결성,[TABLE p25 #3] • 건설폐기물 재활용률 및 건설자재 재활용 제품 사용률 ...,[TABLE p17 #1] / 대응목표 및 성과관리 온실가스 배출 저감 활동 목표 ...,"[TABLE p25 #2] • 환경경영시스템 개선 • 생 물다양성, 산림 보전 체계...",[TABLE p17 #1] / 대응목표 및 성과관리 온실가스 배출 저감 활동 목표 ...,[TABLE p45 #4] / 분야: 1. 인권경영체제의 구축 2. 고용상의 비차별...,[TABLE p35 #1] 〮 직무중심 교육체계 내실화 〮 디지털 역량 강화 / 직...,위험
1,한국금융지주,71050,0.408463,0.054462,1.252621,0.163385,2.314626,0.190616,1.606623,0.272309,...,[TABLE p20 #1] 핵심 중대 이슈 금융 사업 포트폴리오 다각화 / : 핵심...,"[TABLE p42 #3] · 채 용 시 지원자의 성별, · 성 과평가, 급여 및 ...","[TABLE p111 #3] 및 현금흐름에 미치는 영향과 단기, 중기, 장기에 걸친...","[TABLE p56 #3] / : 유형 신재생 에너지, 탄소 절감, 배터리 제조 및...","[TABLE p111 #3] 및 현금흐름에 미치는 영향과 단기, 중기, 장기에 걸친...","[TABLE p31 #2] 생물 다양성 / 배제영역: · 세 계문화유산, 국가 보호...","[TABLE p31 #2] 생물 다양성 / 배제영역: · 세 계문화유산, 국가 보호...",[TABLE p47 #1] 인권 존중 인적자본 관리 정보보안 및 개인정보 보호 책임...,"[TABLE p48 #2] / 일ᆞ과정 양립 ᆞ자녀학자금 지원(유치원, 고교, 대학...",중립
2,한국콜마,161890,0.334763,0.000000,1.820274,0.334763,2.427032,1.820274,2.782718,0.062768,...,[TABLE p38 #1] 인터넷 및 Mail 보안 / 출력보안: 인터넷 및 Mai...,[TABLE p104 #1] 임직원 신뢰 / ・신뢰받는 제품 제공 ・정직한 영업마케...,[TABLE p82 #1] 주거 / 항목: 주거,[TABLE p73 #1] 환경경영 3R 전략 RECYCLABLE RECYCLED ...,[TABLE p36 #1] 기후변화 대응체계 ESG경영 의사결정 체계 구축 탄소중립...,[TABLE p62 #1] 환경영향평가 에이치케이이노엔은 각 사업장이 위치한 지역사...,[TABLE p73 #1] 환경경영 3R 전략 RECYCLABLE RECYCLED ...,[TABLE p167 #1] 반부패 / : 한국콜마는 준법·윤리 경영을 강화하기 위...,[TABLE p95 #1] 드림커넥트 / 2024년 목표: 자립준비청년 역량 강화 ...,중립
3,한국타이어앤테크놀로지,161390,0.314103,0.078526,1.596691,0.863784,1.858444,0.968485,2.329599,0.078526,...,"[TABLE p85 #2] 지속가능한 산업, 혁신, 인프라 / : 지속가능한 산업,...",[TABLE p85 #1] 성평등 / : 성평등,"[TABLE p85 #2] 지속가능한 산업, 혁신, 인프라 / : 지속가능한 산업,...",[TABLE p82 #1] 자원 절약 및 효율성 향상 제품 제조 단계 전반에서의 에...,[TABLE p86 #1] / 주요 위험 요인: 화석연료 가격 변동폭 증가,"[TABLE p82 #3] 인권 존중 및 다양성 확보 임직원, 공급망, 지역사회 등...","[TABLE p82 #3] 인권 존중 및 다양성 확보 임직원, 공급망, 지역사회 등...","[TABLE p82 #3] 인권 존중 및 다양성 확보 임직원, 공급망, 지역사회 등...",[TABLE p82 #3] 인적자원 개발 및 관리 임직원 역량 강화를 위한 각종 교...,안전
4,한독,2390,0.448860,0.059848,3.710575,0.598480,0.688252,1.436352,2.423843,0.059848,...,"[TABLE p102 #1] / 지표: 2-6 활동, 조직의 가치사슬 및 기타 사업관계",[TABLE p105 #2] 207-1 조세(세금)에 대한 접근법 / 201-1 직...,"[TABLE p22 #1] 7 / 경제, 환경, 사회 영향: ・ (IN → OUT)...",[TABLE p21 #1] 2 / 재무적 영향: ・ (위기)폐기물 감축 및 분리배출...,"[TABLE p21 #1] 1 / 경제, 환경, 사회 영향: ・ (IN → OUT)...",[TABLE p90 #1] / : 수질 오염물질 배출량5),[TABLE p105 #2] 301-2 재생재료 사용량 / 201-1 직접적인 경제...,[TABLE p49 #1] ・ 제약산업에 특화된 범위를 확대하여 전방위적 윤리경영 ...,[TABLE p22 #1] 5 / 재무적 영향: ・ (위기)임직원 역량 강화와 공정...,안전
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,삼성화재,810,1.001240,0.238390,1.978640,0.691332,1.001240,0.286068,2.073996,0.023839,...,[TABLE p28 #5] CISO(위원장) 시스템운영파트장 IT보안파트장 인프라전...,"[TABLE p126 #3] 5.5 정치, 경제, 공공부문에서 모든 단계의 의사결정...",[TABLE p28 #5] CISO(위원장) 시스템운영파트장 IT보안파트장 인프라전...,[TABLE p124 #9] 306-4 / 지표의 정의: 재활용 또는 재사용된 폐기물,[TABLE p123 #8] 302-4 / 지표의 정의: 에너지 소비 감축,[TABLE p32 #1] 2024 Samsung Fire & Marine Insu...,[TABLE p127 #3] 15.9 2020년까지 생태계 및 생물다양성 가치를 국...,[TABLE p126 #3] 5.1 모든 곳에서 모든 여성과 여아에 대한 모든 형태...,"[TABLE p126 #6] 8.10 보험업, 금융업 및 금융지원서비스의 접근가능범...",중립
137,삼양사,145990,0.431942,0.809891,1.241834,0.728902,0.836888,1.241834,1.862750,0.161978,...,"[TABLE p106 #5] 조직 활동, 가치사슬 및 기타 사업관계 / 임직원: 조...",[TABLE p86 #1] / : 2023(실적) 93명 교육 진행,[TABLE p86 #1] / : 2023(실적) 93명 교육 진행,[TABLE p107 #14] 배출된 폐기물 / 폐기물: 배출된 폐기물,[TABLE p19 #3] / 재무적/비재무적 영향: 화석연료에 대한 규제 확대...,[TABLE p67 #1] 1. 보고사항  1.1ESG 비전 체계  1.22...,[TABLE p19 #3] / 재무적/비재무적 영향: 화석연료에 대한 규제 확대...,[TABLE p12 #4] 불공정행위 및 부패 방지 기업지배구조 보고서 공시 지속가...,[TABLE p47 #3] 안전보건목표 KPI 반영 확대 안전 담당자 전문역량 강화...,안전
138,삼양패키징,272550,0.466334,0.071744,1.757721,0.573950,0.860925,1.219643,2.259928,0.071744,...,"[TABLE p86 #5] 임직원 / 조직 활동, 가치사슬 및 기타 사업관계: 임직원",[TABLE p56 #1] / 2023(실적): 93명  교육 진행,[TABLE p56 #1] / 2023(실적): 93명  교육 진행,[TABLE p18 #5] 폐기물 재활용률 증가 유해화학물질 저감 노력 수자원...,[TABLE p18 #5] 폐기물 재활용률 증가 유해화학물질 저감 노력 수자원...,[TABLE p18 #2] 환경오염 통합 관리 자발적 폐기물 감소 유도 오염 저감을...,[TABLE p18 #2] 환경오염 통합 관리 자발적 폐기물 감소 유도 오염 저감을...,[TABLE p12 #4] 불 공정행위 및 부패 방지 기 업지배구조 보고서 ...,[TABLE p66 #1] Right People 선발·육성 및 코칭 역량 강화 /...,안전
139,삼양홀딩스,70,0.513514,0.108108,2.243243,0.810811,0.702703,1.027027,1.459459,0.054054,...,"[TABLE p112 #5] 조직 활동, 가치사슬 및 기타 사업관계 / 직원: 조직...",[TABLE p84 #3] / 2023(실적): 93명  교육 진행,[TABLE p84 #3] / 2023(실적): 93명  교육 진행,[TABLE p113 #11] 폐기물 발생 / : 폐기물: 폐기물 발생,[TABLE p84 #3] / 2023(실적): 93명  교육 진행,[TABLE p64 #1] 1. 보고사항  1.1ESG 비전 체계  1.22...,[TABLE p64 #1] 1. 보고사항  1.1ESG 비전 체계  1.22...,[TABLE p14 #4] 불 공정행위 및 부패 방지 기 업지배구조 보고서 ...,"[TABLE p65 #5] 기업지배구조 주주권리 확대,주주관여 관리 체...",안전


In [10]:
cor.columns

Index(['company_name', 'corp_code', 'G01_mentions_per_1k_tokens',
       'G02_mentions_per_1k_tokens', 'G03_mentions_per_1k_tokens',
       'G04_mentions_per_1k_tokens', 'G05_mentions_per_1k_tokens',
       'G06_mentions_per_1k_tokens', 'G07_mentions_per_1k_tokens',
       'G08_mentions_per_1k_tokens', 'G09_mentions_per_1k_tokens',
       'G10_mentions_per_1k_tokens', 'G11_mentions_per_1k_tokens',
       'G12_mentions_per_1k_tokens', 'G13_mentions_per_1k_tokens',
       'G14_mentions_per_1k_tokens', 'G15_mentions_per_1k_tokens',
       'G16_mentions_per_1k_tokens', 'G17_mentions_per_1k_tokens',
       'G01_sent_mean', 'G02_sent_mean', 'G03_sent_mean', 'G04_sent_mean',
       'G05_sent_mean', 'G06_sent_mean', 'G07_sent_mean', 'G08_sent_mean',
       'G09_sent_mean', 'G10_sent_mean', 'G11_sent_mean', 'G12_sent_mean',
       'G13_sent_mean', 'G14_sent_mean', 'G15_sent_mean', 'G16_sent_mean',
       'G17_sent_mean', 'G01_reference_sentence', 'G02_reference_sentence',
       'G03_reference_

In [15]:

df_company = cor.copy()

# 1) SDG 17개 score 생성
for i in range(1, 18):
    mention_col = f"G{i:02d}_mentions_per_1k_tokens"
    sent_col = f"G{i:02d}_sent_mean"
    new_col = f"SDG{i}_score"
    
    df_company[new_col] = df_company[mention_col] * df_company[sent_col]

# 2) 위험 태그 숫자형 변환
risk_map = {"안전": 2, "중립": 1, "위험": 0}

df_company["Risk_Tag"] = (
    df_company["Risk_Tag"].astype(str).str.strip().map(risk_map)
)


In [16]:
df_company

Unnamed: 0,company_name,corp_code,G01_mentions_per_1k_tokens,G02_mentions_per_1k_tokens,G03_mentions_per_1k_tokens,G04_mentions_per_1k_tokens,G05_mentions_per_1k_tokens,G06_mentions_per_1k_tokens,G07_mentions_per_1k_tokens,G08_mentions_per_1k_tokens,...,SDG8_score,SDG9_score,SDG10_score,SDG11_score,SDG12_score,SDG13_score,SDG14_score,SDG15_score,SDG16_score,SDG17_score
0,한국가스공사,36460,1.016065,0.054922,1.894824,0.192228,1.867362,1.290677,3.460113,0.823836,...,0.766744,0.840807,0.877712,0.840780,1.240961,0.508511,1.911231,2.620563,5.282609,0.858825
1,한국금융지주,71050,0.408463,0.054462,1.252621,0.163385,2.314626,0.190616,1.606623,0.272309,...,0.258694,0.620390,0.228968,0.716199,0.307028,0.137265,0.345481,0.904003,1.640982,0.556785
2,한국콜마,161890,0.334763,0.000000,1.820274,0.334763,2.427032,1.820274,2.782718,0.062768,...,0.058638,0.244534,0.181525,0.067580,3.892290,0.020705,1.693374,2.497123,2.533604,0.851512
3,한국타이어앤테크놀로지,161390,0.314103,0.078526,1.596691,0.863784,1.858444,0.968485,2.329599,0.078526,...,0.073288,0.663464,0.293184,0.496314,2.972307,0.068558,1.247945,2.355460,1.687849,0.511464
4,한독,2390,0.448860,0.059848,3.710575,0.598480,0.688252,1.436352,2.423843,0.059848,...,0.058489,0.224071,0.138593,0.291639,2.533078,0.102364,1.157161,1.808905,1.507978,1.098929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,삼성화재,810,1.001240,0.238390,1.978640,0.691332,1.001240,0.286068,2.073996,0.023839,...,0.000906,0.613331,0.780681,0.601411,0.409946,0.143206,3.669336,1.253199,1.574602,0.819595
137,삼양사,145990,0.431942,0.809891,1.241834,0.728902,0.836888,1.241834,1.862750,0.161978,...,0.158236,0.282307,0.245981,0.106312,3.116314,0.001582,1.614454,2.355448,1.093937,0.956914
138,삼양패키징,272550,0.466334,0.071744,1.757721,0.573950,0.860925,1.219643,2.259928,0.071744,...,0.070682,0.309044,0.217932,0.191792,4.919797,0.035452,1.251770,2.083621,1.591667,1.270585
139,삼양홀딩스,70,0.513514,0.108108,2.243243,0.810811,0.702703,1.027027,1.459459,0.054054,...,0.049989,0.252138,0.454037,0.132794,3.073727,0.071465,1.544803,1.918357,1.058138,0.824043


In [18]:
df_company['Risk_Tag']

0      0
1      1
2      1
3      2
4      2
      ..
136    1
137    2
138    2
139    2
140    2
Name: Risk_Tag, Length: 141, dtype: int64

In [20]:
df_users

Unnamed: 0,sdg 1,sdg 2,sdg 3,sdg 4,sdg 5,sdg 6,sdg 7,sdg 8,sdg 9,sdg 10,sdg 11,sdg 12,sdg 13,sdg 14,sdg 15,sdg 16,sdg 17,risk tag
user1,0.75,0.75,0.50,1.00,0.75,0.25,0.75,0.75,0.25,0.75,0.25,0.50,0.50,0.75,0.50,0.25,1.00,0.0
user2,0.75,0.75,0.50,0.50,0.50,0.75,1.00,0.50,0.25,0.50,0.50,1.00,0.25,0.75,1.00,0.25,0.75,1.0
user3,0.75,0.75,0.50,1.00,0.50,0.25,0.75,0.50,0.25,0.75,0.75,0.75,1.00,0.75,0.75,1.00,0.75,1.0
user4,0.75,1.00,0.75,0.75,0.75,0.50,0.25,1.00,0.25,1.00,1.00,0.75,0.25,1.00,0.75,0.50,0.75,1.0
user5,1.00,0.50,0.75,0.50,0.75,0.75,0.50,0.75,0.25,0.50,0.50,1.00,0.25,0.75,0.75,0.50,0.75,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
user1996,0.75,0.50,1.00,1.00,1.00,1.00,0.75,1.00,0.25,0.50,0.00,0.25,0.75,0.50,0.50,0.50,0.75,2.0
user1997,0.75,1.00,0.25,0.75,0.50,0.75,0.75,0.50,0.25,0.75,1.00,0.75,0.75,1.00,1.00,0.75,1.00,2.0
user1998,0.75,0.75,0.50,0.50,0.50,0.50,1.00,0.25,0.75,0.25,0.50,0.75,0.75,0.75,0.00,0.50,0.75,1.0
user1999,1.00,1.00,0.00,0.50,0.50,0.75,0.75,1.00,0.75,0.25,0.75,0.50,0.75,0.50,0.50,1.00,0.00,2.0


In [None]:
df_company = df_company.set_index("company_name")

# ----------------------------
# 4) 필요한 컬럼(SDG1~17 + Risk_Tag)만 남기기
# ----------------------------
sdg_cols = [f"SDG{i}_score" for i in range(1, 18)]
keep_cols = sdg_cols + ["Risk_Tag"]

df_company_clean = df_company[keep_cols].copy()

In [22]:
df_company_clean

Unnamed: 0_level_0,SDG1_score,SDG2_score,SDG3_score,SDG4_score,SDG5_score,SDG6_score,SDG7_score,SDG8_score,SDG9_score,SDG10_score,SDG11_score,SDG12_score,SDG13_score,SDG14_score,SDG15_score,SDG16_score,SDG17_score,Risk_Tag
company_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
한국가스공사,0.797814,0.010089,1.399328,0.128466,1.507148,0.857655,3.068082,0.766744,0.840807,0.877712,0.840780,1.240961,0.508511,1.911231,2.620563,5.282609,0.858825,0
한국금융지주,0.311616,0.032116,0.886480,0.135005,1.608665,0.136996,0.971525,0.258694,0.620390,0.228968,0.716199,0.307028,0.137265,0.345481,0.904003,1.640982,0.556785,1
한국콜마,0.320636,0.000000,1.295125,0.310995,1.926821,1.219584,2.059211,0.058638,0.244534,0.181525,0.067580,3.892290,0.020705,1.693374,2.497123,2.533604,0.851512,1
한국타이어앤테크놀로지,0.236174,0.076586,1.241747,0.806515,1.529499,0.607143,1.627691,0.073288,0.663464,0.293184,0.496314,2.972307,0.068558,1.247945,2.355460,1.687849,0.511464,2
한독,0.430277,0.058681,2.956586,0.542881,0.494303,1.104698,1.961131,0.058489,0.224071,0.138593,0.291639,2.533078,0.102364,1.157161,1.808905,1.507978,1.098929,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
삼성화재,0.870678,0.169495,1.518013,0.632016,0.822519,0.159941,1.572296,0.000906,0.613331,0.780681,0.601411,0.409946,0.143206,3.669336,1.253199,1.574602,0.819595,1
삼양사,0.343394,0.703633,0.819859,0.699965,0.591429,0.841591,1.567877,0.158236,0.282307,0.245981,0.106312,3.116314,0.001582,1.614454,2.355448,1.093937,0.956914,2
삼양패키징,0.397969,0.057223,1.192438,0.516899,0.577164,0.818137,2.025573,0.070682,0.309044,0.217932,0.191792,4.919797,0.035452,1.251770,2.083621,1.591667,1.270585,2
삼양홀딩스,0.383698,0.060432,1.817251,0.770838,0.493227,0.741822,1.315264,0.049989,0.252138,0.454037,0.132794,3.073727,0.071465,1.544803,1.918357,1.058138,0.824043,2


In [23]:
user_cols = [f"sdg {i}" for i in range(1, 18)] + ["risk tag"]
corp_cols = [f"SDG{i}_score" for i in range(1, 18)] + ["Risk_Tag"]

# -------------------------
# 2) numpy 배열로 변환
# -------------------------
U = df_users[user_cols].to_numpy(float)          # (n_users, 18)
C = df_company_clean[corp_cols].to_numpy(float)  # (n_companies, 18)

# -------------------------
# 3) 코사인 유사도 계산
# -------------------------
U_norm = U / np.linalg.norm(U, axis=1, keepdims=True)
C_norm = C / np.linalg.norm(C, axis=1, keepdims=True)

sim_matrix = U_norm @ C_norm.T  # (n_users, n_companies)

# -------------------------
# 4) 사용자별 P(Top-1) / N(Bottom-1)
# -------------------------
pos_idx = np.argmax(sim_matrix, axis=1)
neg_idx = np.argmin(sim_matrix, axis=1)

company_names = df_company_clean.index.to_numpy()

P_companies = company_names[pos_idx]
N_companies = company_names[neg_idx]

# -------------------------
# 5) df_users 끝에 P, N 컬럼 추가
# -------------------------
df_users["P"] = P_companies
df_users["N"] = N_companies

df_users.head()

Unnamed: 0,sdg 1,sdg 2,sdg 3,sdg 4,sdg 5,sdg 6,sdg 7,sdg 8,sdg 9,sdg 10,sdg 11,sdg 12,sdg 13,sdg 14,sdg 15,sdg 16,sdg 17,risk tag,P,N
user1,0.75,0.75,0.5,1.0,0.75,0.25,0.75,0.75,0.25,0.75,0.25,0.5,0.5,0.75,0.5,0.25,1.0,0.0,현대오토에버,애경산업
user2,0.75,0.75,0.5,0.5,0.5,0.75,1.0,0.5,0.25,0.5,0.5,1.0,0.25,0.75,1.0,0.25,0.75,1.0,롯데웰푸드,애경산업
user3,0.75,0.75,0.5,1.0,0.5,0.25,0.75,0.5,0.25,0.75,0.75,0.75,1.0,0.75,0.75,1.0,0.75,1.0,우리금융지주,애경산업
user4,0.75,1.0,0.75,0.75,0.75,0.5,0.25,1.0,0.25,1.0,1.0,0.75,0.25,1.0,0.75,0.5,0.75,1.0,우리금융지주,애경산업
user5,1.0,0.5,0.75,0.5,0.75,0.75,0.5,0.75,0.25,0.5,0.5,1.0,0.25,0.75,0.75,0.5,0.75,1.0,우리금융지주,애경산업


In [25]:
df_users

Unnamed: 0,sdg 1,sdg 2,sdg 3,sdg 4,sdg 5,sdg 6,sdg 7,sdg 8,sdg 9,sdg 10,sdg 11,sdg 12,sdg 13,sdg 14,sdg 15,sdg 16,sdg 17,risk tag,P,N
user1,0.75,0.75,0.50,1.00,0.75,0.25,0.75,0.75,0.25,0.75,0.25,0.50,0.50,0.75,0.50,0.25,1.00,0.0,현대오토에버,애경산업
user2,0.75,0.75,0.50,0.50,0.50,0.75,1.00,0.50,0.25,0.50,0.50,1.00,0.25,0.75,1.00,0.25,0.75,1.0,롯데웰푸드,애경산업
user3,0.75,0.75,0.50,1.00,0.50,0.25,0.75,0.50,0.25,0.75,0.75,0.75,1.00,0.75,0.75,1.00,0.75,1.0,우리금융지주,애경산업
user4,0.75,1.00,0.75,0.75,0.75,0.50,0.25,1.00,0.25,1.00,1.00,0.75,0.25,1.00,0.75,0.50,0.75,1.0,우리금융지주,애경산업
user5,1.00,0.50,0.75,0.50,0.75,0.75,0.50,0.75,0.25,0.50,0.50,1.00,0.25,0.75,0.75,0.50,0.75,1.0,우리금융지주,애경산업
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
user1996,0.75,0.50,1.00,1.00,1.00,1.00,0.75,1.00,0.25,0.50,0.00,0.25,0.75,0.50,0.50,0.50,0.75,2.0,아시아나항공,HD현대중공업
user1997,0.75,1.00,0.25,0.75,0.50,0.75,0.75,0.50,0.25,0.75,1.00,0.75,0.75,1.00,1.00,0.75,1.00,2.0,우리금융지주,한화손해보험
user1998,0.75,0.75,0.50,0.50,0.50,0.50,1.00,0.25,0.75,0.25,0.50,0.75,0.75,0.75,0.00,0.50,0.75,1.0,SK텔레콤,애경산업
user1999,1.00,1.00,0.00,0.50,0.50,0.75,0.75,1.00,0.75,0.25,0.75,0.50,0.75,0.50,0.50,1.00,0.00,2.0,우리금융지주,HD현대중공업


In [29]:
df_users['P'].value_counts()

P
우리금융지주     355
NH투자증권     288
삼성증권       265
현대오토에버     256
KCC        200
SK텔레콤      127
롯데웰푸드       84
삼성생명        56
아시아나항공      54
KB금융        53
세방전지        51
한화          41
한화투자증권      27
미래에셋증권      22
상상인증권       22
하이트진로       13
다올투자증권      10
현대로템        10
JB금융지주      10
대한항공         9
넷마블          8
하나금융지주       6
한화시스템        4
한미글로벌        3
코오롱ENP       3
한화생명         3
유한양행         3
신세계 I&C      2
GS리테일        2
롯데이노베이트      2
BNK금융        2
삼성화재         1
한국금융지주       1
한전KPS        1
KC그린홀딩스      1
메리츠금융지주      1
오뚜기          1
코오롱인더        1
삼양홀딩스        1
교보증권         1
Name: count, dtype: int64

In [31]:
count = len(np.unique(df_users['P']))
count

40

In [33]:
count2 = len(np.unique(df_users['N']))
count2

15

In [34]:
a=df_users.to_csv("C:/Users/tkdgj/user_vector_with_PN.csv")