In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler

import os
from dotenv import load_dotenv
import boto3
import io
import pymysql
import json

DB Connection

In [75]:
from sshtunnel import SSHTunnelForwarder

class SSHMySQLConnector:
    def __init__(self):
        self.ssh_host = None
        self.ssh_username = None
        self.ssh_password = None
        self.db_username = None
        self.db_password = None
        self.db_name = None
        self.tunnel = None
        self.connection = None

    def load_config_from_json(self, json_path):
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                config = json.load(f)
                self.ssh_host = config['ssh_host']
                self.ssh_username = config['ssh_username']
                self.ssh_password = config['ssh_password']
                self.db_username = config['db_username']
                self.db_password = config['db_password']
                self.db_name = config['db_name']
        except Exception as e:
            print("설정 JSON 로딩 실패:", e)

    def connect(self, insert=False):
        try:
            self.tunnel = SSHTunnelForwarder(
                (self.ssh_host, 22),
                ssh_username=self.ssh_username,
                ssh_password=self.ssh_password,
                remote_bind_address=('127.0.0.1', 3306),
            )
            self.tunnel.start()
            # insert 여부에 따라 cursorclass 설정
            connect_kwargs = {
                'host': '127.0.0.1',
                'port': self.tunnel.local_bind_port,
                'user': self.db_username,
                'password': self.db_password,
                'db': self.db_name,
            }
            if insert:
                connect_kwargs['cursorclass'] = pymysql.cursors.DictCursor
            self.connection = pymysql.connect(**connect_kwargs)
            print("DB 접속 성공")
        except Exception as e:
            print("SSH 또는 DB 연결 실패:", e)

    def execute_query(self, query):
        # 쿼리 실행 후 데이터를 DataFrame으로 반환
        return pd.read_sql_query(query, self.connection)

    def insert_query_with_lookup(self, table_name, data_list):
        try:
            with self.connection.cursor() as cursor:
                for data in data_list:
                    # 1. op_member에서 uid, user_id 조회
                    cursor.execute("""
                        SELECT uid, user_id, add1_connected FROM op_member
                        WHERE add1 = %s
                        LIMIT 1
                    """, (data['acnt_nm'],))
                    result = cursor.fetchone()
                    
                    if result:
                        data['member_uid'] = result['uid']
                        data['user_id'] = result['user_id']
                        data['is_connected'] = result['add1_connected']
                        # 향후에 ig_user_id가 추가가 된다면, 해당 부분도 확인해서 추가할 수 있게
                        # data['ig_user_id'] = result['ig_user_id']
                    else:
                        data['member_uid'] = 0
                        data['user_id'] = 'None'
                        data['is_connected'] = 'n'
                        # data['ig_user_id'] = 'None'
              

                    # 2. INSERT 쿼리 구성 및 실행
                    columns = ', '.join(data.keys())
                    placeholders = ', '.join([f"%({k})s" for k in data.keys()])
                    insert_sql = f"INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"
                    print(insert_sql)
                    cursor.execute(insert_sql, data)

                    print(f"inserted acnt_id: {data.get('acnt_id', 'N/A')}")

            self.connection.commit()
        except Exception as e:
            self.connection.rollback()
            print("INSERT 실패:", e)
    
    def close(self):
        if self.connection:
            self.connection.close()
        if self.tunnel:
            self.tunnel.stop()

def sendQuery(query):
        ssh = SSHMySQLConnector()
        ssh.load_config_from_json('C:/Users/ehddl/Desktop/업무/code/config/ssh_db_config.json')
        ssh.connect()
        results = ssh.execute_query(query)
        # print(results)
        # print(results.head())
        ssh.close()

        return results
    
def get_all_infos(): 

    query_sales_info = """
        SELECT o.uid, o.add1, s.*
        FROM op_mem_seller_statistics s
        JOIN (
            SELECT member_uid, MAX(regdate) AS max_regdate
            FROM op_mem_seller_statistics
            GROUP BY member_uid
        ) latest ON s.member_uid = latest.member_uid AND s.regdate = latest.max_regdate
        JOIN op_member o ON o.uid = s.member_uid
        JOIN S3_RECENT_USER_INFO_MTR u ON o.add1 = u.acnt_nm
        ORDER BY s.uid DESC
    """
    sales_info = sendQuery(query_sales_info)

    query_seller_interest_info = """
        SELECT
        o.user_id, o.ig_user_id, o.add1, s.interestcategory
        FROM op_member o
        left join op_mem_seller s on o.user_id=s.user_id
        where (o.ig_user_id != '' and o.ig_user_id is not null) or (o.add1 != '' and o.add1 is not null)
    """
    seller_interest_info = sendQuery(query_seller_interest_info)

    query_not_conn_user_main_category_info = """
        SELECT acnt_id, acnt_nm, main_category, top_3_category, is_connected
        FROM INSTAGRAM_USER_CATEGORY_LABELING
    """

    not_conn_user_main_category_info = sendQuery(query_not_conn_user_main_category_info)

    return sales_info, seller_interest_info, not_conn_user_main_category_info


In [76]:
sales_info, seller_interest_info, not_conn_user_main_category_info = get_all_infos()

DB 접속 성공
DB 접속 성공
DB 접속 성공


In [4]:
ssh = SSHMySQLConnector()
ssh.load_config_from_json('C:/Users/ehddl/Desktop/업무/code/config/ssh_db_config.json')
ssh.connect(True)

DB 접속 성공


Data Loading

In [17]:
def load_s3_instagram_data():

    load_dotenv()
    aws_access_key = os.getenv("aws_accessKey")
    aws_secret_key = os.getenv("aws_secretKey")
    region_name='ap-northeast-2'

    # S3 클라이언트 생성
    client = boto3.client('s3', aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key, region_name=region_name)

    today = datetime.now()
    year, week, _ = today.isocalendar()

    bucket_name = 'flexmatch-data'

    table_list = ['RECENT_USER_INFO_MTR', 'TIME_SERIES_PROFILE_INFO', 'BY_USER_ID_MEDIA_DTL_INFO', 'BY_DATE_MEDIA_AGG_INFO']
    external_table_list = ['EXTERNAL_RECENT_USER_INFO_MTR', 'EXTERNAL_TIME_SERIES_PROFILE_INFO', 'EXTERNAL_BY_USER_ID_MEDIA_DTL_INFO', 'EXTERNAL_BY_DATE_MEDIA_AGG_INFO']
    external_table_list_2 = ['EXTERNAL_2_RECENT_USER_INFO_MTR', 'EXTERNAL_2_TIME_SERIES_PROFILE_INFO', 'EXTERNAL_2_BY_USER_ID_MEDIA_DTL_INFO', 'EXTERNAL_2_BY_DATE_MEDIA_AGG_INFO']

    df_by_table_list = {}

    for table_name in external_table_list:
        prefix = f'instagram-data/tables/{table_name}/year={year}/week=30/' # 이번주가 없어서 week를 따로 x
        response = client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

        if 'Contents' in response:
            files = [content['Key'] for content in response['Contents'] if content['Key'].endswith('merged_data.parquet')]

            if not files:
                print(f"[Info] No merged_data.parquet found for table: {table_name}")
                continue

            # 각 파일 순회
            for file_key in files:
                try:
                    obj = client.get_object(Bucket=bucket_name, Key=file_key)
                    df = pd.read_parquet(io.BytesIO(obj['Body'].read()))
                    df_by_table_list[table_name] = df
                    print(f"[Success] Loaded {file_key} for table {table_name}")
                except Exception as e:
                    print(f"[Error] Failed to read {file_key} for table {table_name}: {e}")
        else:
            print(f"[Info] No files found under prefix: {prefix}")
        
    return df_by_table_list

df_by_table_list = load_s3_instagram_data()

external_recent_user_info_mtr = df_by_table_list['EXTERNAL_RECENT_USER_INFO_MTR']
external_time_series_profile_info = df_by_table_list['EXTERNAL_TIME_SERIES_PROFILE_INFO']
external_by_user_id_media_dtl_info = df_by_table_list['EXTERNAL_BY_USER_ID_MEDIA_DTL_INFO']
external_by_date_media_agg_info = df_by_table_list['EXTERNAL_BY_DATE_MEDIA_AGG_INFO']

# external_recent_user_info_mtr_2 = df_by_table_list['EXTERNAL_RECENT_USER_INFO_MTR']
# external_time_series_profile_info_2 = df_by_table_list['EXTERNAL_TIME_SERIES_PROFILE_INFO']
# external_by_user_id_media_dtl_info_2 = df_by_table_list['EXTERNAL_BY_USER_ID_MEDIA_DTL_INFO']
# external_by_date_media_agg_info_2 = df_by_table_list['EXTERNAL_BY_DATE_MEDIA_AGG_INFO']




[Success] Loaded instagram-data/tables/EXTERNAL_RECENT_USER_INFO_MTR/year=2025/week=30/merged_data.parquet for table EXTERNAL_RECENT_USER_INFO_MTR
[Success] Loaded instagram-data/tables/EXTERNAL_TIME_SERIES_PROFILE_INFO/year=2025/week=30/merged_data.parquet for table EXTERNAL_TIME_SERIES_PROFILE_INFO
[Success] Loaded instagram-data/tables/EXTERNAL_BY_USER_ID_MEDIA_DTL_INFO/year=2025/week=30/merged_data.parquet for table EXTERNAL_BY_USER_ID_MEDIA_DTL_INFO
[Success] Loaded instagram-data/tables/EXTERNAL_BY_DATE_MEDIA_AGG_INFO/year=2025/week=30/merged_data.parquet for table EXTERNAL_BY_DATE_MEDIA_AGG_INFO


In [19]:
recent_user_info_mtr = pd.concat([recent_user_info_mtr, external_recent_user_info_mtr], axis=0)
time_series_profile_info = pd.concat([time_series_profile_info, external_time_series_profile_info], axis=0)
by_user_id_media_dtl_info = pd.concat([by_user_id_media_dtl_info, external_by_user_id_media_dtl_info], axis=0)
by_date_media_agg_info = pd.concat([by_date_media_agg_info, external_by_date_media_agg_info], axis=0)

recent_user_info_mtr_2 = pd.concat([recent_user_info_mtr_2, external_recent_user_info_mtr_2], axis=0)
time_series_profile_info_2 = pd.concat([time_series_profile_info_2, external_time_series_profile_info_2], axis=0)
by_user_id_media_dtl_info_2 = pd.concat([by_user_id_media_dtl_info_2, external_by_user_id_media_dtl_info_2], axis=0)
by_date_media_agg_info_2 = pd.concat([by_date_media_agg_info_2, external_by_date_media_agg_info_2], axis=0)

In [26]:
def load_weekly_instagram_data(bucket_name, table_list, weeks_back=2, target_filename='merged_data.parquet'):
    # 환경 변수 로딩
    load_dotenv()
    aws_access_key = os.getenv("aws_accessKey")
    aws_secret_key = os.getenv("aws_secretKey")

    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
        region_name='ap-northeast-2'
    )

    # 주차 리스트 생성 (현재 주 포함하여 `weeks_back`만큼)
    today = datetime.now()
    week_year_pairs = [
        (today - timedelta(weeks=w)).isocalendar()[:2]
        for w in range(weeks_back)
    ]

    # 결과 저장용 딕셔너리 초기화
    merged_data_by_table = {table_name: {} for table_name in table_list}

    # 주차별로 데이터 로딩
    for year_val, week_val in week_year_pairs:
        for table_name in table_list:
            prefix = f'instagram-data/tables/{table_name}/year={year_val}/week={week_val}/'
            response = client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

            if 'Contents' not in response:
                print(f"[Info] No files found under prefix: {prefix}")
                continue

            target_files = [
                content['Key']
                for content in response['Contents']
                if content['Key'].endswith(target_filename)
            ]

            if not target_files:
                print(f"[Info] No {target_filename} found for {table_name} week={week_val}")
                continue

            for file_key in target_files:
                try:
                    obj = client.get_object(Bucket=bucket_name, Key=file_key)
                    df = pd.read_parquet(io.BytesIO(obj['Body'].read()))
                    merged_data_by_table[table_name][week_val] = df
                    print(f"[Success] Loaded {file_key} for table {table_name}, week {week_val}")
                except Exception as e:
                    print(f"[Error] Failed to read {file_key} for {table_name}, week {week_val}: {e}")

    recent_weeks_data = {}
    for table_name, week_data in merged_data_by_table.items():
        sorted_weeks = sorted(week_data.keys())
        if len(sorted_weeks) >= 2:
            prev_week, current_week = sorted_weeks[-2], sorted_weeks[-1]
            recent_weeks_data[table_name] = {
                'prev_week': week_data[prev_week],
                'current_week': week_data[current_week]
            }
        else:
            print(f"[Warning] Not enough data for table {table_name} to determine prev/current weeks.")

    
    return recent_weeks_data


In [None]:
bucket_name = 'flexmatch-data'
table_list = [
    'RECENT_USER_INFO_MTR',
    'TIME_SERIES_PROFILE_INFO',
    'BY_USER_ID_MEDIA_DTL_INFO',
    'BY_DATE_MEDIA_AGG_INFO'
]
# table_list = ['EXTERNAL_RECENT_USER_INFO_MTR', 'EXTERNAL_TIME_SERIES_PROFILE_INFO', 'EXTERNAL_BY_USER_ID_MEDIA_DTL_INFO', 'EXTERNAL_BY_DATE_MEDIA_AGG_INFO']

merged_data_by_table = load_weekly_instagram_data(bucket_name, table_list)

recent_user_info_mtr = merged_data_by_table['RECENT_USER_INFO_MTR']['prev_week']
time_series_profile_info = merged_data_by_table['TIME_SERIES_PROFILE_INFO']['prev_week']
by_user_id_media_dtl_info = merged_data_by_table['BY_USER_ID_MEDIA_DTL_INFO']['prev_week']
by_date_media_agg_info = merged_data_by_table['BY_DATE_MEDIA_AGG_INFO']['prev_week']

recent_user_info_mtr_2 = merged_data_by_table['RECENT_USER_INFO_MTR']['current_week']
time_series_profile_info_2 = merged_data_by_table['TIME_SERIES_PROFILE_INFO']['current_week']
by_user_id_media_dtl_info_2 = merged_data_by_table['BY_USER_ID_MEDIA_DTL_INFO']['current_week']
by_date_media_agg_info_2 = merged_data_by_table['BY_DATE_MEDIA_AGG_INFO']['current_week']

# recent_user_info_mtr = merged_data_by_table['EXTERNAL_RECENT_USER_INFO_MTR']['prev_week']
# time_series_profile_info = merged_data_by_table['EXTERNAL_TIME_SERIES_PROFILE_INFO']['prev_week']
# by_user_id_media_dtl_info = merged_data_by_table['EXTERNAL_BY_USER_ID_MEDIA_DTL_INFO']['prev_week']
# by_date_media_agg_info = merged_data_by_table['EXTERNAL_BY_DATE_MEDIA_AGG_INFO']['prev_week']

# recent_user_info_mtr_2 = merged_data_by_table['EXTERNAL_RECENT_USER_INFO_MTR']['current_week']
# time_series_profile_info_2 = merged_data_by_table['EXTERNAL_TIME_SERIES_PROFILE_INFO']['current_week']
# by_user_id_media_dtl_info_2 = merged_data_by_table['EXTERNAL_BY_USER_ID_MEDIA_DTL_INFO']['current_week']
# by_date_media_agg_info_2 = merged_data_by_table['EXTERNAL_BY_DATE_MEDIA_AGG_INFO']['current_week']


[Success] Loaded instagram-data/tables/RECENT_USER_INFO_MTR/year=2025/week=33/merged_data.parquet for table RECENT_USER_INFO_MTR, week 33
[Success] Loaded instagram-data/tables/TIME_SERIES_PROFILE_INFO/year=2025/week=33/merged_data.parquet for table TIME_SERIES_PROFILE_INFO, week 33
[Success] Loaded instagram-data/tables/BY_USER_ID_MEDIA_DTL_INFO/year=2025/week=33/merged_data.parquet for table BY_USER_ID_MEDIA_DTL_INFO, week 33
[Success] Loaded instagram-data/tables/BY_DATE_MEDIA_AGG_INFO/year=2025/week=33/merged_data.parquet for table BY_DATE_MEDIA_AGG_INFO, week 33
[Success] Loaded instagram-data/tables/RECENT_USER_INFO_MTR/year=2025/week=32/merged_data.parquet for table RECENT_USER_INFO_MTR, week 32
[Success] Loaded instagram-data/tables/TIME_SERIES_PROFILE_INFO/year=2025/week=32/merged_data.parquet for table TIME_SERIES_PROFILE_INFO, week 32
[Success] Loaded instagram-data/tables/BY_USER_ID_MEDIA_DTL_INFO/year=2025/week=32/merged_data.parquet for table BY_USER_ID_MEDIA_DTL_INFO, we

basic preprocessing

In [20]:
recent_user_info_mtr = recent_user_info_mtr.drop_duplicates(subset=['acnt_id', 'acnt_nm'])
time_series_profile_info = time_series_profile_info.drop_duplicates(subset=['base_ymd', 'acnt_id'])
by_user_id_media_dtl_info = by_user_id_media_dtl_info.drop_duplicates(subset=['acnt_id', 'media_id', 'media_cn'])
by_date_media_agg_info = by_date_media_agg_info.drop_duplicates(subset=['base_ymd', 'media_id'])

recent_user_info_mtr_2 = recent_user_info_mtr_2.drop_duplicates(subset=['acnt_id', 'acnt_nm'])
time_series_profile_info_2 = time_series_profile_info_2.drop_duplicates(subset=['base_ymd', 'acnt_id'])
by_user_id_media_dtl_info_2 = by_user_id_media_dtl_info_2.drop_duplicates(subset=['acnt_id', 'media_id', 'media_cn'])
by_date_media_agg_info_2 = by_date_media_agg_info_2.drop_duplicates(subset=['base_ymd', 'media_id'])


In [21]:
recent_user_info_mtr_2['acnt_id'] = recent_user_info_mtr_2['acnt_id'].astype(str)
time_series_profile_info_2['acnt_id'] = time_series_profile_info_2['acnt_id'].astype(str)

가장 최근 데이터로 isin 해줘야 할 것 같음. 그래야 1주일 전, 후를 비교할 수 있기 때문임

In [22]:
# 각 acnt_id가 있는 전부 있는 경우만 가능, isin으로 제거
# 근데 전 주차에는 없는 유저가 생길수도 있음
# media 같은 경우에도 최근 25개만 일단 가져오고 있기 때문에 일주일 단위여도 없을수도 있음

# 이렇게 해야하는 이유는 timeseries 변수를 계산하려면 무조건 두 주차에 같은 아이디가 있어야 함
# 또한, user info랑 media info 같이 써야하는 경우에는 어짜피 한번 거르고 있을 듯
# -> 따라서, 굳이 필수적으로 하지는 않아도 될듯함

unique_user = recent_user_info_mtr['acnt_id'].unique()

time_series_profile_info = time_series_profile_info[time_series_profile_info['acnt_id'].isin(unique_user)]
by_user_id_media_dtl_info = by_user_id_media_dtl_info[by_user_id_media_dtl_info['acnt_id'].isin(unique_user)]
time_series_profile_info_2 = time_series_profile_info_2[time_series_profile_info_2['acnt_id'].isin(unique_user)]
by_user_id_media_dtl_info_2 = by_user_id_media_dtl_info_2[by_user_id_media_dtl_info_2['acnt_id'].isin(unique_user)]

unique_media = by_user_id_media_dtl_info_2['media_id'].unique()
by_date_media_agg_info_2 = by_date_media_agg_info_2[by_date_media_agg_info_2['media_id'].isin(unique_media)]

In [23]:
len(unique_user)

7141

In [24]:
def influencer_scale_type(row):
    count = row['follower_cnt']
    if count < 1000:
        return 'nano'
    elif 1000 <= count <= 10000:
        return 'micro'
    elif 10000 < count <= 100000:
        return 'mid'
    elif 100000 < count <= 500000:
        return 'macro'
    else:
        return 'mega'

recent_user_info_mtr.loc[:, 'influencer_scale_type'] = recent_user_info_mtr.apply(influencer_scale_type, axis=1)
recent_user_info_mtr_2.loc[:,'influencer_scale_type'] = recent_user_info_mtr_2.apply(influencer_scale_type, axis=1)


create merged file

In [25]:
def create_merged_df(user_info_df, timeseries_df, timeseries_df_2, media_info_df, media_agg_df):
    media_engagement_merged_df = pd.merge(media_info_df, media_agg_df, on='media_id', how='outer')
    # print(len(media_engagement_merged_df['acnt_id'].unique()))

    ## 방법 1
    # 단 한개의 게시물이라도 like가 비공개인 influencer 제거
    # by_user_na_like_count = media_engagement_merged_df[media_engagement_merged_df['like_cnt'].isna()].groupby(['acnt_id'])['media_id'].count()
    # na_like_user = by_user_na_like_count[by_user_na_like_count > 0].index
    # media_engagement_merged_df = media_engagement_merged_df[~media_engagement_merged_df['acnt_id'].isin(na_like_user)].reset_index()

    ## 방법 2
    no_media_user = user_info_df[user_info_df['media_cnt'] == 0]['acnt_id'].to_list()
    media_engagement_merged_df = media_engagement_merged_df[~media_engagement_merged_df['acnt_id'].isin(no_media_user)].reset_index()

    media_engagement_merged_groupby_df = media_engagement_merged_df.groupby('acnt_id')[['like_cnt', 'cmnt_cnt']].mean()
    media_engagement_merged_groupby_df = np.ceil(media_engagement_merged_groupby_df)
    fillna_user = media_engagement_merged_groupby_df[media_engagement_merged_groupby_df['like_cnt'] > 1].index

    media_engagement_merged_df = media_engagement_merged_df[media_engagement_merged_df['acnt_id'].isin(fillna_user)].reset_index()

    engagement_cols = ['like_cnt', 'cmnt_cnt']
    for col in engagement_cols:
        media_engagement_merged_df[col] = media_engagement_merged_df.apply(
        lambda row: media_engagement_merged_groupby_df.at[row['acnt_id'], col] if pd.isna(row[col]) else row[col], axis=1)

    user_list = media_engagement_merged_df['acnt_id'].unique()
    media_list = media_engagement_merged_df['media_id'].unique()

    # merge하면서 제거된 리스트가 있기 때문에, 해당 부분 다시 삭제 후에 새로운 merge 파일 생성
    user_info = user_info_df[user_info_df['acnt_id'].isin(user_list)]
    timeseries = timeseries_df[timeseries_df['acnt_id'].isin(user_list)]
    timeseries_2 = timeseries_df_2[timeseries_df_2['acnt_id'].isin(user_list)]
    media_info = media_info_df[media_info_df['acnt_id'].isin(user_list)]
    media_agg = media_agg_df[media_agg_df['media_id'].isin(media_list)]

    all_merged_df_a = pd.merge(user_info, timeseries, on='acnt_id')
    all_merged_df_b = pd.merge(all_merged_df_a, media_info, on='acnt_id')
    all_merged_df = pd.merge(all_merged_df_b, media_agg, on='media_id')
    
    media_engagement_profile_merged_df = pd.merge(media_engagement_merged_df, user_info_df, on='acnt_id')
    time_series_merged_df = pd.merge(timeseries, timeseries_df_2, on='acnt_id')

    return user_info, timeseries, timeseries_2, media_info, media_agg, all_merged_df, media_engagement_merged_df, media_engagement_profile_merged_df, time_series_merged_df

In [26]:
user_info, timeseries, timeseries_2, media_info, media_agg, all_merged_df, media_engagement_merged_df, media_engagement_profile_merged_df, time_series_merged_df = create_merged_df(
                                                                                                                        recent_user_info_mtr_2,
                                                                                                                        time_series_profile_info,
                                                                                                                        time_series_profile_info_2,
                                                                                                                        by_user_id_media_dtl_info_2,
                                                                                                                        by_date_media_agg_info_2)

크리에이터 활동성

- 실제 게시물 업로드 날짜를 기준으로 계산할 수 있을 것 같음
- 최근 25개의 게시물의 평균 게시 간격을 계산
- 최근 25개의 게시물을 며칠에 걸쳐서 업로드 했는지에 대한 값을 계산

In [72]:
# media_dtl_2_copy = by_user_id_media_dtl_info_2.copy()
# media_dtl_2_copy = media_dtl_2_copy.drop_duplicates(subset=['acnt_id', 'media_id', 'media_cn'])
# media_dtl_2_copy['reg_dt'] = pd.to_datetime(media_dtl_2_copy['reg_dt'])
# media_dtl_2_copy = media_dtl_2_copy.sort_values(['acnt_id', 'reg_dt'])

# # 게시물 간격 계산
# media_dtl_2_copy['prev_reg_dt'] = media_dtl_2_copy.groupby('acnt_id')['reg_dt'].shift(1)
# media_dtl_2_copy['gap_days'] = (media_dtl_2_copy['reg_dt'] - media_dtl_2_copy['prev_reg_dt']).dt.days
# # media_dtl_2_copy

# # gap_days가 NaN인 첫 번째 포스트 제외 후 평균 간격 계산
# activity_df = media_dtl_2_copy.dropna(subset=['gap_days']).groupby('acnt_id')['gap_days'].mean().reset_index()
# activity_df.rename(columns={'gap_days': 'avg_upload_interval'}, inplace=True)

# # 활동성 점수 계산 (간격의 역수로 환산) -> 점수 정규화 (업로드 간격이 짧을수록 점수가 높아지도록 역수를 취해서 계산한 것)
# activity_df['activity_score'] = 100 / activity_df['avg_upload_interval']
# activity_df

In [27]:
def calculate_activity_score(recent_media_dtl_df): # 두 개의 테이블 중 가장 최근
    media_dtl_copy = recent_media_dtl_df.copy()
    media_dtl_copy = media_dtl_copy.drop_duplicates(subset=['acnt_id', 'media_id', 'media_cn'])
    media_dtl_copy['reg_dt'] = pd.to_datetime(media_dtl_copy['reg_dt'])
    media_dtl_copy = media_dtl_copy.sort_values(['acnt_id', 'reg_dt'])

    # 게시물 간격 계산
    media_dtl_copy['prev_reg_dt'] = media_dtl_copy.groupby('acnt_id')['reg_dt'].shift(1)
    media_dtl_copy['gap_days'] = (media_dtl_copy['reg_dt'] - media_dtl_copy['prev_reg_dt']).dt.days

    # gap_days가 NaN인 첫 번째 포스트 제외 후 평균 간격 계산
    activity_df = media_dtl_copy.dropna(subset=['gap_days']).groupby('acnt_id')['gap_days'].mean().reset_index()
    activity_df.rename(columns={'gap_days': 'avg_upload_interval'}, inplace=True)

    # 활동성 점수 계산 (간격의 역수로 환산) -> 점수 정규화 (업로드 간격이 짧을수록 점수가 높아지도록 역수를 취해서 계산한 것)
    activity_df['avg_upload_interval'] = activity_df['avg_upload_interval'].replace(0, 0.1)
    # activity_df['activity_score'] = activity_df['avg_upload_interval'] / 100
    ## 업로드 간격이 너무 짧은 유저의 경우 inf로 계산되는 것을 방지하기 위해서 scaling 진행
    # activity_df['activity_score'] = 100 / activity_df['avg_upload_interval']
    ## 너무 자주 올리면 오히려 게시물의 품질이 떨어질 수 있으므로 패널티를 주는 방식
    # activity_df['activity_score_penalty'] = 1 / (activity_df['avg_upload_interval'] + 1)
    activity_df['activity_score'] = (1 / activity_df['avg_upload_interval']) * 100

    return activity_df

In [28]:
activity_df = calculate_activity_score(media_info)

In [29]:
def check_inf(df):
    float_cols = df.select_dtypes(include=['float64', 'float32']).columns

    mask_inf = np.isinf(df[float_cols]).any(axis=1)
    mask_neginf = np.isneginf(df[float_cols]).any(axis=1)

    invalid_rows = df[mask_inf | mask_neginf]

    print(f"⚠️ inf / -inf 포함 행 개수: {len(invalid_rows)}개")
    display(invalid_rows)

In [30]:
check_inf(activity_df)

⚠️ inf / -inf 포함 행 개수: 0개


Unnamed: 0,acnt_id,avg_upload_interval,activity_score


트렌드지수 - 팔로워 순증감률 (follower_growth_rate)

In [77]:
# time_series_profile_info_2['acnt_id'] = time_series_profile_info_2['acnt_id'].astype(object)

# influencer_list = time_series_profile_info['acnt_id'].unique()
# time_series_profile_info_2[time_series_profile_info_2['acnt_id'].isin(influencer_list)]
# time_series_merged_df = pd.merge(time_series_profile_info, time_series_profile_info_2, on='acnt_id')

# time_series_merged_df['follow_growth_rate'] = ((time_series_merged_df['follower_cnt_y'] - time_series_merged_df['follower_cnt_x']) / (time_series_merged_df['follower_cnt_x'])) * 100
# time_series_merged_df[['acnt_id', 'follow_growth_rate']]

In [31]:
def calculate_follower_growth_rate(time_series_df, recent_time_series_df):
    time_series_df.loc[:, 'acnt_id'] = time_series_df['acnt_id'].astype(object)
    recent_time_series_df.loc[:, 'acnt_id'] = recent_time_series_df['acnt_id'].astype(object)

    influencer_list = time_series_df['acnt_id'].unique()
    recent_time_series_df = recent_time_series_df[recent_time_series_df['acnt_id'].isin(influencer_list)]
    time_series_merged_df = pd.merge(time_series_df, recent_time_series_df, on='acnt_id')

    time_series_merged_df['follow_growth_rate'] = ((time_series_merged_df['follower_cnt_y'] - time_series_merged_df['follower_cnt_x']) / (time_series_merged_df['follower_cnt_x'])) * 100
    # time_series_merged_df['follow_growth_rate'] = ((np.log1p(time_series_merged_df['follower_cnt_y']) - np.log1p(time_series_merged_df['follower_cnt_x'])) / np.log1p(time_series_merged_df['follower_cnt_x'])) * 100
    
    growth_rate_df = time_series_merged_df[['acnt_id', 'follow_growth_rate']]

    return growth_rate_df

In [32]:
growth_rate_df = calculate_follower_growth_rate(timeseries, timeseries_2)
growth_rate_df

Unnamed: 0,acnt_id,follow_growth_rate
0,17841405733424155,0.022193
1,17841402223583704,0.704415
2,17841401640546975,-0.045767
3,17841403089862976,-0.023128
4,17841450345136721,0.284900
...,...,...
6278,17841403970237139,0.911827
6279,17841402073953165,0.093721
6280,17841449637462181,-0.132685
6281,17841402334876637,-0.068456


##### 팔로워 참여도

- 26일 데이터를 기준으로 계산 진행
- 단일 프소트 기준이 아닌 전체 포스트 평균을 측정
- media_dtl & media_agg & user_data를 병합한 데이터

---------------------------------------------------------

- 좋아요 비공개인 경우에는 각각의 중앙값으로 대체 -> 문제는 아예 없는 사람은 어떻게 해야하는건지...
- estimated_total_engagement = avg_engagement_per_post * total_post_count
- avg_engagement_per_post = like+comments / 25
- total_post_count -> user_data

In [80]:
# media_engagement_merged_df = pd.merge(media_dtl, media_agg, on='media_id')


# # 해당 부분 수정 -> dic에 자동으로 median 값이 들어갈 수 있도록 수정
# media_engagement_merged_df.groupby(['acnt_id'])['like_cnt'].median()
# fillna_dict = {
#     '17841400591698216': 810,
#     '17841401506106699': 791,
#     '17841401839008777': 480,
#     '17841402936102997': 96,
#     '17841406083664639': 83,
#     '17841400361359004' : 0,
#     '17841401473518226' : 0
# }

# for acnt_id, default_like in fillna_dict.items():
#     condition = media_engagement_merged_df['acnt_id'] == acnt_id
#     media_engagement_merged_df.loc[condition, 'like_cnt'] = (
#         media_engagement_merged_df.loc[condition, 'like_cnt'].fillna(default_like)
#     )

# media_engagement_merged_df = pd.merge(media_engagement_merged_df, user_info_2,  on='acnt_id', how = 'left')
# media_engagement_merged_df_2 = media_engagement_merged_df[['acnt_id', 'media_id', 'follower_cnt', 'follow_cnt', 'like_cnt', 'cmnt_cnt', 'media_cnt']]

# engaged_df = media_engagement_merged_df_2.groupby(['acnt_id']).agg({
#     'like_cnt' : 'sum',
#     'cmnt_cnt' : 'sum',
#     'media_cnt': 'first',
#     'follower_cnt' : 'first',
#     'follow_cnt' : 'first'
# }).reset_index()

# engaged_df['avg_engagement_per_post'] = ((engaged_df['like_cnt'] + engaged_df['cmnt_cnt']) / 25)
# engaged_df['estimated_total_engagement'] = engaged_df['avg_engagement_per_post'] * engaged_df['media_cnt']
# engaged_df['follower_total_engagement'] = (engaged_df['estimated_total_engagement'] / engaged_df['follower_cnt']) * 100
# engaged_df

In [81]:
# def calculate_follower_engagement(media_engagement_profile_merged_df):
#     media_engagement_profile_merged_df_copy = media_engagement_profile_merged_df[['acnt_id', 'media_id', 'follower_cnt', 'follow_cnt', 'like_cnt', 'cmnt_cnt', 'media_cnt']]
    
#     engaged_df = media_engagement_profile_merged_df_copy.groupby(['acnt_id']).agg({
#         'like_cnt' : 'sum',
#         'cmnt_cnt' : 'sum',
#         'media_cnt': 'first',
#         'follower_cnt' : 'first',
#         'follow_cnt' : 'first'
#     }).reset_index()

#     # engaged_df['avg_engagement_per_post'] = ((engaged_df['like_cnt'] + engaged_df['cmnt_cnt']) / 25)
#     # engaged_df['estimated_total_engagement'] = engaged_df['avg_engagement_per_post'] * engaged_df['media_cnt']
#     # engaged_df['follower_total_engagement'] = (engaged_df['estimated_total_engagement'] / engaged_df['follower_cnt']) * 100

#     engaged_df['avg_engagement_per_post'] = ((engaged_df['like_cnt'] + engaged_df['cmnt_cnt']) / engaged_df['media_cnt']*engaged_df['follower_cnt'])
#     engaged_df['follower_total_engagement'] = engaged_df['avg_engagement_per_post'] * 100
    
#     follower_engagment_df = engaged_df

#     return follower_engagment_df

In [82]:
# follower_engagment_df = calculate_follower_engagement(media_engagement_profile_merged_df)

In [83]:
# check_inf(follower_engagment_df)

##### 팔로워 충성도

- user_info or time_series 데이터가 필요
- 트렌드 지수는 time_series 데이터를 가지고 단순히 빼기를 진행했었음

In [84]:
# time_series_merged_df_copy = time_series_merged_df[['acnt_id', 'follower_cnt_x', 'follower_cnt_y']]

# time_series_merged_df_copy['follower_change'] = (time_series_merged_df_copy['follower_cnt_y'] - time_series_merged_df_copy['follower_cnt_x'])

# def estimate_new_follower(row):
#     if row['follower_change'] < 0:
#         return 0
#     else:
#         return row['follower_change']

# time_series_merged_df_copy['new_follower'] = time_series_merged_df_copy.apply(estimate_new_follower, axis=1)

# # 충성도 계산
# # time_series_merged_df_copy = time_series_merged_time_series_merged_df_copy_copy.copy()
# time_series_merged_df_copy['unfollowed'] = time_series_merged_df_copy['follower_cnt_x'] + time_series_merged_df_copy['new_follower'] - time_series_merged_df_copy['follower_cnt_y']
# time_series_merged_df_copy['follower_retention_rate'] = ((time_series_merged_df_copy['follower_cnt_x'] - time_series_merged_df_copy['unfollowed']) / time_series_merged_df_copy['follower_cnt_x']) * 100
# time_series_merged_df_copy['follower_retention_rate'] = time_series_merged_df_copy['follower_retention_rate'].round(2)

# time_series_merged_df_copy

In [85]:
# def estimate_loyalty(row):
#     if row['follower_change'] < 0:
#         return ((row['follower_cnt_x'] + row['follower_change']) / row['follower_cnt_x']) * 100
#     else:
#         return 100.0

# time_series_merged_df_copy['follower_loyalty'] = time_series_merged_df_copy.apply(estimate_loyalty, axis=1)
# time_series_merged_df_copy['follower_loyalty'] = time_series_merged_df_copy['follower_loyalty'].round(2)


In [33]:
def calculate_follower_loyalty(time_series_merged_df):
    time_series_merged_df_copy = time_series_merged_df[['acnt_id', 'follower_cnt_x', 'follower_cnt_y']].copy()

    time_series_merged_df_copy.loc[:, 'follower_change'] = (time_series_merged_df_copy['follower_cnt_y'] - time_series_merged_df_copy['follower_cnt_x'])

    def estimate_new_follower(row):
        if row['follower_change'] < 0:
            return 0
        else:
            return row['follower_change']

    time_series_merged_df_copy.loc[:, 'new_follower'] = time_series_merged_df_copy.apply(estimate_new_follower, axis=1)
    time_series_merged_df_copy.loc[:, 'unfollowed'] = time_series_merged_df_copy['follower_cnt_x'] + time_series_merged_df_copy['new_follower'] - time_series_merged_df_copy['follower_cnt_y']
    time_series_merged_df_copy.loc[:, 'follower_retention_rate'] = ((time_series_merged_df_copy['follower_cnt_x'] - time_series_merged_df_copy['unfollowed']) / time_series_merged_df_copy['follower_cnt_x']) * 100
    time_series_merged_df_copy.loc[:, 'follower_retention_rate'] = time_series_merged_df_copy['follower_retention_rate'].round(2)

    follower_loyalty_df = time_series_merged_df_copy

    return follower_loyalty_df

In [34]:
follower_loyalty_df = calculate_follower_loyalty(time_series_merged_df)
follower_loyalty_df

Unnamed: 0,acnt_id,follower_cnt_x,follower_cnt_y,follower_change,new_follower,unfollowed,follower_retention_rate
0,17841405733424155,49566,49577,11,11,0,100.00
1,17841402223583704,70697,71195,498,498,0,100.00
2,17841401640546975,2185,2184,-1,0,1,99.95
3,17841403089862976,34590,34582,-8,0,8,99.98
4,17841450345136721,702,704,2,2,0,100.00
...,...,...,...,...,...,...,...
6278,17841403970237139,121295,122401,1106,1106,0,100.00
6279,17841402073953165,4268,4272,4,4,0,100.00
6280,17841449637462181,2261,2258,-3,0,3,99.87
6281,17841402334876637,14608,14598,-10,0,10,99.93


In [40]:
check_inf(follower_loyalty_df)

⚠️ inf / -inf 포함 행 개수: 0개


Unnamed: 0,acnt_id,follower_cnt_x,follower_cnt_y,follower_change,new_follower,unfollowed,follower_retention_rate


콘텐츠 효율성 평가

- 콘텐츠 1개당 팔로워 대비 반응량(like+comments)
- media_agg & media_dtl & acnt_id가 필요 -> 전부 병합한 게 media_engagement_merged_df

In [89]:
# media_engagement_merged_df_3 = media_engagement_profile_merged_df.copy()
# # media_engagment_merged_df_2 == media_engagement_profile_merged_df 같은 df임
# # media_engagement_profile_merged_df_copy = media_engagement_merged_df_2.copy()


# media_engagement_merged_df_3['post_efficiency'] = ((media_engagement_merged_df_3['like_cnt'] + media_engagement_merged_df_3['cmnt_cnt']) / media_engagement_merged_df_3['follower_cnt']) * 100
# post_efficiency_df = media_engagement_merged_df_3.groupby('acnt_id')['post_efficiency'].mean().reset_index()
# post_efficiency_df.rename(columns={'post_efficiency': 'avg_post_efficiency'}, inplace=True)
# post_efficiency_df

In [35]:
def calculate_post_efficiency_df(media_engagement_profile_merged_df):
    media_engagement_profile_merged_df_copy = media_engagement_profile_merged_df.copy()

    media_engagement_profile_merged_df_copy['post_efficiency'] = ((media_engagement_profile_merged_df_copy['like_cnt'] + media_engagement_profile_merged_df_copy['cmnt_cnt']) / media_engagement_profile_merged_df_copy['follower_cnt']) * 100
    post_efficiency_df = media_engagement_profile_merged_df_copy.groupby('acnt_id')['post_efficiency'].mean().reset_index()
    post_efficiency_df.rename(columns={'post_efficiency': 'avg_post_efficiency'}, inplace=True)

    return post_efficiency_df

In [36]:
user_info[user_info['acnt_id']=='17841401218998222']

Unnamed: 0,acnt_id,acnt_nm,web_addr,acnt_sub_nm,intro_txt,profile_photo_url_addr,acnt_conn_yn,category_nm,follower_cnt,follow_cnt,media_cnt,influencer_scale_type


In [37]:
post_efficiency_df = calculate_post_efficiency_df(media_engagement_profile_merged_df)
post_efficiency_df

Unnamed: 0,acnt_id,avg_post_efficiency
0,17841400000114584,1.060444
1,17841400001061294,0.499417
2,17841400002751973,0.118471
3,17841400003230199,5.113817
4,17841400003646457,2.920919
...,...,...
6278,17841473488240292,33.955602
6279,17841473857435791,11.588172
6280,17841474311083136,27.777778
6281,17841474495202295,3.353625


In [43]:
check_inf(post_efficiency_df)

⚠️ inf / -inf 포함 행 개수: 0개


Unnamed: 0,acnt_id,avg_post_efficiency


광고효율성

In [77]:
not_conn_user_main_category_info['add1'] = not_conn_user_main_category_info['acnt_nm']

In [78]:
db_merged_data = pd.merge(sales_info, not_conn_user_main_category_info, on='add1')

In [79]:
db_merged_data = db_merged_data[['uid', 'user_id', 'member_uid', 'add1', 'acnt_id', 'acnt_nm', 'storeid', 'total_visit', 'total_order', 'match_total_price']]
db_merged_data

Unnamed: 0,uid,uid.1,user_id,member_uid,add1,acnt_id,acnt_nm,storeid,total_visit,total_order,match_total_price
0,5196,254254,samelee48,5196,__dongx2_,17841450980480576,__dongx2_,samelee48,452,20,1566750
1,5546,254123,test_design,5546,_movingday,17841401293139137,_movingday,test_design,165,2,125000
2,5616,254116,alsdud0907,5616,gogojiyul_,17841406857146322,gogojiyul_,alsdud0907,1034,72,2815900
3,5655,254109,junjuly4,5655,nanmoosu,17841401390908388,nanmoosu,,0,0,0
4,5844,254033,heihwi,5844,heihwi,17841449549923448,heihwi,heihwi,378,0,0
...,...,...,...,...,...,...,...,...,...,...,...
259,12427,250957,minawriter@naver.com,12427,twinkle_star_0710,17841449994562531,twinkle_star_0710,,0,0,0
260,12436,250950,seayoung13@nate.com,12436,sae_young2,17841404180027739,sae_young2,,0,0,0
261,11805,220974,oaneeeey@naver.com,11805,0_aneeey,17841441135111784,0_aneeey,luamon,38,0,0
262,11569,115017,metal_lica@naver.com,11569,opeth_choi,17841432830453327,opeth_choi,,0,0,0


In [80]:
db_merged_data['acnt_id'] = db_merged_data['acnt_id'].astype(str)

In [81]:
db_merged_data_2 = pd.merge(db_merged_data, post_efficiency_df, on='acnt_id', how='left')

In [82]:
# 여기서 post_efficiency가 없는 애들은 전부 post engagement 값을 비공개 해둬서 제외한 인플루언서들임
# gogojiyul_의 경우에는 aws 수집 리스트에 없었음
db_merged_data_2[(db_merged_data_2['total_order']!=0) & (db_merged_data_2['match_total_price']!=0)]

Unnamed: 0,uid,uid.1,user_id,member_uid,add1,acnt_id,acnt_nm,storeid,total_visit,total_order,match_total_price,avg_post_efficiency
0,5196,254254,samelee48,5196,__dongx2_,17841450980480576,__dongx2_,samelee48,452,20,1566750,
1,5546,254123,test_design,5546,_movingday,17841401293139137,_movingday,test_design,165,2,125000,12.165388
2,5616,254116,alsdud0907,5616,gogojiyul_,17841406857146322,gogojiyul_,alsdud0907,1034,72,2815900,
7,5932,254002,seojinii,5932,seojinii_,17841402936102997,seojinii_,seojinii,1352,424,24539240,0.910451
11,6682,253689,goeun,6682,goneemama,17841449913495321,goneemama,goneemama,234,10,145000,1.65625
17,6839,253631,ovvl,6839,ovvl_suji,17841401218998222,ovvl_suji,ovvl,4000,104,10769380,
18,6997,253583,ididas,6997,bong_camper83,17841453615191128,bong_camper83,ididas,450,114,6832200,
22,7979,253217,cyeone,7979,c.yeon.e,17841404564270314,c.yeon.e,honey,21231,3391,123402200,12.427208
73,10403,251847,ggubimom@hanmail.net,10403,naegyeong2,17841412336851729,naegyeong2,ggubimom,398,10,277000,4.145946
107,11243,251620,kgb20019@kakao.com,11243,s2._.ss__,17841413126192025,s2._.ss__,kgb20019,413,2,296000,3.915425


In [83]:
db_merged_data_3= db_merged_data_2[(db_merged_data_2['total_order']!=0) & (db_merged_data_2['match_total_price']!=0)].dropna()
db_merged_data_3

Unnamed: 0,uid,uid.1,user_id,member_uid,add1,acnt_id,acnt_nm,storeid,total_visit,total_order,match_total_price,avg_post_efficiency
1,5546,254123,test_design,5546,_movingday,17841401293139137,_movingday,test_design,165,2,125000,12.165388
7,5932,254002,seojinii,5932,seojinii_,17841402936102997,seojinii_,seojinii,1352,424,24539240,0.910451
11,6682,253689,goeun,6682,goneemama,17841449913495321,goneemama,goneemama,234,10,145000,1.65625
22,7979,253217,cyeone,7979,c.yeon.e,17841404564270314,c.yeon.e,honey,21231,3391,123402200,12.427208
73,10403,251847,ggubimom@hanmail.net,10403,naegyeong2,17841412336851729,naegyeong2,ggubimom,398,10,277000,4.145946
107,11243,251620,kgb20019@kakao.com,11243,s2._.ss__,17841413126192025,s2._.ss__,kgb20019,413,2,296000,3.915425


In [84]:
db_merged_data_3['ad_efficiency'] = db_merged_data_3['total_order'] / db_merged_data_3['avg_post_efficiency']
db_merged_data_3

Unnamed: 0,uid,uid.1,user_id,member_uid,add1,acnt_id,acnt_nm,storeid,total_visit,total_order,match_total_price,avg_post_efficiency,ad_efficiency
1,5546,254123,test_design,5546,_movingday,17841401293139137,_movingday,test_design,165,2,125000,12.165388,0.164401
7,5932,254002,seojinii,5932,seojinii_,17841402936102997,seojinii_,seojinii,1352,424,24539240,0.910451,465.703161
11,6682,253689,goeun,6682,goneemama,17841449913495321,goneemama,goneemama,234,10,145000,1.65625,6.037736
22,7979,253217,cyeone,7979,c.yeon.e,17841404564270314,c.yeon.e,honey,21231,3391,123402200,12.427208,272.869022
73,10403,251847,ggubimom@hanmail.net,10403,naegyeong2,17841412336851729,naegyeong2,ggubimom,398,10,277000,4.145946,2.411995
107,11243,251620,kgb20019@kakao.com,11243,s2._.ss__,17841413126192025,s2._.ss__,kgb20019,413,2,296000,3.915425,0.5108


In [85]:
ad_efficiency_df = db_merged_data_3[['acnt_id', 'ad_efficiency']]
ad_efficiency_df

Unnamed: 0,acnt_id,ad_efficiency
1,17841401293139137,0.164401
7,17841402936102997,465.703161
11,17841449913495321,6.037736
22,17841404564270314,272.869022
73,17841412336851729,2.411995
107,17841413126192025,0.5108


In [None]:
def calculate_ad_efficiency(not_conn_user_main_category_info, sales_info):
    not_conn_user_main_category_info['add1'] = not_conn_user_main_category_info['acnt_nm']
    db_merged_data = pd.merge(sales_info, not_conn_user_main_category_info, on='add1')
    db_merged_data = db_merged_data[['uid', 'user_id', 'member_uid', 'add1', 'acnt_id', 'acnt_nm', 'storeid', 'total_visit', 'total_order', 'match_total_price']]

    db_merged_data['acnt_id'] = db_merged_data['acnt_id'].astype(str)
    db_merged_data_2 = pd.merge(db_merged_data, post_efficiency_df, on='acnt_id', how='left')

    db_merged_data_3= db_merged_data_2[(db_merged_data_2['total_order']!=0) & (db_merged_data_2['match_total_price']!=0)].dropna()
    db_merged_data_3['ad_efficiency'] = db_merged_data_3['total_order'] / db_merged_data_3['avg_post_efficiency']
    
    return db_merged_data_3

##### not_connected_influencer_flexmatch_score

In [None]:
# def not_connected_user_flexmatch_score(activity_df, growth_rate_df, follower_engagement_df, follower_loyalty_df, post_efficiency_df):
#     # 크리에이터 활동성
#     creator_activity_score = activity_df[['acnt_id', 'activity_score']]
#     # 트렌드지수
#     creator_follow_growth_rate = growth_rate_df[['acnt_id', 'follow_growth_rate']]
#     # 팔로워 참여도
#     follower_engagement = follower_engagement_df[['acnt_id', 'follower_total_engagement']]
#     # 팔로워 충성도
#     follower_loyalty = follower_loyalty_df[['acnt_id', 'follower_retention_rate']]
#     # 콘텐츠 효율성
#     post_efficiency = post_efficiency_df[['acnt_id', 'avg_post_efficiency']]

#     # data_list
#     df_list = [creator_activity_score, creator_follow_growth_rate, follower_engagement, follower_loyalty, post_efficiency]

#     from functools import reduce

#     flexmatch_score = reduce(lambda left, right: pd.merge(left, right, on='acnt_id', how='left'), df_list)
#     user_info_nm = recent_user_info_mtr[['acnt_id', 'acnt_nm', 'influencer_scale_type']]
#     flexmatch_score = pd.merge(flexmatch_score, user_info_nm, on='acnt_id')
#     flexmatch_score = flexmatch_score[['acnt_id', 'acnt_nm', 'influencer_scale_type', 'activity_score', 'follow_growth_rate', 'follower_total_engagement', 'follower_retention_rate', 'avg_post_efficiency']]


#     not_connected_flexmatch_score_table = flexmatch_score.copy()
#     not_connected_flexmatch_score_table.dropna(inplace=True)
    
#     return not_connected_flexmatch_score_table

In [None]:
def not_connected_user_flexmatch_score(activity_df, growth_rate_df, follower_loyalty_df, post_efficiency_df):
    # 크리에이터 활동성
    creator_activity_score = activity_df[['acnt_id', 'activity_score']]
    # 트렌드지수
    creator_follow_growth_rate = growth_rate_df[['acnt_id', 'follow_growth_rate']]
    # 팔로워 참여도
    # follower_engagement = follower_engagement_df[['acnt_id', 'follower_total_engagement']]
    # 팔로워 충성도
    follower_loyalty = follower_loyalty_df[['acnt_id', 'follower_retention_rate']]
    # 콘텐츠 효율성
    post_efficiency = post_efficiency_df[['acnt_id', 'avg_post_efficiency']]

    # data_list
    df_list = [creator_activity_score, creator_follow_growth_rate, follower_loyalty, post_efficiency]

    from functools import reduce

    flexmatch_score = reduce(lambda left, right: pd.merge(left, right, on='acnt_id', how='left'), df_list)
    user_info_nm = recent_user_info_mtr[['acnt_id', 'acnt_nm', 'influencer_scale_type']]
    flexmatch_score = pd.merge(flexmatch_score, user_info_nm, on='acnt_id')
    flexmatch_score = flexmatch_score[['acnt_id', 'acnt_nm', 'influencer_scale_type', 'activity_score', 'follow_growth_rate', 'follower_retention_rate', 'avg_post_efficiency']]


    not_connected_flexmatch_score_table = flexmatch_score.copy()
    not_connected_flexmatch_score_table.dropna(inplace=True)
    
    return not_connected_flexmatch_score_table

In [47]:
## 광고 효율성 부분 추가

def not_connected_user_flexmatch_score(activity_df, growth_rate_df, follower_loyalty_df, post_efficiency_df, ad_efficiency_df):
    # 크리에이터 활동성
    creator_activity_score = activity_df[['acnt_id', 'activity_score']]
    # 트렌드지수
    creator_follow_growth_rate = growth_rate_df[['acnt_id', 'follow_growth_rate']]
    # 팔로워 참여도
    # follower_engagement = follower_engagement_df[['acnt_id', 'follower_total_engagement']]
    # 팔로워 충성도
    follower_loyalty = follower_loyalty_df[['acnt_id', 'follower_retention_rate']]
    # 콘텐츠 효율성
    post_efficiency = post_efficiency_df[['acnt_id', 'avg_post_efficiency']]
    # 광고효율성
    ad_efficiency = ad_efficiency_df[['acnt_id', 'ad_efficiency']]

    # data_list
    df_list = [creator_activity_score, creator_follow_growth_rate, follower_loyalty, post_efficiency, ad_efficiency]

    from functools import reduce

    flexmatch_score = reduce(lambda left, right: pd.merge(left, right, on='acnt_id', how='left'), df_list)
    flexmatch_score['ad_efficiency'] = flexmatch_score['ad_efficiency'].fillna(0)
    
    user_info_nm = recent_user_info_mtr[['acnt_id', 'acnt_nm', 'influencer_scale_type']]
    flexmatch_score = pd.merge(flexmatch_score, user_info_nm, on='acnt_id')
    flexmatch_score = flexmatch_score[['acnt_id', 'acnt_nm', 'influencer_scale_type', 'activity_score', 'follow_growth_rate', 'follower_retention_rate', 'avg_post_efficiency', 'ad_efficiency']]


    not_connected_flexmatch_score_table = flexmatch_score.copy()
    not_connected_flexmatch_score_table.dropna(inplace=True)
    
    return not_connected_flexmatch_score_table

In [86]:
not_connected_flexmatch_score_table = not_connected_user_flexmatch_score(activity_df, growth_rate_df, follower_loyalty_df, post_efficiency_df, ad_efficiency_df)
not_connected_flexmatch_score_table

Unnamed: 0,acnt_id,acnt_nm,influencer_scale_type,activity_score,follow_growth_rate,follower_retention_rate,avg_post_efficiency,ad_efficiency
0,17841400000114584,imyeonduu,macro,60.493827,-0.062194,99.94,1.060444,0.0
1,17841400001061294,chloe____min,micro,37.984496,-0.083222,99.92,0.499417,0.0
2,17841400002751973,licoco___jin,mid,326.666667,-0.013096,99.99,0.118471,0.0
3,17841400003230199,joyfully.da,micro,188.461538,0.376117,100.00,5.113817,0.0
4,17841400003646457,__me__ne,macro,116.666667,1.199362,100.00,2.920919,0.0
...,...,...,...,...,...,...,...,...
6273,17841473488240292,fatalfury_pr,mid,272.222222,2.653504,100.00,33.955602,0.0
6274,17841473857435791,sesebonbon,micro,257.894737,3.275958,100.00,11.588172,0.0
6275,17841474311083136,hany_log,nano,5.555556,0.000000,100.00,27.777778,0.0
6276,17841474495202295,susu_23.06.30,nano,46.153846,-0.855920,99.14,3.353625,0.0


In [None]:
# not_connected_flexmatch_score_table.to_csv("C:/Users/ehddl/Downloads/not_connected_flexmatch_score_table.csv")

In [87]:
seller_interest_info['ig_user_id'] = seller_interest_info['ig_user_id'].replace('', np.nan, regex=True)
conn_list = seller_interest_info[(seller_interest_info['ig_user_id'].notnull()) & (seller_interest_info['ig_user_id'] != '')]['ig_user_id'].to_list()
not_conn_user = seller_interest_info[~seller_interest_info['ig_user_id'].isin(conn_list)]
not_conn_user = not_conn_user[['add1', 'interestcategory']]

In [88]:
def clean_acnt_nm(value):
    if pd.isnull(value):
        return None
    
    value = str(value)  # 혹시 모르니 문자열로 변환
    match = re.search(r'instagram\.com/([^/]+)', value)
    if match:
        return match.group(1)
    else:
        return value.strip()
    
not_conn_user['acnt_nm'] = not_conn_user['add1'].apply(clean_acnt_nm)

In [89]:
not_conn_user

Unnamed: 0,add1,interestcategory,acnt_nm
0,13F80C4C43DC63E772650995CA18639F,,13F80C4C43DC63E772650995CA18639F
1,su.jin.1995,패션@뷰티@푸드@서비스,su.jin.1995
2,oddity,,oddity
3,https://www.instagram.com/flex.item/?hl=,,flex.item
4,https://www.instagram.com/su.jin.1995/,전체,su.jin.1995
...,...,...,...
1720,ahojihye,,ahojihye
1721,yourfit_diet,,yourfit_diet
1722,min_story1,패션@뷰티@홈/리빙@푸드@헬시,min_story1
1723,m2n_company,,m2n_company


In [90]:
not_connected_flexmatch_score_table = pd.merge(not_connected_flexmatch_score_table, not_conn_user, on='acnt_nm', how='left')
not_connected_flexmatch_score_table['interestcategory'] = not_connected_flexmatch_score_table['interestcategory'].fillna('뷰티')

In [91]:
not_connected_flexmatch_score_table['interestcategory'] = not_connected_flexmatch_score_table['interestcategory'].apply(
    lambda x: '뷰티' if pd.isna(x) or (isinstance(x, str) and x.strip() == '') else x)

In [92]:
category_map = {
            'BABY/KIDS': '베이비/키즈',
            'BEAUTY': '뷰티',
            'FASHION': '패션',
            'FOOD': '푸드',
            'HEALTHY': '헬시',
            'HOME/LIVING': '홈/리빙',
            'SERVICE': '서비스',
            'SPORT': '스포츠',
            'TEST 카테고리.. TEST': '뷰티'
        }

for k, v in category_map.items():
    not_connected_flexmatch_score_table['interestcategory'] = not_connected_flexmatch_score_table['interestcategory'].str.replace(k, v)

In [93]:
not_conn_user_main_category_info['acnt_id'] = not_conn_user_main_category_info['acnt_id'].astype(str)

In [94]:
not_conn_user_main_category_info

Unnamed: 0,acnt_id,acnt_nm,main_category,top_3_category,is_connected,add1
0,17841400000175995,cheol2746,유명장소/핫플,유명장소/핫플@다이어트/건강보조식품,n,cheol2746
1,17841400000253513,hari_ya,다이어트/건강보조식품,다이어트/건강보조식품@일상@엔터테인먼트,n,hari_ya
2,17841400000424290,izabellametz,일상,일상@여행/관광@패션,n,izabellametz
3,17841400000500841,jin_hwang,일상,일상@푸드@베이비/키즈,n,jin_hwang
4,17841400000504076,david.02.25,스포츠,스포츠@다이어트/건강보조식품@일상,n,david.02.25
...,...,...,...,...,...,...
37212,17841449549923448,heihwi,다이어트/건강보조식품,다이어트/건강보조식품@패션@일상,y,heihwi
37213,17841450980480576,__dongx2_,스포츠,스포츠@일상@다이어트/건강보조식품,y,__dongx2_
37214,17841453615191128,bong_camper83,홈/리빙,홈/리빙@유명장소/핫플@여행/관광,y,bong_camper83
37215,17841467094117763,hooyexni,일상,일상@여행/관광@뷰티,y,hooyexni


In [95]:
common_cols = not_connected_flexmatch_score_table.columns.intersection(not_conn_user_main_category_info.columns)
common_cols

Index(['acnt_id', 'acnt_nm', 'add1'], dtype='object')

In [96]:
not_conn_user_main_category_info.drop(['acnt_nm', 'add1'], axis=1, inplace=True)

In [97]:
not_connected_flexmatch_score_tables = pd.merge(not_connected_flexmatch_score_table, not_conn_user_main_category_info, on='acnt_id', how='inner')

In [98]:
not_connected_flexmatch_score_tables

Unnamed: 0,acnt_id,acnt_nm,influencer_scale_type,activity_score,follow_growth_rate,follower_retention_rate,avg_post_efficiency,ad_efficiency,add1,interestcategory,main_category,top_3_category,is_connected
0,17841400001061294,chloe____min,micro,37.984496,-0.083222,99.92,0.499417,0.0,,뷰티,베이비/키즈,베이비/키즈,n
1,17841400002751973,licoco___jin,mid,326.666667,-0.013096,99.99,0.118471,0.0,https://instagram.com/licoco___jin,뷰티,베이비/키즈,베이비/키즈@뷰티@푸드,n
2,17841400003230199,joyfully.da,micro,188.461538,0.376117,100.00,5.113817,0.0,,뷰티,여행/관광,여행/관광@유명장소/핫플@베이비/키즈,n
3,17841400003646457,__me__ne,macro,116.666667,1.199362,100.00,2.920919,0.0,,뷰티,여행/관광,여행/관광@패션@셀럽,n
4,17841400005463628,therock,mega,85.964912,-0.017945,99.98,0.089340,0.0,,뷰티,셀럽,셀럽@엔터테인먼트@여행/관광,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4919,17841473488240292,fatalfury_pr,mid,272.222222,2.653504,100.00,33.955602,0.0,,뷰티,여행/관광,여행/관광@엔터테인먼트@스포츠,n
4920,17841473857435791,sesebonbon,micro,257.894737,3.275958,100.00,11.588172,0.0,sesebonbon,뷰티,뷰티,뷰티@유명장소/핫플@사진/영상,n
4921,17841474311083136,hany_log,nano,5.555556,0.000000,100.00,27.777778,0.0,,뷰티,일상,일상,n
4922,17841474495202295,susu_23.06.30,nano,46.153846,-0.855920,99.14,3.353625,0.0,susu_23.06.30,뷰티,반려동물,반려동물,n


In [99]:
not_connected_flexmatch_score_tables = not_connected_flexmatch_score_tables.drop_duplicates(subset=['acnt_id', 'acnt_nm'])

In [100]:
not_connected_flexmatch_score_tables['influencer_scale_type'].value_counts()

influencer_scale_type
mid      1793
micro    1440
macro     843
mega      413
nano      411
Name: count, dtype: int64

In [101]:
not_connected_flexmatch_score_tables

Unnamed: 0,acnt_id,acnt_nm,influencer_scale_type,activity_score,follow_growth_rate,follower_retention_rate,avg_post_efficiency,ad_efficiency,add1,interestcategory,main_category,top_3_category,is_connected
0,17841400001061294,chloe____min,micro,37.984496,-0.083222,99.92,0.499417,0.0,,뷰티,베이비/키즈,베이비/키즈,n
1,17841400002751973,licoco___jin,mid,326.666667,-0.013096,99.99,0.118471,0.0,https://instagram.com/licoco___jin,뷰티,베이비/키즈,베이비/키즈@뷰티@푸드,n
2,17841400003230199,joyfully.da,micro,188.461538,0.376117,100.00,5.113817,0.0,,뷰티,여행/관광,여행/관광@유명장소/핫플@베이비/키즈,n
3,17841400003646457,__me__ne,macro,116.666667,1.199362,100.00,2.920919,0.0,,뷰티,여행/관광,여행/관광@패션@셀럽,n
4,17841400005463628,therock,mega,85.964912,-0.017945,99.98,0.089340,0.0,,뷰티,셀럽,셀럽@엔터테인먼트@여행/관광,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4919,17841473488240292,fatalfury_pr,mid,272.222222,2.653504,100.00,33.955602,0.0,,뷰티,여행/관광,여행/관광@엔터테인먼트@스포츠,n
4920,17841473857435791,sesebonbon,micro,257.894737,3.275958,100.00,11.588172,0.0,sesebonbon,뷰티,뷰티,뷰티@유명장소/핫플@사진/영상,n
4921,17841474311083136,hany_log,nano,5.555556,0.000000,100.00,27.777778,0.0,,뷰티,일상,일상,n
4922,17841474495202295,susu_23.06.30,nano,46.153846,-0.855920,99.14,3.353625,0.0,susu_23.06.30,뷰티,반려동물,반려동물,n


In [53]:
# not_connected_flexmatch_score_table[(not_connected_flexmatch_score_table['influencer_scale_type'] == 'macro') & (not_connected_flexmatch_score_table['interestcategory']=='뷰티')]

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# import pandas as pd
# import numpy as np

# def normalize_influencer_scores(influencer_scale_names, influencer_scale_df_list, reverse_columns=None, feature_range=(0, 5)):

#     if reverse_columns is None:
#         reverse_columns = []

#     normalized_df_dict = {}

#     for name, df in zip(influencer_scale_names, influencer_scale_df_list):
#         cleaned = df.copy()

#         # 무한대 및 NaN 제거
#         float_cols = cleaned.select_dtypes(include='float64').columns
#         cleaned[float_cols] = cleaned[float_cols].replace([np.inf, -np.inf], np.nan)
#         cleaned = cleaned.dropna(subset=float_cols)

#         if cleaned.empty:
#             continue

#         norm_df = pd.DataFrame(index=cleaned.index)
#         for col in float_cols:
#             scaler = MinMaxScaler(feature_range=feature_range)
#             norm_col = scaler.fit_transform(cleaned[[col]])
#             if col in reverse_columns:
#                 norm_df[col] = feature_range[1] - norm_col.ravel()
#             else:
#                 norm_df[col] = norm_col.ravel()

#         # ID 및 이름, 스케일 타입 추가
#         norm_df['acnt_id'] = cleaned['acnt_id'].values
#         norm_df['acnt_nm'] = cleaned['acnt_nm'].values
#         norm_df['influencer_scale_type'] = name

#         normalized_df_dict[name] = norm_df

#     normalized_all_df = pd.concat(normalized_df_dict.values(), ignore_index=True)
#     normalized_all_dic = normalized_all_df.to_dict(orient='index')

#     return normalized_all_df, normalized_all_dic


In [None]:
# def normalize_influencer_scores(
#     influencer_scale_names, 
#     influencer_scale_df_list, 
#     reverse_columns=None, 
#     log_columns=None, 
#     feature_range=(0, 5)
# ):
#     if reverse_columns is None:
#         reverse_columns = []
#     if log_columns is None:
#         log_columns = []

#     normalized_df_dict = {}

#     for name, df in zip(influencer_scale_names, influencer_scale_df_list):
#         cleaned = df.copy()

#         # 무한대 및 NaN 제거
#         float_cols = cleaned.select_dtypes(include='float64').columns
#         cleaned[float_cols] = cleaned[float_cols].replace([np.inf, -np.inf], np.nan)
#         cleaned = cleaned.dropna(subset=float_cols)

#         if cleaned.empty:
#             continue

#         norm_df = pd.DataFrame(index=cleaned.index)
#         for col in float_cols:
#             col_data = cleaned[col]
#             if col in log_columns:
#                 col_data = np.log1p(col_data)
#             scaler = MinMaxScaler(feature_range=feature_range)
#             norm_col = scaler.fit_transform(col_data.values.reshape(-1, 1))
#             if col in reverse_columns:
#                 norm_df[col] = feature_range[1] - norm_col.ravel()
#             else:
#                 norm_df[col] = norm_col.ravel()

#         norm_df['acnt_id'] = cleaned['acnt_id'].values
#         norm_df['acnt_nm'] = cleaned['acnt_nm'].values
#         norm_df['influencer_scale_type'] = name

#         normalized_df_dict[name] = norm_df

#     normalized_all_df = pd.concat(normalized_df_dict.values(), ignore_index=True)
#     normalized_all_dic = normalized_all_df.to_dict(orient='index')

#     return normalized_all_df, normalized_all_dic


In [102]:
nano = not_connected_flexmatch_score_tables[not_connected_flexmatch_score_tables['influencer_scale_type']=='nano']
micro = not_connected_flexmatch_score_tables[not_connected_flexmatch_score_tables['influencer_scale_type']=='micro']
mid = not_connected_flexmatch_score_tables[not_connected_flexmatch_score_tables['influencer_scale_type']=='mid']
macro = not_connected_flexmatch_score_tables[not_connected_flexmatch_score_tables['influencer_scale_type']=='macro']
mega = not_connected_flexmatch_score_tables[not_connected_flexmatch_score_tables['influencer_scale_type']=='mega']

In [103]:
# def normalize_influencer_scores(
#     influencer_scale_names, 
#     influencer_scale_df_list, 
#     reverse_columns=None, 
#     log_columns=None, 
#     feature_range=(1, 5)
# ):
#     if reverse_columns is None:
#         reverse_columns = []
#     if log_columns is None:
#         log_columns = []

#     normalized_df_list = []

#     for name, df in zip(influencer_scale_names, influencer_scale_df_list):
#         cleaned = df.copy()

#         # 무한대 및 NaN 제거
#         float_cols = cleaned.select_dtypes(include='float64').columns
#         cleaned[float_cols] = cleaned[float_cols].replace([np.inf, -np.inf], np.nan)
#         cleaned = cleaned.dropna(subset=float_cols)

#         if cleaned.empty:
#             continue

#         # 관심 카테고리 첫 번째 값 추출
#         if 'interestcategory' in cleaned.columns:
#             cleaned['main_interest_category'] = cleaned['interestcategory'].apply(
#                 lambda x: str(x).split('@')[0] if pd.notnull(x) else '뷰티'
#             )
#         else:
#             cleaned['main_interest_category'] = '뷰티'

#         # (관심카테고리, 스케일타입) 조합별 그룹화
#         cleaned['influencer_scale_type'] = name  # 확실히 넣어두기
#         grouped = cleaned.groupby(['main_interest_category', 'influencer_scale_type'])

#         for (category, scale_type), group in grouped:
#             norm_df = pd.DataFrame(index=group.index)

#             # 그룹 내 인원 수 체크
#             if len(group) == 1:
#                 # 1명인 경우 모든 점수를 1.0으로 고정
#                 for col in float_cols:
#                     norm_df.loc[group.index, col] = 3.0
#             else:
#                 # 여러명인 경우 정규화 진행
#                 for col in float_cols:
#                     col_data = group[col]
#                     if col in log_columns:
#                         col_data = np.log1p(col_data)

#                     max_val = col_data.max()
#                     min_val = col_data.min()

#                     if max_val == min_val:
#                         # 모든 값이 동일하면 1.0 고정
#                         norm_df.loc[group.index, col] = 3.0
#                     else:
#                         scaler = MinMaxScaler(feature_range=feature_range)
#                         norm_col = scaler.fit_transform(col_data.values.reshape(-1, 1))
#                         if col in reverse_columns:
#                             norm_df[col] = feature_range[1] - norm_col.ravel()
#                         else:
#                             norm_df[col] = norm_col.ravel()

#             # 공통 컬럼 복사
#             norm_df['acnt_id'] = group['acnt_id'].values
#             norm_df['acnt_nm'] = group['acnt_nm'].values
#             norm_df['influencer_scale_type'] = scale_type
#             norm_df['main_interest_category'] = category

#             normalized_df_list.append(norm_df)

#     normalized_all_df = pd.concat(normalized_df_list, ignore_index=True)
#     normalized_all_dic = normalized_all_df.to_dict(orient='index')

#     return normalized_all_df, normalized_all_dic

def normalize_influencer_scores(
    influencer_scale_names, 
    influencer_scale_df_list, 
    reverse_columns=None, 
    log_columns=None, 
    feature_range=(1, 5)
):
    if reverse_columns is None:
        reverse_columns = []
    if log_columns is None:
        log_columns = []

    normalized_df_list = []

    for name, df in zip(influencer_scale_names, influencer_scale_df_list):
        cleaned = df.copy()

        # 무한대 및 NaN 제거
        float_cols = cleaned.select_dtypes(include='float64').columns
        cleaned[float_cols] = cleaned[float_cols].replace([np.inf, -np.inf], np.nan)
        cleaned = cleaned.dropna(subset=float_cols)

        if cleaned.empty:
            continue

        # 관심 카테고리 첫 번째 값 추출
        if 'interestcategory' in cleaned.columns:
            cleaned['main_interest_category'] = cleaned['interestcategory'].apply(
                lambda x: str(x).split('@')[0] if pd.notnull(x) else '뷰티'
            )
        else:
            cleaned['main_interest_category'] = '뷰티'

        # (관심카테고리, 스케일타입) 조합별 그룹화
        cleaned['influencer_scale_type'] = name  # 확실히 넣어두기
        grouped = cleaned.groupby(['main_category', 'influencer_scale_type'])

        for (category, scale_type), group in grouped:
            norm_df = pd.DataFrame(index=group.index)

            # 그룹 내 인원 수 체크
            if len(group) == 1:
                # 1명인 경우 모든 점수를 1.0으로 고정
                for col in float_cols:
                    norm_df.loc[group.index, col] = 3.0
            else:
                # 여러명인 경우 정규화 진행
                for col in float_cols:
                    col_data = group[col]
                    if col in log_columns:
                        col_data = np.log1p(col_data)

                    max_val = col_data.max()
                    min_val = col_data.min()

                    if max_val == min_val:
                        # 모든 값이 동일하면 1.0 고정
                        norm_df.loc[group.index, col] = 3.0
                    else:
                        scaler = MinMaxScaler(feature_range=feature_range)
                        norm_col = scaler.fit_transform(col_data.values.reshape(-1, 1))
                        if col in reverse_columns:
                            norm_df[col] = feature_range[1] - norm_col.ravel()
                        else:
                            norm_df[col] = norm_col.ravel()

            # 공통 컬럼 복사
            norm_df['acnt_id'] = group['acnt_id'].values
            norm_df['acnt_nm'] = group['acnt_nm'].values
            norm_df['influencer_scale_type'] = scale_type
            norm_df['main_interest_category'] = group['main_interest_category'].values
            norm_df['main_category'] = category
            norm_df['top_3_category'] = group['top_3_category'].values
            

            normalized_df_list.append(norm_df)

    normalized_all_df = pd.concat(normalized_df_list, ignore_index=True)
    normalized_all_dic = normalized_all_df.to_dict(orient='index')

    return normalized_all_df, normalized_all_dic


In [104]:
influencer_scale_names=['nano', 'micro', 'mid', 'macro', 'mega']
influencer_scale_df_list=[nano, micro, mid, macro, mega]

normalized_df, normalized_all_dic = normalize_influencer_scores(influencer_scale_names, influencer_scale_df_list, reverse_columns='activity_score')

In [105]:
normalized_all_dic

{0: {'activity_score': 0.0,
  'follow_growth_rate': 3.0,
  'follower_retention_rate': 3.0,
  'avg_post_efficiency': 5.0,
  'ad_efficiency': 3.0,
  'acnt_id': '17841401530775838',
  'acnt_nm': 'touritst_genie',
  'influencer_scale_type': 'nano',
  'main_interest_category': '뷰티',
  'main_category': 'IT',
  'top_3_category': 'IT'},
 1: {'activity_score': 4.0,
  'follow_growth_rate': 3.0,
  'follower_retention_rate': 3.0,
  'avg_post_efficiency': 1.0,
  'ad_efficiency': 3.0,
  'acnt_id': '17841466432398761',
  'acnt_nm': 'robopia7',
  'influencer_scale_type': 'nano',
  'main_interest_category': '뷰티',
  'main_category': 'IT',
  'top_3_category': 'IT'},
 2: {'activity_score': 3.0,
  'follow_growth_rate': 3.0,
  'follower_retention_rate': 3.0,
  'avg_post_efficiency': 3.0,
  'ad_efficiency': 3.0,
  'acnt_id': '17841454051871300',
  'acnt_nm': 'sunny910336',
  'influencer_scale_type': 'nano',
  'main_interest_category': '뷰티',
  'main_category': '결혼/연애',
  'top_3_category': '결혼/연애@유명장소/핫플'},
 3

In [106]:
len(list(normalized_all_dic.keys()))

4900

In [107]:
ssh.insert_query_with_lookup('op_mem_seller_score', list(normalized_all_dic.values()))

INSERT INTO op_mem_seller_score (activity_score, follow_growth_rate, follower_retention_rate, avg_post_efficiency, ad_efficiency, acnt_id, acnt_nm, influencer_scale_type, main_interest_category, main_category, top_3_category, member_uid, user_id, is_connected) VALUES (%(activity_score)s, %(follow_growth_rate)s, %(follower_retention_rate)s, %(avg_post_efficiency)s, %(ad_efficiency)s, %(acnt_id)s, %(acnt_nm)s, %(influencer_scale_type)s, %(main_interest_category)s, %(main_category)s, %(top_3_category)s, %(member_uid)s, %(user_id)s, %(is_connected)s)
inserted acnt_id: 17841401530775838
INSERT INTO op_mem_seller_score (activity_score, follow_growth_rate, follower_retention_rate, avg_post_efficiency, ad_efficiency, acnt_id, acnt_nm, influencer_scale_type, main_interest_category, main_category, top_3_category, member_uid, user_id, is_connected) VALUES (%(activity_score)s, %(follow_growth_rate)s, %(follower_retention_rate)s, %(avg_post_efficiency)s, %(ad_efficiency)s, %(acnt_id)s, %(acnt_nm)s,

2025-08-21 04:00:29,534| ERROR   | Socket exception: 현재 연결은 원격 호스트에 의해 강제로 끊겼습니다 (10054)


module_test

In [2]:
import os
os.chdir("C:/Users/ehddl/Desktop/업무/code/Flexmatch_score/")

In [3]:
from modules.DB_connection_and_Load_not_conn_S3_data import *
from modules.data_preprocessing import *
from modules.not_connected_user_calcuate_flexmatch_score import *


load_dotenv()
aws_access_key = os.getenv("aws_accessKey")
aws_secret_key = os.getenv("aws_secretKey")

client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
    region_name='ap-northeast-2'
)

def main():
    ## DB data loading
    sales_info, seller_interest_info, not_conn_user_main_category_info = get_all_infos()

    ## s3 data loading
    bucket_name = 'flexmatch-data'
    # table_list = ['RECENT_USER_INFO_MTR', 'TIME_SERIES_PROFILE_INFO', 'BY_USER_ID_MEDIA_DTL_INFO', 'BY_DATE_MEDIA_AGG_INFO']
    table_list = ['EXTERNAL_RECENT_USER_INFO_MTR', 'EXTERNAL_TIME_SERIES_PROFILE_INFO', 'EXTERNAL_BY_USER_ID_MEDIA_DTL_INFO', 'EXTERNAL_BY_DATE_MEDIA_AGG_INFO']

    # connected_user & not_connected_user common table
    merged_data_by_table = load_weekly_instagram_data(bucket_name, table_list, weeks_back=2, target_filename='merged_data.parquet')
    
    # recent_user_info_mtr = merged_data_by_table['RECENT_USER_INFO_MTR']['prev_week']
    # time_series_profile_info = merged_data_by_table['TIME_SERIES_PROFILE_INFO']['prev_week']
    # by_user_id_media_dtl_info = merged_data_by_table['BY_USER_ID_MEDIA_DTL_INFO']['prev_week']
    # by_date_media_agg_info = merged_data_by_table['BY_DATE_MEDIA_AGG_INFO']['prev_week']

    # recent_user_info_mtr_2 = merged_data_by_table['RECENT_USER_INFO_MTR']['current_week']
    # time_series_profile_info_2 = merged_data_by_table['TIME_SERIES_PROFILE_INFO']['current_week']
    # by_user_id_media_dtl_info_2 = merged_data_by_table['BY_USER_ID_MEDIA_DTL_INFO']['current_week']
    # by_date_media_agg_info_2 = merged_data_by_table['BY_DATE_MEDIA_AGG_INFO']['current_week']

    recent_user_info_mtr = merged_data_by_table['EXTERNAL_RECENT_USER_INFO_MTR']['prev_week']
    time_series_profile_info = merged_data_by_table['EXTERNAL_TIME_SERIES_PROFILE_INFO']['prev_week']
    by_user_id_media_dtl_info = merged_data_by_table['EXTERNAL_BY_USER_ID_MEDIA_DTL_INFO']['prev_week']
    by_date_media_agg_info = merged_data_by_table['EXTERNAL_BY_DATE_MEDIA_AGG_INFO']['prev_week']

    recent_user_info_mtr_2 = merged_data_by_table['EXTERNAL_RECENT_USER_INFO_MTR']['current_week']
    time_series_profile_info_2 = merged_data_by_table['EXTERNAL_TIME_SERIES_PROFILE_INFO']['current_week']
    by_user_id_media_dtl_info_2 = merged_data_by_table['EXTERNAL_BY_USER_ID_MEDIA_DTL_INFO']['current_week']
    by_date_media_agg_info_2 = merged_data_by_table['EXTERNAL_BY_DATE_MEDIA_AGG_INFO']['current_week']


    # 혹시 몰라서 일단 한번 적용
    recent_user_info_mtr['acnt_id'] = recent_user_info_mtr['acnt_id'].astype(str)
    time_series_profile_info['acnt_id'] = time_series_profile_info['acnt_id'].astype(str)
    recent_user_info_mtr_2['acnt_id'] = recent_user_info_mtr_2['acnt_id'].astype(str)
    time_series_profile_info_2['acnt_id'] = time_series_profile_info_2['acnt_id'].astype(str)

    by_user_id_media_dtl_info_2['acnt_id'] = by_user_id_media_dtl_info_2['acnt_id'].astype(str)
    by_date_media_agg_info_2['media_id'] = by_date_media_agg_info_2['media_id'].astype(str)

    ## Data preprocessing
    # -------- not_connected_user data -------
    
    # unique_user = recent_user_info_mtr['acnt_id'].unique()
    nc_unique_user = recent_user_info_mtr_2[recent_user_info_mtr_2['acnt_conn_yn']=='N']['acnt_id'].to_list()
    
    nc_recent_user_info_mtr_2 = recent_user_info_mtr_2[recent_user_info_mtr_2['acnt_id'].isin(nc_unique_user)]
    print(nc_recent_user_info_mtr_2['acnt_id'].nunique())

    nc_time_series_profile_info = time_series_profile_info[time_series_profile_info['acnt_id'].isin(nc_unique_user)]
    nc_time_series_profile_info_2 = time_series_profile_info_2[time_series_profile_info_2['acnt_id'].isin(nc_unique_user)]
    
    # nc_by_user_id_media_dtl_info = by_user_id_media_dtl_info[by_user_id_media_dtl_info['acnt_id'].isin(nc_unique_user)]
    nc_by_user_id_media_dtl_info_2 = by_user_id_media_dtl_info_2[by_user_id_media_dtl_info_2['acnt_id'].isin(nc_unique_user)]

    nc_unique_media = nc_by_user_id_media_dtl_info_2['media_id'].unique()
    nc_by_date_media_agg_info_2 = by_date_media_agg_info_2[by_date_media_agg_info_2['media_id'].isin(nc_unique_media)]

    # influencer scale type
    # nc_recent_user_info_mtr.loc[:, 'influencer_scale_type'] = nc_recent_user_info_mtr.apply(influencer_scale_type, axis=1)
    nc_recent_user_info_mtr_2.loc[:,'influencer_scale_type'] = nc_recent_user_info_mtr_2.apply(influencer_scale_type, axis=1)

    nc_user_info, nc_timeseries, nc_timeseries_2, nc_media_info, nc_media_agg, nc_all_merged_df, nc_media_engagement_merged_df, nc_media_engagement_profile_merged_df, nc_time_series_merged_df = create_merged_df(
                                                                                                                                                                            nc_recent_user_info_mtr_2,
                                                                                                                                                                            nc_time_series_profile_info,
                                                                                                                                                                            nc_time_series_profile_info_2,
                                                                                                                                                                            nc_by_user_id_media_dtl_info_2,
                                                                                                                                                                            nc_by_date_media_agg_info_2)
    
   
    ## calculate flexmatch score - non_connected_user
    activity_df = calculate_activity_score(nc_media_info)
    check_inf(activity_df)

    growth_rate_df = calculate_follower_growth_rate(nc_timeseries, nc_timeseries_2)

    # follower_engagment_df = calculate_follower_engagement(nc_media_engagement_profile_merged_df)
    # check_inf(follower_engagment_df)

    follower_loyalty_df = calculate_follower_loyalty(nc_time_series_merged_df)
    check_inf(follower_loyalty_df)

    post_efficiency_df = calculate_post_efficiency_df(nc_media_engagement_profile_merged_df)
    check_inf(post_efficiency_df)

    ## create flexmatch score table by influencer scale type
    not_connected_flexmatch_score_table = not_connected_user_flexmatch_score(nc_user_info, activity_df, growth_rate_df, follower_loyalty_df, post_efficiency_df)
    
    conn_list = seller_interest_info[(seller_interest_info['ig_user_id'].notnull()) & (seller_interest_info['ig_user_id'] != '')]['ig_user_id'].to_list()
    not_conn_user = seller_interest_info[~seller_interest_info['ig_user_id'].isin(conn_list)]
    not_conn_user = not_conn_user[['add1', 'interestcategory']]

    not_conn_user['acnt_nm'] = not_conn_user['add1'].apply(clean_acnt_nm)

    # # score table에 interest category merge
    not_connected_flexmatch_score_table = pd.merge(not_connected_flexmatch_score_table, not_conn_user, on='acnt_nm')
    not_connected_flexmatch_score_table['interestcategory'] = not_connected_flexmatch_score_table['interestcategory'].fillna('뷰티')
    not_connected_flexmatch_score_table['interestcategory'] = not_connected_flexmatch_score_table['interestcategory'].apply(
            lambda x: '뷰티' if pd.isna(x) or (isinstance(x, str) and x.strip() == '') else x)
    
    category_map = {
            'BABY/KIDS': '베이비/키즈',
            'BEAUTY': '뷰티',
            'FASHION': '패션',
            'FOOD': '푸드',
            'HEALTHY': '헬시',
            'HOME/LIVING': '홈/리빙',
            'SERVICE': '서비스',
            'SPORT': '스포츠',
            'TEST 카테고리.. TEST': '뷰티'
        }

    for k, v in category_map.items():
        not_connected_flexmatch_score_table['interestcategory'] = not_connected_flexmatch_score_table['interestcategory'].str.replace(k, v)

    # score table에 main category merge
    not_conn_user_main_category_info = not_conn_user_main_category_info[~not_conn_user_main_category_info['acnt_id'].isin(conn_list)]
    not_conn_user_main_category_info = not_conn_user_main_category_info[['acnt_id', 'main_category', 'top_3_category']]
    not_conn_user_main_category_info['acnt_id'] = not_conn_user_main_category_info['acnt_id'].astype(str)

    not_connected_flexmatch_score_table = pd.merge(not_connected_flexmatch_score_table, not_conn_user_main_category_info, on='acnt_id') 
    print(not_conn_user_main_category_info.info())
    # final preprocessing after table merge
    not_connected_flexmatch_score_table = not_connected_flexmatch_score_table.drop_duplicates(subset=['acnt_id', 'acnt_nm'])
    
    nc_nano = not_connected_flexmatch_score_table[not_connected_flexmatch_score_table['influencer_scale_type']=='nano']
    nc_micro = not_connected_flexmatch_score_table[not_connected_flexmatch_score_table['influencer_scale_type']=='micro']
    nc_mid = not_connected_flexmatch_score_table[not_connected_flexmatch_score_table['influencer_scale_type']=='mid']
    nc_macro = not_connected_flexmatch_score_table[not_connected_flexmatch_score_table['influencer_scale_type']=='macro']
    nc_mega = not_connected_flexmatch_score_table[not_connected_flexmatch_score_table['influencer_scale_type']=='mega']

    # connected_user 추가
    influencer_scale_names=['nano', 'micro', 'mid', 'macro', 'mega']
    influencer_scale_df_list=[nc_nano, nc_micro, nc_mid, nc_macro, nc_mega] # 여기에 connected user도 같이 포함하면 한번에 업로드 되지 않을까 함

    normalized_df, normalized_all_dic = normalize_influencer_scores(influencer_scale_names, influencer_scale_df_list)
    print(normalized_all_dic)

if __name__=='__main__':
    main()

DB 접속 성공
DB 접속 성공
DB 접속 성공
[Success] Loaded instagram-data/tables/EXTERNAL_RECENT_USER_INFO_MTR/year=2025/week=33/merged_data.parquet for table EXTERNAL_RECENT_USER_INFO_MTR, week 33
[Success] Loaded instagram-data/tables/EXTERNAL_TIME_SERIES_PROFILE_INFO/year=2025/week=33/merged_data.parquet for table EXTERNAL_TIME_SERIES_PROFILE_INFO, week 33
[Success] Loaded instagram-data/tables/EXTERNAL_BY_USER_ID_MEDIA_DTL_INFO/year=2025/week=33/merged_data.parquet for table EXTERNAL_BY_USER_ID_MEDIA_DTL_INFO, week 33
[Success] Loaded instagram-data/tables/EXTERNAL_BY_DATE_MEDIA_AGG_INFO/year=2025/week=33/merged_data.parquet for table EXTERNAL_BY_DATE_MEDIA_AGG_INFO, week 33
[Success] Loaded instagram-data/tables/EXTERNAL_RECENT_USER_INFO_MTR/year=2025/week=32/merged_data.parquet for table EXTERNAL_RECENT_USER_INFO_MTR, week 32
[Success] Loaded instagram-data/tables/EXTERNAL_TIME_SERIES_PROFILE_INFO/year=2025/week=32/merged_data.parquet for table EXTERNAL_TIME_SERIES_PROFILE_INFO, week 32
[Succes

##### 크리에이터 광고 효율성 

In [None]:
revenue_dic = {
    'acnt_nm' : ['s_h_j_', 'siwolbubu_hyun', 'bong_camper83', 'binwoos', 'seojinii_', 'tingkerhee'],
    'sell_type' : ['flexmatch', 'other', 'flexmatch', 'flexmatch', 'other', 'other'],
    'total_revenue' : [6906000, 10937105, 7233100, 8759000, 7939664, 38449720],
    'total_order_cnt' : [84, 132, 122, 88, 89, 471]
    }

revenue_df = pd.DataFrame(revenue_dic)
revenue_merged_df = pd.merge(media_engagement_merged_df, revenue_df, on='acnt_nm', how='left')
revenue_merged_df = revenue_merged_df[['acnt_id', 'acnt_nm', 'follower_cnt', 'follow_cnt', 'media_cnt', 'sell_type', 'total_revenue', 'total_order_cnt', 'media_id', 'like_cnt', 'cmnt_cnt']]
revenue_merged_df

revenue_merged_df['post_efficiency'] = ((revenue_merged_df['like_cnt'] + revenue_merged_df['cmnt_cnt']) / revenue_merged_df['follower_cnt']) * 100
revenue_df_total = revenue_merged_df.groupby(['acnt_id', 'acnt_nm']).agg({
    'post_efficiency' : 'mean',
    'total_order_cnt' : 'first'
}).dropna()

revenue_df_total['advertisement_efficiency'] = (revenue_df_total['total_order_cnt'] / revenue_df_total['post_efficiency']) 
revenue_df_total.sort_values(by='advertisement_efficiency', ascending=False)

revenue_merged_df['engagement_per_post'] = ((revenue_merged_df['like_cnt'] + revenue_merged_df['cmnt_cnt']) / 25)
revenue_df_total = revenue_merged_df.groupby(['acnt_id', 'acnt_nm']).agg({
    'engagement_per_post' : 'mean',
    'total_revenue' : 'first',
    'total_order_cnt' : 'first'
}).dropna()

# 팔로워를 생각하지 않고, 25개의 콘텐츠 단위당 반응 비율을 계산
revenue_df_total['advertisement_efficiency'] = (revenue_df_total['total_order_cnt'] / revenue_df_total['engagement_per_post'] * 25) * 100
revenue_df_total.sort_values(by='advertisement_efficiency', ascending=False)

Unnamed: 0,acnt_nm,sell_type,total_revenue,total_order_cnt
0,s_h_j_,flexmatch,6906000,84
1,siwolbubu_hyun,other,10937105,132
2,bong_camper83,flexmatch,7233100,122
3,binwoos,flexmatch,8759000,88
4,seojinii_,other,7939664,89
5,tingkerhee,other,38449720,471


콘텐츠 효율성이 아닌 팔로워 참여도를 기준으로 계산했을 때

In [None]:
engaged_df_2 = engaged_df.copy()
revenue_merged_df_2 = revenue_merged_df.groupby(['acnt_id', 'acnt_nm'])[['total_revenue', 'total_order_cnt']].first().dropna().reset_index()
revenue_df_total_2 = pd.merge(engaged_df_2, revenue_merged_df_2, on='acnt_id')

revenue_df_total_2['advertisement_efficiency'] = revenue_df_total_2['total_order_cnt'] / revenue_df_total_2['follower_total_engagement']
revenue_df_total_2[['acnt_id', 'acnt_nm', 'total_revenue', 'total_order_cnt', 'follower_total_engagement', 'advertisement_efficiency']].sort_values(by='advertisement_efficiency', ascending=False)

table merged

In [None]:
# 크리에이터 활동성
creator_activity_score = activity_df[['acnt_id', 'avg_upload_interval']]

In [None]:
# 트렌드지수
creator_follow_growth_rate = time_series_merged_df[['acnt_id', 'follow_growth_rate']]

In [None]:
# 팔로워 참여도
follower_engagement = engaged_df[['acnt_id', 'follower_total_engagement']]

In [None]:
# 팔로워 충성도
follower_loyalty = time_series_merged_df_copy[['acnt_id', 'follower_retention_rate']]

In [None]:
# 콘텐츠 효율성
post_efficiency = post_efficiency_df.copy()

In [None]:
# 광고효율성
revenue_df_total = revenue_df_total.reset_index()
advertisement_efficiency = revenue_df_total[['acnt_id', 'advertisement_efficiency']]

In [None]:
df_list = [creator_activity_score, creator_follow_growth_rate, follower_engagement, follower_loyalty, post_efficiency, advertisement_efficiency]

In [None]:
from functools import reduce

flexmatch_score = reduce(lambda left, right: pd.merge(left, right, on='acnt_id', how='left'), df_list)

In [None]:
user_info_nm = user_info_2[['acnt_id', 'acnt_nm']]
flexmatch_score = pd.merge(flexmatch_score, user_info_nm, on='acnt_id')

In [None]:
flexmatch_score = flexmatch_score[['acnt_id', 'acnt_nm', 'avg_upload_interval', 'follow_growth_rate', 'follower_total_engagement', 'follower_retention_rate', 'avg_post_efficiency', 'advertisement_efficiency']]
flexmatch_score.rename(columns={
    'avg_upload_interval' : '크리에이터 활동성',
    'follow_growth_rate' : '트렌드지수(팔로워순증가량)', 
    'follower_total_engagement' : '콘텐츠 참여도' ,
    'follower_retention_rate' : '팔로워 충성도',
    'avg_post_efficiency' : '콘텐츠 효율성',
    'advertisement_efficiency' : '광고 효율성'
}, inplace=True)

In [None]:
flexmatch_score_2 = flexmatch_score.copy()
flexmatch_score_2.dropna(inplace=True)

In [None]:
flexmatch_score_2

Unnamed: 0,acnt_id,acnt_nm,크리에이터 활동성,트렌드지수(팔로워순증가량),콘텐츠 참여도,팔로워 충성도,콘텐츠 효율성,광고 효율성
0,17841400361359004,s_h_j_,1.166667,0.057866,110.188904,100.0,0.045178,1859.316062
1,17841400561503844,binwoos,0.5,-0.014956,626.6426,99.99,0.16973,518.470975
2,17841400591698216,tingkerhee,2.416667,-0.018509,2065.579146,99.98,0.483403,974.343202
4,17841401506106699,siwolbubu_hyun,0.958333,-0.075045,2094.996055,99.92,0.556736,237.096389
6,17841402936102997,seojinii_,2.875,-0.002312,166.809026,100.0,0.240359,370.279723
10,17841453615191128,bong_camper83,1.5,0.281793,2526.57868,100.0,2.506526,48.672935


In [None]:
# from sklearn.preprocessing import MinMaxScaler

# columns = flexmatch_score_2.select_dtypes(include='float64').columns
# minmax = MinMaxScaler(feature_range=(0, 5))
# flexmatch_score_norm = minmax.fit_transform(flexmatch_score_2[columns])

# flexmatch_score_norm_df = pd.DataFrame(flexmatch_score_norm, columns=columns, index=flexmatch_score_2.index)
# flexmatch_score_norm_df['acnt_id'] = flexmatch_score_2['acnt_id']
# flexmatch_score_norm_df['acnt_nm'] = flexmatch_score_2['acnt_nm']

# flexmatch_score_norm_df['avg_upload_interval'] = 5 - flexmatch_score_norm_df['avg_upload_interval']
# flexmatch_score_norm_df


In [None]:
from sklearn.preprocessing import MinMaxScaler

columns = flexmatch_score_2.select_dtypes(include='float64').columns
minmax = MinMaxScaler(feature_range=(0, 5))
flexmatch_score_norm = minmax.fit_transform(flexmatch_score_2[columns])

flexmatch_score_norm_df = pd.DataFrame(flexmatch_score_norm, columns=columns, index=flexmatch_score_2.index)
flexmatch_score_norm_df['acnt_id'] = flexmatch_score_2['acnt_id']
flexmatch_score_norm_df['acnt_nm'] = flexmatch_score_2['acnt_nm']

flexmatch_score_norm_df['크리에이터 활동성'] = 5 - flexmatch_score_norm_df['크리에이터 활동성']
flexmatch_score_norm_df

# 지금 이거 정규화 할 때는 그냥 이사람들을 전부 다 합쳐서 함께 정규화를 진행해서 그런데, 원래라면은 팔로워를 기준으로 뭐 메가 인플루언서, 마이크로 인플루언서 이런식으로 기준을 나눠서 정규화를 해야 조금 더 정확한 값이 될 것 같음.


Unnamed: 0,크리에이터 활동성,트렌드지수(팔로워순증가량),콘텐츠 참여도,팔로워 충성도,콘텐츠 효율성,광고 효율성,acnt_id,acnt_nm
0,3.596491,1.862339,0.0,5.0,0.0,5.0,17841400361359004,s_h_j_
1,5.0,0.84196,1.068647,4.375,0.253016,1.297324,17841400561503844,binwoos
2,0.964912,0.792182,4.046099,3.75,0.890213,2.556192,17841400591698216,tingkerhee
4,4.035088,0.0,4.106968,0.0,1.039182,0.520322,17841401506106699,siwolbubu_hyun
6,0.0,1.019133,0.117159,5.0,0.396492,0.888101,17841402936102997,seojinii_
10,2.894737,5.0,5.0,5.0,5.0,0.0,17841453615191128,bong_camper83


In [None]:
# flexmatch_score_norm_df.to_csv("flexmatch_score_test.csv")