In [27]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler

import os
from dotenv import load_dotenv
import boto3
import io
import pymysql
import json

DB Connection

In [221]:
from sshtunnel import SSHTunnelForwarder

class SSHMySQLConnector:
    """MySQL client that reaches the database through an SSH tunnel.

    Intended call order: ``load_config_from_json`` -> ``connect`` ->
    ``insert_query_with_lookup`` -> ``close``. Failures are reported with
    ``print`` rather than raised, so a notebook run keeps going.
    """

    # Keys that must all be present in the JSON configuration file.
    _CONFIG_KEYS = (
        'ssh_host', 'ssh_username', 'ssh_password',
        'db_username', 'db_password', 'db_name',
    )

    def __init__(self):
        self.ssh_host = None
        self.ssh_username = None
        self.ssh_password = None
        self.db_username = None
        self.db_password = None
        self.db_name = None
        self.tunnel = None
        self.connection = None

    @staticmethod
    def _safe_identifier(name):
        """Return ``name`` if it is a plain (optionally ``schema.table``) SQL identifier.

        Table and column names cannot be sent as query parameters, so they are
        interpolated into the statement text; anything outside
        ``[0-9A-Za-z_$]`` is rejected with ``ValueError``.
        """
        import re  # local import: `re` is not among the notebook's top-level imports

        if not isinstance(name, str) or not re.fullmatch(r'[0-9A-Za-z_$]+(\.[0-9A-Za-z_$]+)?', name):
            raise ValueError(f"unsafe SQL identifier: {name!r}")
        return name

    def load_config_from_json(self, json_path):
        """Read SSH/DB credentials from a UTF-8 JSON file.

        All keys in ``_CONFIG_KEYS`` are required. On any error (missing file,
        invalid JSON, missing key) a message is printed and the connector is
        left exactly as it was.
        """
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                config = json.load(f)
            # Resolve every key before assigning anything, so a missing key
            # cannot leave a half-populated configuration behind.
            values = {key: config[key] for key in self._CONFIG_KEYS}
        except Exception as e:
            print("❌ 설정 JSON 로딩 실패:", e)
            return

        for key, value in values.items():
            setattr(self, key, value)

    def connect(self):
        """Start the SSH tunnel and open the MySQL connection through it.

        On failure a message is printed and whatever was opened so far
        (tunnel and/or connection) is shut down again.
        """
        try:
            self.tunnel = SSHTunnelForwarder(
                (self.ssh_host, 22),
                ssh_username=self.ssh_username,
                ssh_password=self.ssh_password,
                remote_bind_address=('127.0.0.1', 3306),
            )
            self.tunnel.start()

            self.connection = pymysql.connect(
                host='127.0.0.1',
                port=self.tunnel.local_bind_port,
                user=self.db_username,
                password=self.db_password,
                db=self.db_name,
                # Rows come back as dicts; the uid lookup below indexes them by column name.
                cursorclass=pymysql.cursors.DictCursor
            )
            print("✅ DB 접속 성공")
        except Exception as e:
            print("❌ SSH 또는 DB 연결 실패:", e)
            # A tunnel that started before the DB connect failed must not stay up.
            try:
                self.close()
            except Exception as close_error:
                print("❌ SSH 또는 DB 연결 실패:", close_error)

    def insert_query_with_lookup(self, table_name, data_list):
        """Insert each dict of ``data_list`` into ``table_name`` in one transaction.

        For every row, ``op_member`` is searched by ``add1 = row['acnt_nm']``;
        the matching ``uid`` / ``user_id`` are stored as ``member_uid`` /
        ``user_id`` (``0`` / ``'None'`` when there is no match). The dict keys
        become the column names. Any failure rolls the whole batch back and is
        printed. The caller's dicts are not modified.
        """
        if self.connection is None:
            print("❌ INSERT 실패:", "DB connection is not open (call connect() first)")
            return

        try:
            self._safe_identifier(table_name)
            with self.connection.cursor() as cursor:
                for data in data_list:
                    row = dict(data)  # work on a copy so the caller's dict stays untouched

                    # 1. Look up uid and user_id in op_member
                    cursor.execute("""
                        SELECT uid, user_id FROM op_member
                        WHERE add1 = %s
                        LIMIT 1
                    """, (row['acnt_nm'],))
                    result = cursor.fetchone()
                    if result:
                        row['member_uid'] = result['uid']
                        row['user_id'] = result['user_id']
                        # If ig_user_id is added to op_member later, copy it here as well.
                        # row['ig_user_id'] = result['ig_user_id']
                    else:
                        row['member_uid'] = 0
                        row['user_id'] = 'None'
                        # row['ig_user_id'] = 'None'

                    # 2. Build and run the INSERT; values go through named placeholders,
                    #    column names are validated because they are interpolated.
                    columns = ', '.join(self._safe_identifier(k) for k in row)
                    placeholders = ', '.join([f"%({k})s" for k in row])
                    insert_sql = f"INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"
                    cursor.execute(insert_sql, row)

                    print(f"✅ inserted acnt_id: {row.get('acnt_id', 'N/A')}")

            self.connection.commit()
        except Exception as e:
            self.connection.rollback()
            print("❌ INSERT 실패:", e)

    def close(self):
        """Close the DB connection and stop the tunnel; safe to call repeatedly."""
        connection, self.connection = self.connection, None
        tunnel, self.tunnel = self.tunnel, None
        try:
            if connection:
                connection.close()
        finally:
            # Stop the tunnel even if closing the connection raised.
            if tunnel:
                tunnel.stop()

In [7]:
# Open the SSH-tunnelled DB session used for uploading scores.
# The config location can be overridden with the SSH_DB_CONFIG_PATH environment
# variable; the original workstation path stays as the default.
ssh_config_path = os.getenv(
    'SSH_DB_CONFIG_PATH',
    'C:/Users/ehddl/Desktop/업무/code/config/ssh_db_config.json',
)
ssh = SSHMySQLConnector()
ssh.load_config_from_json(ssh_config_path)
ssh.connect()

# data_list = [
#     {
#         'activity_score': 0.18618008163528707,
#         'trend_score': 1.2384506112322682,
#         'follower_total_engagement': 0.9006925508419993,
#         'follower_retention_rate': 4.84593837535013,
#         'avg_post_efficiency': 1.4776093002849746,
#         'acnt_id': '17841400070132367',
#         'acnt_nm': 'gnuoyeatt',
#         'influencer_scale_type': 'nano'
#     }
# ]

# ssh.insert_query_with_lookup('op_mem_seller_score', data_list)


✅ DB 접속 성공


Data Loading

In [162]:
# def load_s3_instagram_data():

#     load_dotenv()
#     aws_access_key = os.getenv("aws_accessKey")
#     aws_secret_key = os.getenv("aws_secretKey")
#     region_name='ap-northeast-2'

#     # S3 클라이언트 생성
#     client = boto3.client('s3', aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_key, region_name=region_name)

#     today = datetime.now()
#     year, week, _ = today.isocalendar()

#     bucket_name = 'flexmatch-data'

#     table_list = ['RECENT_USER_INFO_MTR', 'TIME_SERIES_PROFILE_INFO', 'BY_USER_ID_MEDIA_DTL_INFO', 'BY_DATE_MEDIA_AGG_INFO']
#     df_by_table_list = {}

#     for table_name in table_list:
#         prefix = f'instagram-data/tables/{table_name}/year={year}/week=25/' # 이번주가 없어서 week를 따로 x
#         response = client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

#         if 'Contents' in response:
#             files = [content['Key'] for content in response['Contents'] if content['Key'].endswith('merged_data.parquet')]

#             if not files:
#                 print(f"[Info] No merged_data.parquet found for table: {table_name}")
#                 continue

#             # 각 파일 순회
#             for file_key in files:
#                 try:
#                     obj = client.get_object(Bucket=bucket_name, Key=file_key)
#                     df = pd.read_parquet(io.BytesIO(obj['Body'].read()))
#                     df_by_table_list[table_name] = df
#                     print(f"[Success] Loaded {file_key} for table {table_name}")
#                 except Exception as e:
#                     print(f"[Error] Failed to read {file_key} for table {table_name}: {e}")
#         else:
#             print(f"[Info] No files found under prefix: {prefix}")
        
#     return df_by_table_list

# df_by_table_list = load_s3_instagram_data()

# # recent_user_info_mtr_2 = df_by_table_list['RECENT_USER_INFO_MTR']
# # time_series_profile_info_2 = df_by_table_list['TIME_SERIES_PROFILE_INFO']
# # by_user_id_media_dtl_info_2 = df_by_table_list['BY_USER_ID_MEDIA_DTL_INFO']
# # by_date_media_agg_info_2 = df_by_table_list['BY_DATE_MEDIA_AGG_INFO']

# recent_user_info_mtr = df_by_table_list['RECENT_USER_INFO_MTR']
# time_series_profile_info = df_by_table_list['TIME_SERIES_PROFILE_INFO']
# by_user_id_media_dtl_info = df_by_table_list['BY_USER_ID_MEDIA_DTL_INFO']
# by_date_media_agg_info = df_by_table_list['BY_DATE_MEDIA_AGG_INFO']



In [31]:
def load_weekly_instagram_data(bucket_name, table_list, target_filename='merged_data.parquet'):
    """Load yesterday's and today's parquet snapshot of each table from S3.

    Despite the name, the lookup is per date, not per week: for every table the
    prefix ``instagram-data/tables/<table>/<YYYY-MM-DD>/`` is listed for
    yesterday and today (local clock) and objects whose key ends with
    ``target_filename`` are read with ``pd.read_parquet``.

    Args:
        bucket_name: S3 bucket to read from.
        table_list: table names (path components under ``instagram-data/tables/``).
        target_filename: key suffix that identifies the file to load.

    Returns:
        ``{table_name: {'yesterday': DataFrame, 'today': DataFrame}}``.
        A table missing either date is left out of the result and a warning is
        printed, so callers must be prepared for absent keys.
    """
    # Populate the environment (AWS keys) before creating the client
    load_dotenv()
    client = boto3.client(
        's3',
        aws_access_key_id=os.getenv("aws_accessKey"),
        aws_secret_access_key=os.getenv("aws_secretKey"),
        region_name='ap-northeast-2'
    )

    # Read the clock once so both dates are derived from the same instant
    # (two separate now() calls can straddle midnight).
    today = datetime.now()
    today_date = today.strftime('%Y-%m-%d')
    yesterday_date = (today - timedelta(days=1)).strftime('%Y-%m-%d')
    recent_dates = [yesterday_date, today_date]

    # list_objects_v2 returns at most 1000 keys per call; paginate so a large
    # prefix cannot hide the target file.
    paginator = client.get_paginator('list_objects_v2')

    recent_data_by_table = {}
    for table_name in table_list:
        recent_data_by_table[table_name] = {}

        for date_str in recent_dates:
            prefix = f'instagram-data/tables/{table_name}/{date_str}/'
            keys = [
                content['Key']
                for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
                for content in page.get('Contents', [])
            ]

            if not keys:
                print(f"[Info] No files found under prefix: {prefix}")
                continue

            target_files = [key for key in keys if key.endswith(target_filename)]

            if not target_files:
                print(f"[Info] No {target_filename} found for {table_name} on date={date_str}")
                continue

            # If several keys match, the last one that loads successfully wins.
            for file_key in target_files:
                try:
                    obj = client.get_object(Bucket=bucket_name, Key=file_key)
                    df = pd.read_parquet(io.BytesIO(obj['Body'].read()))
                    recent_data_by_table[table_name][date_str] = df
                    print(f"[Success] Loaded {file_key} for table {table_name}, date={date_str}")
                except Exception as e:
                    print(f"[Error] Failed to read {file_key} for {table_name}, date={date_str}: {e}")

    final_data = {}
    for table_name, date_data in recent_data_by_table.items():
        if today_date in date_data and yesterday_date in date_data:
            final_data[table_name] = {
                'yesterday': date_data[yesterday_date],
                'today': date_data[today_date]
            }
        else:
            print(f"[Warning] Missing yesterday or today data for table {table_name}")

    return final_data


In [32]:
# Source bucket and the CONN_v2 tables whose yesterday/today snapshots are loaded.
bucket_name = 'flexmatch-data'
table_list = [
    'CONN_v2_RECENT_USER_INFO_MTR',
    'CONN_v2_TIME_SERIES_PROFILE_INFO',
    'CONN_v2_BY_USER_ID_MEDIA_DTL_INFO',
    'CONN_v2_BY_DATE_MEDIA_AGG_INFO',
    'CONN_v2_PROFILE_INSIGHT_DTL',
    'CONN_v2_MEDIA_INSIGHT_CUM',
    'CONN_v2_PROFILE_INSIGHT_FOLLOWTYPE',
]

# Result shape: {table_name: {'yesterday': DataFrame, 'today': DataFrame}}
merged_data_by_table = load_weekly_instagram_data(
    bucket_name=bucket_name,
    table_list=table_list,
)


[Success] Loaded instagram-data/tables/CONN_v2_RECENT_USER_INFO_MTR/2025-07-13/merged_data.parquet for table CONN_v2_RECENT_USER_INFO_MTR, date=2025-07-13
[Success] Loaded instagram-data/tables/CONN_v2_RECENT_USER_INFO_MTR/2025-07-14/merged_data.parquet for table CONN_v2_RECENT_USER_INFO_MTR, date=2025-07-14
[Success] Loaded instagram-data/tables/CONN_v2_TIME_SERIES_PROFILE_INFO/2025-07-13/merged_data.parquet for table CONN_v2_TIME_SERIES_PROFILE_INFO, date=2025-07-13
[Success] Loaded instagram-data/tables/CONN_v2_TIME_SERIES_PROFILE_INFO/2025-07-14/merged_data.parquet for table CONN_v2_TIME_SERIES_PROFILE_INFO, date=2025-07-14
[Success] Loaded instagram-data/tables/CONN_v2_BY_USER_ID_MEDIA_DTL_INFO/2025-07-13/merged_data.parquet for table CONN_v2_BY_USER_ID_MEDIA_DTL_INFO, date=2025-07-13
[Success] Loaded instagram-data/tables/CONN_v2_BY_USER_ID_MEDIA_DTL_INFO/2025-07-14/merged_data.parquet for table CONN_v2_BY_USER_ID_MEDIA_DTL_INFO, date=2025-07-14
[Success] Loaded instagram-data/ta

In [306]:
# Split each table's snapshot pair into two notebook variables:
# the plain name holds yesterday's frame, the `_2` suffix holds today's.
recent_user_info_mtr, recent_user_info_mtr_2 = (
    merged_data_by_table['CONN_v2_RECENT_USER_INFO_MTR'][day] for day in ('yesterday', 'today')
)
time_series_profile_info, time_series_profile_info_2 = (
    merged_data_by_table['CONN_v2_TIME_SERIES_PROFILE_INFO'][day] for day in ('yesterday', 'today')
)
by_user_id_media_dtl_info, by_user_id_media_dtl_info_2 = (
    merged_data_by_table['CONN_v2_BY_USER_ID_MEDIA_DTL_INFO'][day] for day in ('yesterday', 'today')
)
by_date_media_agg_info, by_date_media_agg_info_2 = (
    merged_data_by_table['CONN_v2_BY_DATE_MEDIA_AGG_INFO'][day] for day in ('yesterday', 'today')
)

conn_profile_insight, conn_profile_insight_2 = (
    merged_data_by_table['CONN_v2_PROFILE_INSIGHT_DTL'][day] for day in ('yesterday', 'today')
)

conn_media_insight, conn_media_insight_2 = (
    merged_data_by_table['CONN_v2_MEDIA_INSIGHT_CUM'][day] for day in ('yesterday', 'today')
)

conn_profile_insight_followtype, conn_profile_insight_followtype_2 = (
    merged_data_by_table['CONN_v2_PROFILE_INSIGHT_FOLLOWTYPE'][day] for day in ('yesterday', 'today')
)

basic preprocessing

In [None]:
# recent_user_info_mtr = recent_user_info_mtr.drop_duplicates(subset=['acnt_id', 'acnt_nm'])
# time_series_profile_info = time_series_profile_info.drop_duplicates(subset=['base_ymd', 'acnt_id'])
# by_user_id_media_dtl_info = by_user_id_media_dtl_info.drop_duplicates(subset=['acnt_id', 'media_id', 'media_cn'])
# by_date_media_agg_info = by_date_media_agg_info.drop_duplicates(subset=['base_ymd', 'media_id'])

# recent_user_info_mtr_2 = recent_user_info_mtr_2.drop_duplicates(subset=['acnt_id', 'acnt_nm'])
# time_series_profile_info_2 = time_series_profile_info_2.drop_duplicates(subset=['base_ymd', 'acnt_id'])
# by_user_id_media_dtl_info_2 = by_user_id_media_dtl_info_2.drop_duplicates(subset=['acnt_id', 'media_id', 'media_cn'])
# by_date_media_agg_info_2 = by_date_media_agg_info_2.drop_duplicates(subset=['base_ymd', 'media_id'])

# conn_profile_insight = conn_profile_insight.drop_duplicates(subset=['acnt_id', 'acnt_nm'])
# conn_media_insight = conn_media_insight.drop_duplicates(subset=['acnt_id', 'acnt_nm', 'media_id'])

# conn_profile_insight_2 = conn_profile_insight_2.drop_duplicates(subset=['acnt_id', 'acnt_nm'])
# conn_media_insight_2 = conn_media_insight_2.drop_duplicates(subset=['acnt_id', 'acnt_nm', 'media_id'])


가장 최근 데이터로 isin 해줘야 할 것 같음. 그래야 전, 후를 비교할 수 있기 때문임

In [331]:
# Restrict every table to the accounts flagged as connected (acnt_conn_yn == 'Y')
# in today's user-info snapshot.
#
# Notes carried over from the original cell:
# - A strict comparison is only possible when each acnt_id is present in every
#   table, hence the isin filtering.
# - Some users may not exist in the earlier snapshot.
# - Only the most recent 100 media are fetched for now, so media can be missing
#   even within a one-week window.
# - Time-series metrics need the same id in both snapshots; when user info and
#   media info are combined they get filtered again anyway,
#   -> so this step is probably not strictly required.

# unique_user = recent_user_info_mtr['acnt_id'].unique()
c_unique_user = recent_user_info_mtr_2[recent_user_info_mtr_2['acnt_conn_yn']=='Y']['acnt_id'].to_list()
# .copy(): a later cell adds the influencer_scale_type column to this frame;
# writing into a filtered slice triggers SettingWithCopyWarning.
c_recent_user_info_mtr_2 = recent_user_info_mtr_2[recent_user_info_mtr_2['acnt_id'].isin(c_unique_user)].copy()

c_time_series_profile_info = time_series_profile_info[time_series_profile_info['acnt_id'].isin(c_unique_user)]
c_time_series_profile_info_2 = time_series_profile_info_2[time_series_profile_info_2['acnt_id'].isin(c_unique_user)]

# by_user_id_media_dtl_info = by_user_id_media_dtl_info[by_user_id_media_dtl_info['acnt_id'].isin(c_unique_user)]
c_by_user_id_media_dtl_info_2 = by_user_id_media_dtl_info_2[by_user_id_media_dtl_info_2['acnt_id'].isin(c_unique_user)]

c_conn_profile_insight = conn_profile_insight[conn_profile_insight['acnt_id'].isin(c_unique_user)]
c_conn_profile_insight_2 = conn_profile_insight_2[conn_profile_insight_2['acnt_id'].isin(c_unique_user)]

c_conn_profile_insight_followtype = conn_profile_insight_followtype[conn_profile_insight_followtype['acnt_id'].isin(c_unique_user)]
c_conn_profile_insight_followtype_2 = conn_profile_insight_followtype_2[conn_profile_insight_followtype_2['acnt_id'].isin(c_unique_user)]

# For connected users, conn_media_insight carries the same content as media_agg,
# so using it in place of media_agg is fine for now.
unique_media = c_by_user_id_media_dtl_info_2['media_id'].unique()
# c_by_date_media_agg_info_2 = by_date_media_agg_info_2[by_date_media_agg_info_2['media_id'].isin(unique_media)]
c_conn_media_insight_2 = conn_media_insight_2[conn_media_insight_2['media_id'].isin(unique_media)]

In [315]:
# Ad-hoc check: media of one account whose views_cnt is 0 (all columns shown).
pd.set_option('display.max_columns', None)
c_conn_media_insight_2.loc[
    (c_conn_media_insight_2['acnt_id'] == '17841453615191128')
    & (c_conn_media_insight_2['views_cnt'] == 0)
].sort_values(by=['views_cnt'], ascending=True)

Unnamed: 0,acnt_id,acnt_nm,media_id,acnt_conn_yn,media_type_nm,reels_feed_type_nm,reg_dt,media_cn,like_cnt,share_cnt,views_cnt,save_cnt,cmnt_cnt,reach_cnt,follow_cnt,profile_visit_cnt,profile_activ_cnt,contents_intac_cnt,reels_view_total_time,reels_view_avg_time,base_ymd


In [354]:
def influencer_scale_type(row):
    """Classify an account by follower count.

    Args:
        row: mapping / Series with a ``follower_cnt`` entry.

    Returns:
        ``'nano'`` (< 1,000), ``'micro'`` (1,000-10,000), ``'mid'``
        (10,001-100,000), ``'macro'`` (100,001-500,000) or ``'mega'`` (above),
        and ``None`` when the follower count is missing.
    """
    count = row['follower_cnt']
    # A NaN count fails every comparison below and would otherwise fall
    # through to 'mega'.
    if pd.isna(count):
        return None
    if count < 1000:
        return 'nano'
    if count <= 10000:
        return 'micro'
    if count <= 100000:
        return 'mid'
    if count <= 500000:
        return 'macro'
    return 'mega'

# recent_user_info_mtr.loc[:, 'influencer_scale_type'] = recent_user_info_mtr.apply(influencer_scale_type, axis=1)
# Rebind with assign() instead of writing through .loc: this frame was produced
# by boolean filtering, and in-place writes on such a frame raise
# SettingWithCopyWarning.
c_recent_user_info_mtr_2 = c_recent_user_info_mtr_2.assign(
    influencer_scale_type=c_recent_user_info_mtr_2.apply(influencer_scale_type, axis=1)
)


create merged file

In [355]:
def create_merged_df(user_info_df, timeseries_df, timeseries_df_2, media_info_df, media_insight_df, user_followtype_df, user_followtype_df_2): # media_agg, profile_insight X
    """Join media details with media insights and align every table to the surviving accounts.

    Steps:
      1. Outer-join ``media_info_df`` with ``media_insight_df`` on ``media_id``
         (``acnt_id`` is dropped from the insight side to avoid a duplicate column).
      2. Drop accounts whose profile ``media_cnt`` is 0.
      3. Keep only accounts whose rounded-up mean ``like_cnt`` is > 1, then fill
         missing engagement values of a media with that account's rounded-up mean.
      4. Filter all other tables to the remaining accounts / media and build the
         merged views.

    Returns (in this order):
        user_info, timeseries, timeseries_2, user_followtype, user_followtype_2,
        media_info, media_insight_info, all_merged_df, media_engagement_merged_df,
        media_engagement_profile_merged_df, time_series_merged_df
    """
    engagement_cols = ['like_cnt', 'cmnt_cnt', 'share_cnt', 'save_cnt', 'views_cnt', 'reach_cnt']

    # Both frames carry acnt_id; drop it from the insight side before merging.
    media_insight = media_insight_df.drop(['acnt_id'], axis=1)
    media_engagement_merged_df = pd.merge(media_info_df, media_insight, on='media_id', how='outer')

    # Approach 1 (not used): drop every influencer with at least one media whose
    # like count is hidden, plus everyone with media_cnt == 0.
    #
    # Approach 2 (used): drop users without any media; where a media's like count
    # is hidden, fill it with the mean over the user's other media. Users whose
    # mean is itself NaN are dropped as well.
    # Open questions from the original notes: is only like_cnt hidden or the other
    # metrics too? Accounts that went professional before the `views` metric
    # existed show views_cnt == 0 everywhere - still unresolved.
    no_media_user = user_info_df.loc[user_info_df['media_cnt'] == 0, 'acnt_id'].to_list()
    media_engagement_merged_df = media_engagement_merged_df[~media_engagement_merged_df['acnt_id'].isin(no_media_user)]

    media_engagement_merged_groupby_df = np.ceil(
        media_engagement_merged_df.groupby('acnt_id')[engagement_cols].mean()
    )
    # NOTE(review): `> 1` also removes accounts whose mean like count is <= 1,
    # not only those with a NaN mean - confirm this threshold is intended.
    fillna_user = media_engagement_merged_groupby_df[media_engagement_merged_groupby_df['like_cnt'] > 1].index

    # drop=True: the old positional index must not leak into the result as
    # `index` / `level_0` columns.
    media_engagement_merged_df = media_engagement_merged_df[
        media_engagement_merged_df['acnt_id'].isin(fillna_user)
    ].reset_index(drop=True)

    # Vectorised fill: look up each row's account mean and use it only where the
    # value is missing.
    for col in engagement_cols:
        account_mean = media_engagement_merged_df['acnt_id'].map(media_engagement_merged_groupby_df[col])
        media_engagement_merged_df[col] = media_engagement_merged_df[col].fillna(account_mean)

    user_list = media_engagement_merged_df['acnt_id'].unique()
    media_list = media_engagement_merged_df['media_id'].unique()

    # Accounts were removed above, so filter every table again before merging.
    user_info = user_info_df[user_info_df['acnt_id'].isin(user_list)]
    timeseries = timeseries_df[timeseries_df['acnt_id'].isin(user_list)]
    timeseries_2 = timeseries_df_2[timeseries_df_2['acnt_id'].isin(user_list)]
    media_info = media_info_df[media_info_df['acnt_id'].isin(user_list)]

    user_followtype = user_followtype_df[user_followtype_df['acnt_id'].isin(user_list)]
    user_followtype_2 = user_followtype_df_2[user_followtype_df_2['acnt_id'].isin(user_list)]

    media_insight_info = media_insight[media_insight['media_id'].isin(media_list)]

    all_merged_df_a = pd.merge(user_info, timeseries, on='acnt_id')
    all_merged_df_b = pd.merge(all_merged_df_a, media_info, on='acnt_id')
    all_merged_df = pd.merge(all_merged_df_b, media_insight_info, on='media_id')

    # Inner joins: using the filtered frames gives the same rows as the
    # unfiltered inputs and keeps the function consistent.
    media_engagement_profile_merged_df = pd.merge(media_engagement_merged_df, user_info, on='acnt_id')
    time_series_merged_df = pd.merge(timeseries, timeseries_2, on='acnt_id')

    return user_info, timeseries, timeseries_2, user_followtype, user_followtype_2, media_info, media_insight_info, all_merged_df, media_engagement_merged_df, media_engagement_profile_merged_df, time_series_merged_df

In [356]:
# Build the aligned tables from the connected-user subsets. The seventh result
# (the filtered media insight table) is stored under the name `media_agg`.
(
    user_info,
    timeseries,
    timeseries_2,
    user_followtype,
    user_followtype_2,
    media_info,
    media_agg,
    all_merged_df,
    media_engagement_merged_df,
    media_engagement_profile_merged_df,
    time_series_merged_df,
) = create_merged_df(
    user_info_df=c_recent_user_info_mtr_2,
    timeseries_df=c_time_series_profile_info,
    timeseries_df_2=c_time_series_profile_info_2,
    media_info_df=c_by_user_id_media_dtl_info_2,
    media_insight_df=c_conn_media_insight_2,
    user_followtype_df=c_conn_profile_insight_followtype,
    user_followtype_df_2=c_conn_profile_insight_followtype_2,
)

크리에이터 활동성

- 실제 게시물 업로드 날짜를 기준으로 계산할 수 있을 것 같음
- 최근 100개의 게시물의 평균 게시 간격을 계산

In [357]:
def calculate_activity_score(recent_media_dtl_df): # pass the more recent of the two media tables
    """Compute each account's mean gap between consecutive posts and a score from it.

    Rows are de-duplicated on (acnt_id, media_id, media_cn), ordered by
    ``reg_dt`` per account, and the whole-day difference to the previous post is
    averaged. Accounts with a single post have no gap and are absent from the
    result.

    Returns:
        DataFrame with ``acnt_id``, ``avg_upload_interval`` (days; an average of
        exactly 0 is replaced by 0.1) and ``activity_score``
        (= ``avg_upload_interval / 100``).
    """
    posts = recent_media_dtl_df.drop_duplicates(subset=['acnt_id', 'media_id', 'media_cn']).copy()
    posts['reg_dt'] = pd.to_datetime(posts['reg_dt'])
    posts = posts.sort_values(['acnt_id', 'reg_dt'])

    # Whole days since the account's previous post (NaN for its first post)
    posts['gap_days'] = posts.groupby('acnt_id')['reg_dt'].diff().dt.days

    # First posts carry no gap; leave them out before averaging
    mean_gap = posts.dropna(subset=['gap_days']).groupby('acnt_id')['gap_days'].mean()
    activity_df = mean_gap.rename('avg_upload_interval').reset_index()

    # NOTE(review): the original comment describes the score as the inverse of the
    # interval (shorter interval -> higher score) and the 0 -> 0.1 replacement
    # guards a division, but the code divides the interval by 100, so a longer
    # interval gives a higher score. Behaviour kept as-is - confirm the intent.
    activity_df['avg_upload_interval'] = activity_df['avg_upload_interval'].replace(0, 0.1)
    activity_df['activity_score'] = activity_df['avg_upload_interval'] / 100

    return activity_df

In [358]:
# Activity metric computed from the filtered media detail table.
activity_df = calculate_activity_score(media_info)

In [359]:
# Display the per-account upload interval and activity score.
activity_df

Unnamed: 0,acnt_id,avg_upload_interval,activity_score
0,17841400360358101,120.0,1.2
1,17841400624415491,21.545455,0.215455
2,17841409045873013,292.0,2.92
3,17841449549923448,3.191919,0.031919
4,17841450980480576,12.070707,0.120707
5,17841453615191128,1.0,0.01


In [360]:
def check_inf(df):
    """Report the rows of ``df`` that hold +inf or -inf in a float32/float64 column.

    Prints the number of affected rows and shows them with ``display`` (the
    name IPython provides inside a notebook). Returns nothing.
    """
    float_cols = df.select_dtypes(include=['float64', 'float32']).columns

    # np.isinf is True for both +inf and -inf, so one mask covers both signs
    has_inf = np.isinf(df[float_cols]).any(axis=1)
    invalid_rows = df.loc[has_inf]

    print(f"⚠️ inf / -inf 포함 행 개수: {len(invalid_rows)}개")
    display(invalid_rows)

In [361]:
# Sanity check: list any rows of the activity table containing +/-inf.
check_inf(activity_df)

⚠️ inf / -inf 포함 행 개수: 0개


Unnamed: 0,acnt_id,avg_upload_interval,activity_score


트렌드지수 - 팔로워 순 증가율 (follower_growth_rate)

In [362]:
# time_series_profile_info_2['acnt_id'] = time_series_profile_info_2['acnt_id'].astype(object)

# influencer_list = time_series_profile_info['acnt_id'].unique()
# time_series_profile_info_2[time_series_profile_info_2['acnt_id'].isin(influencer_list)]
# time_series_merged_df = pd.merge(time_series_profile_info, time_series_profile_info_2, on='acnt_id')

# time_series_merged_df['trend_score'] = ((time_series_merged_df['follower_cnt_y'] - time_series_merged_df['follower_cnt_x']) / (time_series_merged_df['follower_cnt_x'])) * 100
# time_series_merged_df[['acnt_id', 'trend_score']]

In [363]:
def calculate_follower_growth_rate(time_series_df, recent_time_series_df):
    """Percentage change in follower count between two profile snapshots.

    Args:
        time_series_df: earlier snapshot with ``acnt_id`` and ``follower_cnt``.
        recent_time_series_df: later snapshot with the same columns.

    Returns:
        DataFrame with ``acnt_id`` and ``follow_growth_rate``
        (``(recent - earlier) / earlier * 100``) for accounts present in both
        snapshots. The rate is NaN when the earlier follower count is 0.
        The input frames are not modified.
    """
    # Cast the key on copies (assign returns a new frame); the original wrote
    # the cast back into the caller's frames through .loc.
    earlier = time_series_df.assign(acnt_id=time_series_df['acnt_id'].astype(object))
    recent = recent_time_series_df.assign(acnt_id=recent_time_series_df['acnt_id'].astype(object))

    # Inner join keeps only accounts that exist in both snapshots
    time_series_merged_df = pd.merge(earlier, recent, on='acnt_id')

    followers_before = time_series_merged_df['follower_cnt_x']
    followers_after = time_series_merged_df['follower_cnt_y']
    # A zero baseline has no defined growth rate; mask it to NaN instead of
    # letting the division produce inf.
    baseline = followers_before.where(followers_before != 0)
    time_series_merged_df['follow_growth_rate'] = ((followers_after - followers_before) / baseline) * 100

    growth_rate_df = time_series_merged_df[['acnt_id', 'follow_growth_rate']]

    return growth_rate_df

In [364]:
# Follower growth between the earlier (timeseries) and later (timeseries_2) snapshots.
growth_rate_df = calculate_follower_growth_rate(timeseries, timeseries_2)

In [365]:
# Display the per-account follower growth rate (percent).
growth_rate_df

Unnamed: 0,acnt_id,follow_growth_rate
0,17841450980480576,0.0
1,17841409045873013,-1.886792
2,17841400360358101,0.813008
3,17841449549923448,-0.017911
4,17841453615191128,0.271194
5,17841400624415491,0.0


##### 팔로워 참여도

media_id는 있는데 프로필에 미디어가 없어서 0개인 사람이 있음


In [366]:
def calculate_follower_engagement(media_engagement_profile_merged_df):
    """Aggregate per-media reactions into one engagement rate per account.

    Likes, comments, shares and saves are summed per ``acnt_id``; the total is
    divided by ``media_cnt * follower_cnt`` (first value seen per account) and
    also reported as a percentage.

    Returns:
        DataFrame with ``acnt_id``, the four summed counts, ``media_cnt``,
        ``follower_cnt``, ``estimated_total_engagement`` and
        ``follower_total_engagement`` (the former times 100).
    """
    needed_cols = ['acnt_id', 'media_id', 'follower_cnt', 'like_cnt', 'cmnt_cnt', 'share_cnt', 'save_cnt', 'views_cnt', 'reach_cnt', 'media_cnt']
    per_media = media_engagement_profile_merged_df[needed_cols]
    # A media_id can be listed even though the profile's media_cnt is 0;
    # filtering those rows here is intentionally left disabled:
    # per_media = per_media[per_media['media_cnt'] != 0]

    aggregation = {reaction: 'sum' for reaction in ('like_cnt', 'cmnt_cnt', 'share_cnt', 'save_cnt')}
    aggregation.update(media_cnt='first', follower_cnt='first')
    engaged_df = per_media.groupby(['acnt_id']).agg(aggregation).reset_index()

    reactions = engaged_df['like_cnt'] + engaged_df['cmnt_cnt'] + engaged_df['share_cnt'] + engaged_df['save_cnt']
    exposure = engaged_df['media_cnt'] * engaged_df['follower_cnt']
    engaged_df['estimated_total_engagement'] = reactions / exposure
    engaged_df['follower_total_engagement'] = engaged_df['estimated_total_engagement'] * 100

    return engaged_df

In [367]:
# Per-account engagement from the media + profile merge; the last line displays it.
follower_engagment_df = calculate_follower_engagement(media_engagement_profile_merged_df)
follower_engagment_df

Unnamed: 0,acnt_id,like_cnt,cmnt_cnt,share_cnt,save_cnt,media_cnt,follower_cnt,estimated_total_engagement,follower_total_engagement
0,17841400360358101,300.0,30.0,0.0,12.0,18,124,0.153226,15.322581
1,17841400624415491,2400.0,0.0,0.0,0.0,244,439,0.022406,2.240562
2,17841409045873013,44.0,0.0,0.0,4.0,4,104,0.115385,11.538462
3,17841449549923448,38447.0,1867.0,4784.0,11506.0,374,50240,0.003012,0.301249
4,17841450980480576,15592.0,553.0,0.0,47.0,116,722,0.193333,19.33327
5,17841453615191128,74593.0,16364.0,37833.0,31004.0,1044,35495,0.004312,0.431214


In [368]:
# Sanity check: a zero media_cnt or follower_cnt would show up here as +/-inf.
check_inf(follower_engagment_df)

⚠️ inf / -inf 포함 행 개수: 0개


Unnamed: 0,acnt_id,like_cnt,cmnt_cnt,share_cnt,save_cnt,media_cnt,follower_cnt,estimated_total_engagement,follower_total_engagement


##### 팔로워 충성도

- 비즈니스 계정 연동 시 follower_type 데이터 로딩 -> 실제 follower cnt랑 안맞음 (timeseries랑 안맞음)

In [369]:
def calculate_follower_loyalty(time_series_merged_df):
    """Estimate follower retention between two snapshots.

    Expects ``follower_cnt_x`` (earlier) and ``follower_cnt_y`` (later). Only
    the net change is known, so a net gain is counted entirely as new followers
    and a net loss entirely as unfollows.

    Returns:
        DataFrame with ``acnt_id``, both counts, ``follower_change``,
        ``new_follower``, ``unfollowed`` and ``follower_retention_rate``
        (percent of the earlier followers still present, rounded to 2 decimals).
    """
    loyalty = time_series_merged_df[['acnt_id', 'follower_cnt_x', 'follower_cnt_y']].copy()
    before = loyalty['follower_cnt_x']
    after = loyalty['follower_cnt_y']

    loyalty['follower_change'] = after - before
    # Negative net change -> 0 new followers; otherwise the net gain
    loyalty['new_follower'] = loyalty['follower_change'].clip(lower=0)
    loyalty['unfollowed'] = before + loyalty['new_follower'] - after
    loyalty['follower_retention_rate'] = (((before - loyalty['unfollowed']) / before) * 100).round(2)

    return loyalty

In [370]:
# Retention estimate from the merged earlier/later time series; the last line displays it.
follower_loyalty_df = calculate_follower_loyalty(time_series_merged_df)
follower_loyalty_df

Unnamed: 0,acnt_id,follower_cnt_x,follower_cnt_y,follower_change,new_follower,unfollowed,follower_retention_rate
0,17841450980480576,722,722,0,0,0,100.0
1,17841409045873013,106,104,-2,0,2,98.11
2,17841400360358101,123,124,1,1,0,100.0
3,17841449549923448,50249,50240,-9,0,9,99.98
4,17841453615191128,35399,35495,96,96,0,100.0
5,17841400624415491,439,439,0,0,0,100.0


In [371]:
# Sanity check: list any rows of the loyalty table containing +/-inf.
check_inf(follower_loyalty_df)

⚠️ inf / -inf 포함 행 개수: 0개


Unnamed: 0,acnt_id,follower_cnt_x,follower_cnt_y,follower_change,new_follower,unfollowed,follower_retention_rate


콘텐츠 효율성 평가

- 콘텐츠 1개당 팔로워 대비 반응량(like+comments+share+save)
- media_agg & media_dtl & acnt_id가 필요 -> 전부 병합한 게 media_engagement_merged_df
- 단, 프로페셔널 전환은 이전에 했으나, views라는 지표가 나중에 나와서 0으로 찍히는 애들이 있음 -> 이런 부분은 전부 inf로 계산이 되는데 전부 제거하고 mean을 구하는 것으로 !


In [387]:
def calculate_post_efficiency_df(media_engagement_profile_merged_df):
    """Average reactions per post relative to the follower count.

    For every media row: ``(like + comment + save + share) / follower_cnt * 100``;
    the rows are then averaged per ``acnt_id``.

    Returns:
        DataFrame with ``acnt_id`` and ``avg_post_efficiency``.
    """
    scored = media_engagement_profile_merged_df.copy()

    reactions = scored['like_cnt'] + scored['cmnt_cnt'] + scored['save_cnt'] + scored['share_cnt']
    scored['post_efficiency'] = (reactions / scored['follower_cnt']) * 100

    per_account = scored.groupby('acnt_id')['post_efficiency'].mean()
    post_efficiency_df = per_account.rename('avg_post_efficiency').reset_index()

    return post_efficiency_df

In [388]:
# Uses the follower-based definition of calculate_post_efficiency_df from the cell
# above. A later cell redefines the same function name (views-based popularity),
# so this cell must run before that one.
post_efficiency_df = calculate_post_efficiency_df(media_engagement_profile_merged_df)
post_efficiency_df

Unnamed: 0,acnt_id,avg_post_efficiency
0,17841400360358101,15.322581
1,17841400624415491,5.46697
2,17841409045873013,11.538462
3,17841449549923448,1.126672
4,17841450980480576,22.426593
5,17841453615191128,4.501874


콘텐츠 인기도 평가

- 콘텐츠 1개당 조회수 대비 반응량(like+comments+share+save)
- media_agg & media_dtl & acnt_id가 필요 -> 전부 병합한 게 media_engagement_merged_df
- 단, 프로페셔널 전환은 이전에 했으나, views라는 지표가 나중에 나와서 0으로 찍히는 애들이 있음 -> 이런 부분은 전부 inf로 계산이 되는데 전부 제거하고 mean을 구하는 것으로 !

In [395]:
def calculate_post_popularity_df(media_engagement_profile_merged_df):
    """Average reactions per post relative to its view count.

    For every media row: ``(like + comment + save + share) / views_cnt * 100``.
    Rows whose ratio is undefined or infinite (``views_cnt`` of 0 or missing)
    are excluded before averaging per ``acnt_id``; an account with no usable
    row is absent from the result.

    Returns:
        DataFrame with ``acnt_id`` and ``avg_post_popularity``.
    """
    scored = media_engagement_profile_merged_df.copy()

    reactions = scored['like_cnt'] + scored['cmnt_cnt'] + scored['save_cnt'] + scored['share_cnt']
    scored['post_popularity'] = (reactions / scored['views_cnt']) * 100

    # Only the computed ratio needs cleaning; leave the other columns alone.
    scored['post_popularity'] = scored['post_popularity'].replace([np.inf, -np.inf], np.nan)
    scored = scored.dropna(subset=['post_popularity'])

    post_popularity_df = scored.groupby('acnt_id')['post_popularity'].mean().reset_index()
    post_popularity_df = post_popularity_df.rename(columns={'post_popularity': 'avg_post_popularity'})

    return post_popularity_df


# Backward-compatible alias. This cell originally re-used the name
# calculate_post_efficiency_df, shadowing the follower-based efficiency function
# defined earlier, and the following cell still calls that name.
# TODO: switch the caller to calculate_post_popularity_df and remove this alias.
calculate_post_efficiency_df = calculate_post_popularity_df

In [396]:
# NOTE: at this point calculate_post_efficiency_df refers to the views-based
# (popularity) definition from the preceding cell, not the follower-based one.
post_popularity_df = calculate_post_efficiency_df(media_engagement_profile_merged_df)
post_popularity_df

Unnamed: 0,acnt_id,avg_post_popularity
0,17841400360358101,11.057678
1,17841400624415491,6.545388
2,17841409045873013,2.020202
3,17841449549923448,4.203572
4,17841450980480576,841.965152
5,17841453615191128,3.348214


##### connected_influencer_flexmatch_score

In [397]:
def connected_user_flexmatch_score(activity_df, growth_rate_df, follower_engagement_df, follower_loyalty_df, post_efficiency_df, post_popularity_df, user_info_df=None):
    """Combine the six per-account metrics into one score table.

    Each metric frame is left-joined on acnt_id starting from activity_df, the
    account name and influencer scale type are attached, and any account that
    is missing at least one value is dropped.

    Args:
        activity_df: frame with acnt_id and activity_score.
        growth_rate_df: frame with acnt_id and follow_growth_rate.
        follower_engagement_df: frame with acnt_id and follower_total_engagement.
        follower_loyalty_df: frame with acnt_id and follower_retention_rate.
        post_efficiency_df: frame with acnt_id and avg_post_efficiency.
        post_popularity_df: frame with acnt_id and avg_post_popularity.
        user_info_df: frame with acnt_id, acnt_nm and influencer_scale_type.
            When omitted, the notebook-level ``user_info`` frame is used, which
            is what this function always did before the parameter existed.

    Returns:
        DataFrame with the columns acnt_id, acnt_nm, influencer_scale_type and
        the six metric columns, containing only rows without missing values.
        The input frames are not modified.
    """
    from functools import reduce

    if user_info_df is None:
        # Fall back to the notebook-level frame so existing six-argument calls keep working.
        user_info_df = user_info

    metric_frames = [
        # Creator activity
        activity_df[['acnt_id', 'activity_score']],
        # Trend index (net follower change)
        growth_rate_df[['acnt_id', 'follow_growth_rate']],
        # Follower engagement
        follower_engagement_df[['acnt_id', 'follower_total_engagement']],
        # Follower loyalty
        follower_loyalty_df[['acnt_id', 'follower_retention_rate']],
        # Content efficiency
        post_efficiency_df[['acnt_id', 'avg_post_efficiency']],
        # Content popularity
        post_popularity_df[['acnt_id', 'avg_post_popularity']],
    ]

    # Left joins keep every account present in activity_df; gaps become NaN and are dropped at the end.
    flexmatch_score = reduce(lambda left, right: pd.merge(left, right, on='acnt_id', how='left'), metric_frames)

    user_info_nm = user_info_df[['acnt_id', 'acnt_nm', 'influencer_scale_type']]
    # Default (inner) merge: accounts without a user-info row are excluded.
    flexmatch_score = pd.merge(flexmatch_score, user_info_nm, on='acnt_id')
    flexmatch_score = flexmatch_score[['acnt_id', 'acnt_nm', 'influencer_scale_type', 'activity_score', 'follow_growth_rate', 'follower_total_engagement', 'follower_retention_rate', 'avg_post_efficiency', 'avg_post_popularity']]

    # dropna() returns a new frame, so no explicit copy or in-place mutation is needed.
    return flexmatch_score.dropna()

In [398]:
# Build the combined per-account score table from the six metric frames and display it.
# NOTE(review): the third argument is spelled `follower_engagment_df` (no second "e"), unlike the
# function's `follower_engagement_df` parameter - confirm a variable with this spelling is defined above.
connected_flexmatch_score_table = connected_user_flexmatch_score(activity_df, growth_rate_df, follower_engagment_df, follower_loyalty_df, post_efficiency_df, post_popularity_df)
connected_flexmatch_score_table

Unnamed: 0,acnt_id,acnt_nm,influencer_scale_type,activity_score,follow_growth_rate,follower_total_engagement,follower_retention_rate,avg_post_efficiency,avg_post_popularity
0,17841400360358101,bro_bums,nano,1.2,0.813008,15.322581,100.0,15.322581,11.057678
1,17841400624415491,itsme_ddonni,nano,0.215455,0.0,2.240562,100.0,5.46697,6.545388
2,17841409045873013,__cherie_ssom,nano,2.92,-1.886792,11.538462,98.11,11.538462,2.020202
3,17841449549923448,heihwi,mid,0.031919,-0.017911,0.301249,99.98,1.126672,4.203572
4,17841450980480576,__dongx2_,nano,0.120707,0.0,19.33327,100.0,22.426593,841.965152
5,17841453615191128,bong_camper83,mid,0.01,0.271194,0.431214,100.0,4.501874,3.348214


In [399]:
# Split the score table into one frame per influencer scale tier.
# The generator yields the five filtered frames in the same order as the names on the left.
nano, micro, mid, macro, mega = (
    connected_flexmatch_score_table[connected_flexmatch_score_table['influencer_scale_type'] == scale_type]
    for scale_type in ('nano', 'micro', 'mid', 'macro', 'mega')
)

In [None]:
# influencer_scale_names = ['nano', 'micro', 'mid', 'macro', 'mega']
# influencer_scale_df_list = [nano, micro, mid, macro, mega]
# normalized_df_dict = {}
# # normalized_df_list = []

# # 값이 작을 수록 좋은 값
# reverse_columns = ['activity_score']

# for name, df in zip(influencer_scale_names, influencer_scale_df_list):
#     cleaned = df.copy()

#     # 무한대 및 NaN 제거
#     float_cols = cleaned.select_dtypes(include='float64').columns
#     cleaned[float_cols] = cleaned[float_cols].replace([np.inf, -np.inf], np.nan)
#     cleaned = cleaned.dropna(subset=float_cols)

#     if cleaned.empty:
#         continue

#     norm_df = pd.DataFrame(index=cleaned.index)
#     for col in float_cols:
#         scaler = MinMaxScaler(feature_range=(0, 5))
#         norm_col = scaler.fit_transform(cleaned[[col]])
#         norm_df[col] = 5 - norm_col.ravel() if col in reverse_columns else norm_col.ravel()
#         # norm_df[col] = norm_col.ravel() if col in reverse_columns else norm_col.ravel()

#     # ID 및 이름 붙이기
#     norm_df['acnt_id'] = cleaned['acnt_id'].values
#     norm_df['acnt_nm'] = cleaned['acnt_nm'].values
#     norm_df['influencer_scale_type'] = name

#     # 딕셔너리에 저장
#     normalized_df_dict[name] = norm_df

# normalized_all_df = pd.concat(normalized_df_dict, ignore_index=True)
# normalized_all_dic = normalized_all_df.to_dict(orient='index')


In [409]:
# normalized_all_dic

In [406]:
def normalize_influencer_scores(influencer_scale_names, influencer_scale_df_list, reverse_columns=None, feature_range=(0, 5)):
    """Min-max scale every float64 column separately within each influencer tier.

    Args:
        influencer_scale_names: tier labels, paired positionally with
            influencer_scale_df_list.
        influencer_scale_df_list: one DataFrame per tier; each needs the
            columns acnt_id and acnt_nm plus the float64 metric columns.
            The frames are not modified.
        reverse_columns: column name, or iterable of column names, whose scale
            is inverted after scaling so the smallest raw value receives the
            top of feature_range. Defaults to ['activity_score'].
        feature_range: (min, max) of the output scale.

    Returns:
        (normalized_all_df, normalized_all_dic): the concatenated per-tier
        frames with a fresh 0..n-1 index, and the same data as a
        ``{row_index: {column: value}}`` dict. When no tier has any usable
        row, an empty DataFrame and an empty dict are returned.
    """
    if reverse_columns is None:
        reverse_columns = ['activity_score']
    elif isinstance(reverse_columns, str):
        # A bare string would make `col in reverse_columns` a substring test
        # (e.g. 'score' in 'activity_score' is True), so wrap it in a list.
        reverse_columns = [reverse_columns]
    reverse_columns = set(reverse_columns)

    range_min, range_max = feature_range
    normalized_df_dict = {}

    for name, df in zip(influencer_scale_names, influencer_scale_df_list):
        cleaned = df.copy()

        # Drop rows holding inf or NaN in any float column.
        float_cols = cleaned.select_dtypes(include='float64').columns
        cleaned[float_cols] = cleaned[float_cols].replace([np.inf, -np.inf], np.nan)
        cleaned = cleaned.dropna(subset=float_cols)

        if cleaned.empty:
            continue

        norm_df = pd.DataFrame(index=cleaned.index)
        for col in float_cols:
            scaler = MinMaxScaler(feature_range=feature_range)
            scaled = scaler.fit_transform(cleaned[[col]]).ravel()
            if col in reverse_columns:
                # Mirror inside [range_min, range_max]; `range_max - x` alone is
                # only a correct reversal when range_min is 0.
                norm_df[col] = range_min + range_max - scaled
            else:
                norm_df[col] = scaled

        # Attach the identifier, account name and tier label.
        norm_df['acnt_id'] = cleaned['acnt_id'].values
        norm_df['acnt_nm'] = cleaned['acnt_nm'].values
        norm_df['influencer_scale_type'] = name

        normalized_df_dict[name] = norm_df

    if not normalized_df_dict:
        # pd.concat raises ValueError on an empty sequence; report "nothing to normalize" instead.
        return pd.DataFrame(), {}

    normalized_all_df = pd.concat(normalized_df_dict.values(), ignore_index=True)
    normalized_all_dic = normalized_all_df.to_dict(orient='index')

    return normalized_all_df, normalized_all_dic


In [407]:
influencer_scale_names = ['nano', 'micro', 'mid', 'macro', 'mega']
influencer_scale_df_list = [nano, micro, mid, macro, mega]

# reverse_columns is a collection of column names: pass a list so membership is tested
# per column name rather than against the characters of a single string.
normalized_df, normalized_dic = normalize_influencer_scores(
    influencer_scale_names,
    influencer_scale_df_list,
    reverse_columns=['activity_score'],
)


In [408]:
# Show the dict returned by normalize_influencer_scores in the previous cell. That cell binds it
# to `normalized_dic`; `normalized_all_dic` only exists inside the function, so it would raise
# NameError on a fresh kernel.
normalized_dic

{0: {'activity_score': 3.0722043806155956,
  'follow_growth_rate': 5.0,
  'follower_total_engagement': 3.826783619187695,
  'follower_retention_rate': 5.0,
  'avg_post_efficiency': 2.905610166117971,
  'avg_post_popularity': 0.05379802756951291,
  'acnt_id': '17841400360358101',
  'acnt_nm': 'bro_bums',
  'influencer_scale_type': 'nano'},
 1: {'activity_score': 4.830765344784036,
  'follow_growth_rate': 3.494318181818182,
  'follower_total_engagement': 0.0,
  'follower_retention_rate': 5.0,
  'avg_post_efficiency': 0.0,
  'avg_post_popularity': 0.026937394840734222,
  'acnt_id': '17841400624415491',
  'acnt_nm': 'itsme_ddonni',
  'influencer_scale_type': 'nano'},
 2: {'activity_score': 0.0,
  'follow_growth_rate': 0.0,
  'follower_total_engagement': 2.7198440100686727,
  'follower_retention_rate': 0.0,
  'avg_post_efficiency': 1.7899841766065634,
  'avg_post_popularity': 0.0,
  'acnt_id': '17841409045873013',
  'acnt_nm': '__cherie_ssom',
  'influencer_scale_type': 'nano'},
 3: {'activ

In [410]:
# def normalize_influencer_scores(influencer_scale_names, influencer_scale_df_list, reverse_columns=None, feature_range=(0, 5)):

#     if reverse_columns is None:
#         reverse_columns = ['activity_score']

#     normalized_df_list = []

#     for name, df in zip(influencer_scale_names, influencer_scale_df_list):
#         cleaned = df.copy()

#         if 'interest_category' not in cleaned.columns:
#             raise ValueError(f"'{name}' 데이터프레임에 'interest_category' 컬럼이 없습니다.")

#         # 무한대 및 NaN 제거
#         float_cols = cleaned.select_dtypes(include='float64').columns
#         cleaned[float_cols] = cleaned[float_cols].replace([np.inf, -np.inf], np.nan)
#         cleaned = cleaned.dropna(subset=float_cols)

#         if cleaned.empty:
#             continue

#         # 관심 카테고리별로 그룹화하여 정규화
#         grouped = cleaned.groupby('interest_category')

#         for category, group_df in grouped:
#             if group_df.empty:
#                 continue

#             norm_df = pd.DataFrame(index=group_df.index)

#             for col in float_cols:
#                 scaler = MinMaxScaler(feature_range=feature_range)
#                 norm_col = scaler.fit_transform(group_df[[col]])

#                 if col in reverse_columns:
#                     norm_df[col] = feature_range[1] - norm_col.ravel()
#                 else:
#                     norm_df[col] = norm_col.ravel()

#             norm_df['acnt_id'] = group_df['acnt_id'].values
#             norm_df['acnt_nm'] = group_df['acnt_nm'].values
#             norm_df['interest_category'] = group_df['interest_category'].values
#             norm_df['influencer_scale_type'] = name

#             normalized_df_list.append(norm_df)

#     normalized_all_df = pd.concat(normalized_df_list, ignore_index=True)
#     normalized_all_dic = normalized_all_df.to_dict(orient='index')

#     return normalized_all_df, normalized_all_dic


##### 크리에이터 광고 효율성 

In [385]:
# Hand-entered sales figures per account name, joined onto the per-post media frame to derive
# an "advertisement efficiency" figure per account.
# NOTE(review): the recorded output of this cell is
#   KeyError: "['follower_cnt', 'media_cnt'] not in index"
# i.e. the column selection below fails because the merged frame lacks those two columns.
# Confirm which source frame is supposed to provide them before relying on this cell.
revenue_dic = {
    'acnt_nm' : ['s_h_j_', 'siwolbubu_hyun', 'bong_camper83', 'binwoos', 'seojinii_', 'tingkerhee'],
    'sell_type' : ['flexmatch', 'other', 'flexmatch', 'flexmatch', 'other', 'other'],
    'total_revenue' : [6906000, 10937105, 7233100, 8759000, 7939664, 38449720],
    'total_order_cnt' : [84, 132, 122, 88, 89, 471]
    }

revenue_df = pd.DataFrame(revenue_dic)
# Left join: every media row is kept; accounts without sales data get NaN in the revenue columns.
revenue_merged_df = pd.merge(media_engagement_merged_df, revenue_df, on='acnt_nm', how='left')
revenue_merged_df = revenue_merged_df[['acnt_id', 'acnt_nm', 'follower_cnt', 'follow_cnt', 'media_cnt', 'sell_type', 'total_revenue', 'total_order_cnt', 'media_id', 'like_cnt', 'cmnt_cnt']]
# A bare expression in the middle of a cell is not displayed; only the cell's last expression is.
revenue_merged_df

# First variant: orders divided by the mean per-post (likes + comments) / followers percentage.
revenue_merged_df['post_efficiency'] = ((revenue_merged_df['like_cnt'] + revenue_merged_df['cmnt_cnt']) / revenue_merged_df['follower_cnt']) * 100
revenue_df_total = revenue_merged_df.groupby(['acnt_id', 'acnt_nm']).agg({
    'post_efficiency' : 'mean',
    'total_order_cnt' : 'first'
}).dropna()

revenue_df_total['advertisement_efficiency'] = (revenue_df_total['total_order_cnt'] / revenue_df_total['post_efficiency']) 
revenue_df_total.sort_values(by='advertisement_efficiency', ascending=False)

# Second variant: `revenue_df_total` is rebuilt below, so the first variant's result is discarded.
revenue_merged_df['engagement_per_post'] = ((revenue_merged_df['like_cnt'] + revenue_merged_df['cmnt_cnt']) / 25)
revenue_df_total = revenue_merged_df.groupby(['acnt_id', 'acnt_nm']).agg({
    'engagement_per_post' : 'mean',
    'total_revenue' : 'first',
    'total_order_cnt' : 'first'
}).dropna()

# Ignore the follower count and compute the reaction ratio per unit of 25 posts.
# NOTE(review): `a / b * 25` evaluates as `(a / b) * 25`; confirm that is the intended formula
# rather than `a / (b * 25)`.
revenue_df_total['advertisement_efficiency'] = (revenue_df_total['total_order_cnt'] / revenue_df_total['engagement_per_post'] * 25) * 100
revenue_df_total.sort_values(by='advertisement_efficiency', ascending=False)

KeyError: "['follower_cnt', 'media_cnt'] not in index"

콘텐츠 효율성이 아닌 팔로워 참여도를 기준으로 계산했을 때

In [None]:
# Advertisement efficiency measured against follower engagement: orders per unit of
# follower_total_engagement, shown from the highest value down.
engaged_df_2 = engaged_df.copy()

# One row per account with its first recorded revenue / order totals; accounts lacking either are dropped.
revenue_merged_df_2 = (
    revenue_merged_df
    .groupby(['acnt_id', 'acnt_nm'])[['total_revenue', 'total_order_cnt']]
    .first()
    .dropna()
    .reset_index()
)
revenue_df_total_2 = engaged_df_2.merge(revenue_merged_df_2, on='acnt_id')

revenue_df_total_2['advertisement_efficiency'] = revenue_df_total_2['total_order_cnt'].div(
    revenue_df_total_2['follower_total_engagement']
)
revenue_df_total_2[
    ['acnt_id', 'acnt_nm', 'total_revenue', 'total_order_cnt', 'follower_total_engagement', 'advertisement_efficiency']
].sort_values(by='advertisement_efficiency', ascending=False)

table merged

In [None]:
# Creator activity: account id plus avg_upload_interval from activity_df
creator_activity_score = activity_df[['acnt_id', 'avg_upload_interval']]

In [None]:
# Trend index: account id plus trend_score from time_series_merged_df
creator_trend_score = time_series_merged_df[['acnt_id', 'trend_score']]

In [None]:
# Follower engagement: account id plus follower_total_engagement from engaged_df
follower_engagement = engaged_df[['acnt_id', 'follower_total_engagement']]

In [None]:
# Follower loyalty: account id plus follower_retention_rate from time_series_merged_df_copy
follower_loyalty = time_series_merged_df_copy[['acnt_id', 'follower_retention_rate']]

In [None]:
# Content efficiency: independent copy of post_efficiency_df (all of its columns are kept)
post_efficiency = post_efficiency_df.copy()

In [None]:
# Advertisement efficiency: reset_index() moves the current index of revenue_df_total back into
# ordinary columns so that acnt_id can be selected alongside the score.
revenue_df_total = revenue_df_total.reset_index()
advertisement_efficiency = revenue_df_total[['acnt_id', 'advertisement_efficiency']]

In [None]:
# Per-account metric frames to be joined on acnt_id; the first entry is the left-most frame of the join.
df_list = [creator_activity_score, creator_trend_score, follower_engagement, follower_loyalty, post_efficiency, advertisement_efficiency]

In [None]:
from functools import reduce

# Fold df_list into one frame with successive left joins on acnt_id: every account of the first
# frame is kept, and a metric missing for an account is left as NaN.
flexmatch_score = reduce(lambda left, right: pd.merge(left, right, on='acnt_id', how='left'), df_list)

In [None]:
# Attach the account name. pd.merge defaults to an inner join, so accounts that are absent from
# user_info_2 are dropped from flexmatch_score here.
user_info_nm = user_info_2[['acnt_id', 'acnt_nm']]
flexmatch_score = pd.merge(flexmatch_score, user_info_nm, on='acnt_id')

In [None]:
# Keep the identifier columns plus the six metrics, then relabel the metrics with their
# Korean display names in a single chained expression.
flexmatch_score = flexmatch_score[
    [
        'acnt_id',
        'acnt_nm',
        'avg_upload_interval',
        'trend_score',
        'follower_total_engagement',
        'follower_retention_rate',
        'avg_post_efficiency',
        'advertisement_efficiency',
    ]
].rename(
    columns={
        'avg_upload_interval': '크리에이터 활동성',
        'trend_score': '트렌드지수(팔로워순증가량)',
        'follower_total_engagement': '콘텐츠 참여도',
        'follower_retention_rate': '팔로워 충성도',
        'avg_post_efficiency': '콘텐츠 효율성',
        'advertisement_efficiency': '광고 효율성',
    }
)

In [None]:
# Rows with any missing metric are excluded; dropna() returns a new frame, so flexmatch_score is left untouched.
flexmatch_score_2 = flexmatch_score.dropna()

In [None]:
# Display the accounts that have a value for every metric.
flexmatch_score_2

Unnamed: 0,acnt_id,acnt_nm,크리에이터 활동성,트렌드지수(팔로워순증가량),콘텐츠 참여도,팔로워 충성도,콘텐츠 효율성,광고 효율성
0,17841400361359004,s_h_j_,1.166667,0.057866,110.188904,100.0,0.045178,1859.316062
1,17841400561503844,binwoos,0.5,-0.014956,626.6426,99.99,0.16973,518.470975
2,17841400591698216,tingkerhee,2.416667,-0.018509,2065.579146,99.98,0.483403,974.343202
4,17841401506106699,siwolbubu_hyun,0.958333,-0.075045,2094.996055,99.92,0.556736,237.096389
6,17841402936102997,seojinii_,2.875,-0.002312,166.809026,100.0,0.240359,370.279723
10,17841453615191128,bong_camper83,1.5,0.281793,2526.57868,100.0,2.506526,48.672935


In [None]:
# from sklearn.preprocessing import MinMaxScaler

# columns = flexmatch_score_2.select_dtypes(include='float64').columns
# minmax = MinMaxScaler(feature_range=(0, 5))
# flexmatch_score_norm = minmax.fit_transform(flexmatch_score_2[columns])

# flexmatch_score_norm_df = pd.DataFrame(flexmatch_score_norm, columns=columns, index=flexmatch_score_2.index)
# flexmatch_score_norm_df['acnt_id'] = flexmatch_score_2['acnt_id']
# flexmatch_score_norm_df['acnt_nm'] = flexmatch_score_2['acnt_nm']

# flexmatch_score_norm_df['avg_upload_interval'] = 5 - flexmatch_score_norm_df['avg_upload_interval']
# flexmatch_score_norm_df


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale every float64 column of flexmatch_score_2 to the 0-5 range in one pass over all accounts.
columns = flexmatch_score_2.select_dtypes(include='float64').columns
minmax = MinMaxScaler(feature_range=(0, 5))
flexmatch_score_norm = minmax.fit_transform(flexmatch_score_2[columns])

# Rebuild a DataFrame on the original index so the identifier columns below align row by row.
flexmatch_score_norm_df = pd.DataFrame(flexmatch_score_norm, columns=columns, index=flexmatch_score_2.index)
flexmatch_score_norm_df['acnt_id'] = flexmatch_score_2['acnt_id']
flexmatch_score_norm_df['acnt_nm'] = flexmatch_score_2['acnt_nm']

# Invert the 0-5 scale for this column (5 - value), so the smallest raw value maps to 5.
flexmatch_score_norm_df['크리에이터 활동성'] = 5 - flexmatch_score_norm_df['크리에이터 활동성']
flexmatch_score_norm_df

# This normalization pools every account together. Ideally accounts would first be split by
# follower count (mega influencers, micro influencers, and so on) and normalized within each
# group, which should give more accurate values.


Unnamed: 0,크리에이터 활동성,트렌드지수(팔로워순증가량),콘텐츠 참여도,팔로워 충성도,콘텐츠 효율성,광고 효율성,acnt_id,acnt_nm
0,3.596491,1.862339,0.0,5.0,0.0,5.0,17841400361359004,s_h_j_
1,5.0,0.84196,1.068647,4.375,0.253016,1.297324,17841400561503844,binwoos
2,0.964912,0.792182,4.046099,3.75,0.890213,2.556192,17841400591698216,tingkerhee
4,4.035088,0.0,4.106968,0.0,1.039182,0.520322,17841401506106699,siwolbubu_hyun
6,0.0,1.019133,0.117159,5.0,0.396492,0.888101,17841402936102997,seojinii_
10,2.894737,5.0,5.0,5.0,5.0,0.0,17841453615191128,bong_camper83


In [None]:
# flexmatch_score_norm_df.to_csv("flexmatch_score_test.csv")