In [13]:
import pandas as pd
import numpy as np
import os
from encoding_function import low_frequency_to_others
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

In [14]:
def add_y(df):
    df['rank'] = df['rank'].replace(0, 6)

    target_cols = [
        '단승', '복승', '삼복승'
    ]
    for i, col in enumerate(target_cols):
        condition_target = df['rank'] <= i+1
        df.loc[condition_target, col] = 1

    df[target_cols] = df[target_cols].fillna(0)
    df.drop(['rank'], axis=1, inplace=True)
    return df


ROOT_DIR = "data"
RANDOM_STATE = 42

train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

train['성별'] = train['성별'].map({'남': 0, '여': 1})
test['성별'] = test['성별'].map({'남': 0, '여': 1})

train = add_y(train)
test = add_y(test)

print(train.shape, test.shape)
train.head()

(79950, 45) (6102, 45)


Unnamed: 0,Race_ID,연도,회차,일차,경주번호,번호,등급,기수,선수명,성별,...,모터_연대율3,전탑승선수1,전탑승선수2,보트번호,보트_평균착순점,보트_연대율,특이사항,단승,복승,삼복승
0,2013_1_1_1,2013,1,1,1,1,A2,2,이용세,0,...,46.3,김신오/25,김선필/25,10,5.6,32.8,,0.0,1.0,1.0
1,2013_1_1_1,2013,1,1,1,2,B1,7,손동민,0,...,70.9,김승택/3,경상수/1316,68,5.59,40.7,,0.0,0.0,1.0
2,2013_1_1_1,2013,1,1,1,3,B2,11,박진서,0,...,55.7,송종해/45,문주엽/4,22,5.07,30.7,,0.0,0.0,0.0
3,2013_1_1_1,2013,1,1,1,4,B1,4,경상수,0,...,75.0,이종한/4,문주엽/223,77,5.49,39.7,,1.0,1.0,1.0
4,2013_1_1_1,2013,1,1,1,5,B1,8,정주현,0,...,22.4,이창규/6,반혜진/545,18,5.69,37.6,,0.0,0.0,0.0


In [15]:
def add_weight_penalty(df):
    weight_limit_male = 55
    weight_limit_female = 51
    df['체중'] = df['체중'].astype(float)
    
    # 중량 부과 여부 계산 (성별에 따라 다른 기준 적용)
    df['중량부과여부'] = df.apply(lambda row: 1 if (row['성별'] == 0 and row['체중'] < weight_limit_male) or
                                             (row['성별'] == 1 and row['체중'] < weight_limit_female) else 0, axis=1)

    # 부과된 중량 계산 (성별에 따라 차감된 체중을 계산)
    df['부과된중량'] = df.apply(lambda row: max(0, weight_limit_male - row['체중']) if row['성별'] == 0 else
    max(0, weight_limit_female - row['체중']), axis=1)
    
    df.drop('체중', axis=1, inplace=True)

    return df

train = add_weight_penalty(train)
test = add_weight_penalty(test)

In [16]:
def bin_age(df):
    # 나이를 5개 구간으로 나누어 bin화 (예: 0-20, 20-30, 30-40, 40-50, 50+)
    bins = [20, 30, 40, 50, 100]
    labels = ['20-30', '30-40', '40-50', '50+']

    # 나이 변수를 bin화
    df['나이'] = pd.cut(df['나이'], bins=bins, labels=labels, right=False)

    return df

train = bin_age(train)
test = bin_age(test)

In [17]:
def drop_columns_from_datasets(df):
    drop_cols = [
        '연도', '회차', '일차', '경주번호',
        '금일출주경주',
        '모터번호', '전탑승선수1', '전탑승선수2',
        '보트번호', '특이사항',
        'FL' # F뒤의 숫자는 현재 반기의 실격 횟수, L뒤의 숫자는 반칙횟수?
    ]

    df = df.drop(drop_cols, axis=1)

    return df


train = drop_columns_from_datasets(train)
test = drop_columns_from_datasets(test)

print(train.shape, test.shape)

(79950, 35) (6102, 35)


In [18]:
def reverse_rank_values(df_train, df_val):
    cols_to_reverse = [
        '최근6회차_평균착순점', '최근6회차_평균득점',
        '연간성적_평균착순점',
        '모터_평균착순점',
        '보트_평균착순점'
    ]
    for col in cols_to_reverse:
        max_rank = df_train[col].max()
        # print(f'{col}: {max_rank}')
        df_train[col] = max_rank - df_train[col]
        df_val[col] = max_rank - df_val[col]
    
        df_train[col].fillna(0, inplace=True)
        df_val[col].fillna(0, inplace=True)
    
    return df_train, df_val


train, test = reverse_rank_values(train, test)

In [19]:
import re

def extract_numbers(result):
    if isinstance(result, str):  # result가 문자열인 경우에만 처리
        return re.findall(r'-(\d+)-', result)  # -숫자- 형식에서 중간 숫자 추출
    return []  # 문자열이 아니면 빈 리스트 반환

def calculate_mean(numbers):
    numbers = [int(num) for num in numbers]  # 리스트 안의 문자열 숫자를 정수로 변환
    if len(numbers) > 0:
        return np.mean(numbers)  # 리스트가 비어있지 않으면 평균 계산
    else:
        return np.nan  # 빈 리스트인 경우 NaN 반환

def last_race_process(df_train, df_val):
    df_train['전일성적'] = df_train['전일성적'].apply(extract_numbers)  # 순위만 추출 (코스도 같이 추출?)
    df_train['전일성적'] = df_train['전일성적'].apply(calculate_mean) # 평균 계산

    df_val['전일성적'] = df_val['전일성적'].apply(extract_numbers)
    df_val['전일성적'] = df_val['전일성적'].apply(calculate_mean)

    max_rank = df_train['전일성적'].max()
    df_train['전일성적'] = max_rank - df_train['전일성적'] + 1
    df_val['전일성적'] = max_rank - df_val['전일성적'] + 1
    
    df_train['전일성적'].fillna(1, inplace=True)
    df_val['전일성적'].fillna(1, inplace=True)
    
    return df_train, df_val

train, test = last_race_process(train, test)

In [20]:
def separation_course(df):
    col_list = [
        '코스_1코스', '코스_2코스', '코스_3코스', '코스_4코스', '코스_5코스', '코스_6코스'
    ]
    for col in col_list:
        df[[f'{col[3:]}_성적', f'{col[3:]}_경기수']] = df[col].fillna('').str.split('/', expand=True)

    df.drop(col_list, axis=1, inplace=True)

    return df
        


train = separation_course(train)
test = separation_course(test)

print(train.shape, test.shape)

(79950, 41) (6102, 41)


In [21]:
def apply_laplace_smoothing(df, col, global_mean, alpha):
    # 경기수 0인 값이 너무 높게 나오는 경향이 있어 분모에 상수 1 추가(없애도 됨)
    encoded_value = (df[f'{col}_성적'] * df[f'{col}_경기수'] + global_mean * alpha) / (1 + df[f'{col}_경기수'] + alpha)
    df[f'{col}_성적'] = encoded_value

    return df

def laplace_smoothing_to_course(train, val=None, alpha=1):
    col_list = [
        '1코스', '2코스', '3코스', '4코스', '5코스', '6코스'
    ]
    for col in col_list:
        train[f'{col}_성적'] = train[f'{col}_성적'].astype(float)
        train[f'{col}_경기수'] = train[f'{col}_경기수'].astype(float)
        if val is not None:
            val[f'{col}_성적'] = val[f'{col}_성적'].astype(float)
            val[f'{col}_경기수'] = val[f'{col}_경기수'].astype(float)

    # Train 데이터에서 글로벌 평균 계산
    global_means = {col: train[f'{col}_성적'].mean() for col in col_list}

    for col in col_list:
        # Train 데이터에 라플라스 스무딩 적용
        train = apply_laplace_smoothing(train, col, global_means[col], alpha)
        train.drop(f'{col}_경기수', axis=1, inplace=True)

    if val is not None:
        for col in col_list:
            # Validation 데이터에 Train에서 구한 글로벌 평균으로 라플라스 스무딩 적용
            val = apply_laplace_smoothing(val, col, global_means[col], alpha)
            val.drop(f'{col}_경기수', axis=1, inplace=True)

    if val is not None:
        return train, val
    else:
        return train


train, test = laplace_smoothing_to_course(train, test, alpha=1) # 알파가 작을수록 빈도수에 가깝세, 알파가 클수록 전체 평균에 가깝게
print(train.shape, test.shape)

(79950, 35) (6102, 35)


In [22]:
def set_course_scores(df):
    # '코스_성적' 열을 초기화
    df['코스_성적'] = 0

    # '번호' 열에 따른 '코스_성적' 값 설정
    for i in range(1, 7):
        mask = df['번호'] == i
        df.loc[mask, '코스_성적'] = df.loc[mask, f'{i}코스_성적']

    # 제거할 열 목록
    drop_cols = [f'{i}코스_성적' for i in range(1, 7)]

    # 열 제거
    df = df.drop(columns=drop_cols)

    return df


from sklearn.decomposition import PCA
def apply_pca(df_train, df_val, n_components=2):
    course_cols = [f'{i}코스_성적' for i in range(1, 7)]

    pca = PCA(n_components=n_components)
    pca.fit(df_train[course_cols])

    # 훈련 데이터에 PCA 변환 적용 (transform)
    train_pca = pca.transform(df_train[course_cols])
    val_pca = pca.transform(df_val[course_cols])
    for i in range(n_components):
        col_name = f'PCA_코스성적_{i+1}'
        df_train[col_name] = train_pca[:, i]
        df_val[col_name] = val_pca[:, i]

    # 각 컴포넌트별 분산 설명 비율 출력
    # explained_variance_ratios = pca.explained_variance_ratio_
    # for i, ratio in enumerate(explained_variance_ratios):
    #     print(f"PCA Component {i+1}: {ratio:.4f} variance explained")
    
    # n_components=2가 제일 좋아보임

    return df_train, df_val


train, test = apply_pca(train, test, n_components=2)
train = set_course_scores(train)
test = set_course_scores(test)

In [23]:
def split_last_eight_rank(df):
    for i in range(0, 4):
        df[f'최근{i+1}경기_착순'] = df['최근8경주_착순'].str[i]
    for j in range(5, 9):
        df[f'최근{j}경기_착순'] = df['최근8경주_착순'].str[j]
        
    df.drop('최근8경주_착순', axis=1, inplace=True)
    
    return df

# def adjust_for_top3(df):
#     col_list = [
#         '최근1경기_착순', '최근2경기_착순', '최근3경기_착순', '최근4경기_착순',
#         '최근5경기_착순', '최근6경기_착순', '최근7경기_착순', '최근8경기_착순'
#     ]
# 
#     for col in col_list:
#     # 순위가 1, 2, 3이 아닌 경우, 결측인 경우, 6으로 조정
#     # (일반화된 성능을 위해 + 3등내에 드는게 중요)
#         df[col] = df[col].apply(lambda x: x if x in ['1', '2', '3'] else '-1')
# 
#     return df

def adjust_last_eight_rank(df_train, df_val):
    col_list = [
        '최근1경기_착순', '최근2경기_착순', '최근3경기_착순', '최근4경기_착순',
        '최근5경기_착순', '최근6경기_착순', '최근7경기_착순', '최근8경기_착순'
    ]

    for col in col_list:
        df_train[col] = df_train[col].fillna(6).astype(int)
        df_val[col] = df_val[col].fillna(6).astype(int)
        
        df_train[col] = df_train[col].replace(0, 6)  # 가끔씩 0이 있는 경우가 존재
        df_val[col] = df_val[col].replace(0, 6)

        max_rank = df_train[col].max()
        df_train[col] = max_rank - df_train[col]
        df_val[col] = max_rank - df_val[col]
    
    first_mean_cols = 3
    col_to_mean_1 = col_list[:first_mean_cols]

    df_train[f'최근{first_mean_cols}경기_평균'] = df_train[col_to_mean_1].mean(axis=1)
    df_val[f'최근{first_mean_cols}경기_평균'] = df_val[col_to_mean_1].mean(axis=1)

    df_train.drop(col_list, axis=1, inplace=True)
    df_val.drop(col_list, axis=1, inplace=True)

    return df_train, df_val


train = split_last_eight_rank(train)
# train = adjust_for_top3(train)

test = split_last_eight_rank(test)
# test = adjust_for_top3(test)

train, test = adjust_last_eight_rank(train, test)

print(train.shape, test.shape)

(79950, 32) (6102, 32)


In [24]:
def cal_cat_cols(train, val=None):
    objective_cols = []

    # 특정 문자열이 포함된 열을 범주형 변수로 지정
    # cat_kerword_list = ['번호', '기수', '경기_착순']
    cat_kerword_list = ['번호']
    for col in train.columns:
        if any(sub in col for sub in cat_kerword_list):
            objective_cols.append(col)
            train[col] = train[col].astype('str')
            val[col] = val[col].astype('str')
            
    # 나머지 열에 대해 숫자형 변환 시도
    for col in train.columns:
        if col in objective_cols:
            continue  # 이미 범주형으로 처리된 열은 제외
        try:
            # 'float' 타입으로 변환 시도
            train[col] = train[col].astype('float')
            val[col] = val[col].astype('float')
        except:
            objective_cols.append(col)

    cat_features = list(set(objective_cols) - set(['rank', 'Race_ID']))
    cat_features = [feature for feature in cat_features if '단승' not in feature]
    cat_features = [feature for feature in cat_features if '복승' not in feature]
    cat_features = [feature for feature in cat_features if '삼복승' not in feature]

    return cat_features


cat_features = cal_cat_cols(train, test)
cat_features

['등급', '번호', '선수명', '나이']

In [25]:
drop_cols = [
    'Race_ID', '번호', '단승', '복승', '삼복승', '선수명', '기수'
]

X_train = train.drop(drop_cols, axis=1)
y_train = train[['단승']]
X_test = test.drop(drop_cols, axis=1)
y_test = test[['단승']]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(79950, 25) (79950, 1) (6102, 25) (6102, 1)


In [68]:
def all_precoess(train, val, target='복승', is_train=True):
    train['성별'] = train['성별'].map({'남': 0, '여': 1})
    val['성별'] = val['성별'].map({'남': 0, '여': 1})
    
    train = add_y(train)
    if is_train:
        val = add_y(val)

    train = add_weight_penalty(train) # 연승 정확도를 높이려면 제거
    val = add_weight_penalty(val) 

    train = bin_age(train)
    val = bin_age(val)
    
    train = drop_columns_from_datasets(train)
    val = drop_columns_from_datasets(val)

    train, val = last_race_process(train, val)

    train, val = reverse_rank_values(train, val)

    train = separation_course(train)
    val = separation_course(val)

    train, val = laplace_smoothing_to_course(train, val, alpha=1)

    train, val = apply_pca(train, val, n_components=2)
    
    train = set_course_scores(train)
    val = set_course_scores(val)
    
    train = split_last_eight_rank(train)
    # train = adjust_for_top3(train)
    
    val = split_last_eight_rank(val)
    # val = adjust_for_top3(val)

    train, val = adjust_last_eight_rank(train, val)

    if is_train:
        drop_cols = [
            'Race_ID', '번호', '단승', '복승', '삼복승', '선수명', '기수'
        ]
    else:
        drop_cols = [
            '번호', '선수명', '기수'
        ]
    
    X_train = train.drop(drop_cols, axis=1)
    y_train = train[[target]]
    # X_train = reshape_race_data(X_train, players_per_race=6)
    # y_train = reshape_race_data(y_train, players_per_race=6)
    y_train = np.array(y_train).astype(int)

    X_val = val.drop(drop_cols, axis=1)
    # X_val = reshape_race_data(X_val, players_per_race=6)
    if is_train:
        y_val = val[[target]]
        # y_val = reshape_race_data(y_val, players_per_race=6)
        y_val = np.array(y_val).astype(int)

    # X_train, X_val = low_to_others(X_train, X_val, threshold=5, is_train=is_train, verbose=False)
    
    if is_train:
        return X_train, y_train, X_val, y_val
    else:
        return X_train, y_train, X_val
    
train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

X_train, y_train, X_test, y_test = all_precoess(train, test, is_train=True)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(79950, 25) (79950, 1) (6102, 25) (6102, 1)


In [69]:
train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
val = pd.read_csv(os.path.join(ROOT_DIR, "val.csv"))

In [70]:
target_value='복승'
X_train, y_train, X_val, y_val = all_precoess(train, val, target=target_value, is_train=True)

In [71]:
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(79950, 25) (79950, 1) (8952, 25) (8952, 1)


In [72]:
# Count the number of 1s
count_of_ones = np.count_nonzero(y_train == 1)

print(count_of_ones)

26650


In [73]:
X_train['result'] = y_train
X_val['result'] = y_val
X_test['result'] = y_test

In [74]:
data = X_train

In [75]:
grouped_data = [data.iloc[i:i+6] for i in range(66000, len(data), 6)]

In [76]:
len(grouped_data)

2325

In [77]:
X_train.columns

Index(['등급', '성별', '나이', '최근6회차_평균착순점', '최근6회차_평균득점', '최근6회차_승률', '최근6회차_연대율2',
       '최근6회차_연대율3', '최근6회차_평균ST', '연간성적_평균착순점', '연간성적_연대율', '평균사고점', '전일성적',
       '출주횟수', '모터_평균착순점', '모터_연대율2', '모터_연대율3', '보트_평균착순점', '보트_연대율',
       '중량부과여부', '부과된중량', 'PCA_코스성적_1', 'PCA_코스성적_2', '코스_성적', '최근3경기_평균',
       'result'],
      dtype='object')

In [101]:
# Adjust the function to correctly determine the winning player index within each group
def create_jsonl_format_grouped(group):
    # Constructing the user input prompt with relevant player information for a group of 6
    user_input = " ".join([
        f"선수{i+1}: 등급: {row['등급']}, 성별: {row['성별']}, 나이: {row['나이']}, "
        f"최근6회차_평균착순점: {row['최근6회차_평균착순점']:.2f}, 최근6회차_평균득점: {row['최근6회차_평균득점']:.2f}, "
        f"최근6회차_승률: {row['최근6회차_승률']}, 최근6회차_연대율2: {row['최근6회차_연대율2']}, "
        f"최근6회차_연대율3: {row['최근6회차_연대율3']}, 최근6회차_평균ST: {row['최근6회차_평균ST']}, "
        f"연간성적_평균착순점: {row['연간성적_평균착순점']}, 연간성적_연대율: {row['연간성적_연대율']}, "
        f"평균사고점: {row['평균사고점']}, 전일성적: {row['전일성적']}, 출주횟수: {row['출주횟수']}, "
        f"모터_평균착순점: {row['모터_평균착순점']:.2f}, 모터_연대율2: {row['모터_연대율2']}, "
        f"모터_연대율3: {row['모터_연대율3']}, 보트_평균착순점: {row['보트_평균착순점']:.2f}, "
        f"보트_연대율: {row['보트_연대율']}, 중량부과여부: {row['중량부과여부']}, 부과된중량: {row['부과된중량']}, "
        f"PCA_코스성적_1: {row['PCA_코스성적_1']:.2f}, PCA_코스성적_2: {row['PCA_코스성적_2']:.2f}, "
        f"코스_성적: {row['코스_성적']:.2f}, 최근3경기_평균: {row['최근3경기_평균']:.2f}"
        for i, (_, row) in enumerate(group.iterrows())
    ])


    winning_players_indices = (group[group['result'] == 1].index % 6 + 1).tolist()
    assistant_output = f"우승한 선수의 번호는 {', '.join(map(str, winning_players_indices))}번 입니다."
    
    jsonl_entry = {
        "messages": [
            {"role": "system", "content": "당신은 유능한 순위예측 전문가입니다."},
            {"role": "user", "content": user_input.strip()},
            {"role": "assistant", "content": assistant_output.strip()}
        ]
    }
    return jsonl_entry

In [102]:
jsonl_data_grouped = [create_jsonl_format_grouped(group) for group in grouped_data if len(group) == 6]

In [103]:
jsonl_data_grouped[0]['messages']

[{'role': 'system', 'content': '당신은 유능한 순위예측 전문가입니다.'},
 {'role': 'user',
  'content': '선수1: 등급: B1, 성별: 1, 나이: 30-40, 최근6회차_평균착순점: 5.13, 최근6회차_평균득점: 5.47, 최근6회차_승률: 6.7, 최근6회차_연대율2: 46.7, 최근6회차_연대율3: 53.3, 최근6회차_평균ST: 0.23, 연간성적_평균착순점: 6.25, 연간성적_연대율: 33.3, 평균사고점: 0.42, 전일성적: 1.0, 출주횟수: 22, 모터_평균착순점: 6.06, 모터_연대율2: 25.0, 모터_연대율3: 43.1, 보트_평균착순점: 4.11, 보트_연대율: 52.9, 중량부과여부: 1, 부과된중량: 2.0, PCA_코스성적_1: -4.84, PCA_코스성적_2: 17.20, 코스_성적: 25.15, 최근3경기_평균: 2.67 선수2: 등급: B2, 성별: 1, 나이: 20-30, 최근6회차_평균착순점: 6.43, 최근6회차_평균득점: 6.81, 최근6회차_승률: 0.0, 최근6회차_연대율2: 12.5, 최근6회차_연대율3: 18.8, 최근6회차_평균ST: 0.24, 연간성적_평균착순점: 7.55, 연간성적_연대율: 7.7, 평균사고점: 0.31, 전일성적: 1.0, 출주횟수: 29, 모터_평균착순점: 5.63, 모터_연대율2: 34.7, 모터_연대율3: 45.9, 보트_평균착순점: 6.58, 보트_연대율: 23.9, 중량부과여부: 1, 부과된중량: 10.0, PCA_코스성적_1: -45.46, PCA_코스성적_2: -1.69, 코스_성적: 6.71, 최근3경기_평균: 2.33 선수3: 등급: B1, 성별: 0, 나이: 50+, 최근6회차_평균착순점: 5.25, 최근6회차_평균득점: 5.25, 최근6회차_승률: 6.3, 최근6회차_연대율2: 25.0, 최근6회차_연대율3: 50.0, 최근6회차_평균ST: 0.21, 연간성적_평균착순점: 5.84, 연간성적_연대율: 24.1, 평

In [104]:
output_file_path_grouped_corrected = 'data/train_double_smaller.jsonl'
with open(output_file_path_grouped_corrected, 'w', encoding='utf-8') as f:
    for entry in jsonl_data_grouped:
        json.dump(entry, f, ensure_ascii=False)
        f.write('\n')

In [105]:
data_val = X_val

grouped_data_val = [data_val.iloc[i:i+6] for i in range(6000, len(data_val), 6)]

jsonl_data_grouped_val = [create_jsonl_format_grouped(group) for group in grouped_data_val if len(group) == 6]

output_file_path_grouped_corrected = 'data/val_double_smaller.jsonl'
with open(output_file_path_grouped_corrected, 'w', encoding='utf-8') as f:
    for entry in jsonl_data_grouped_val:
        json.dump(entry, f, ensure_ascii=False)
        f.write('\n')

In [106]:
def create_jsonl_format_grouped_test(group):
    user_input = " ".join([
        f"선수{i+1}: 등급: {row['등급']}, 성별: {row['성별']}, 나이: {row['나이']}, "
        f"최근6회차_평균착순점: {row['최근6회차_평균착순점']:.2f}, 최근6회차_평균득점: {row['최근6회차_평균득점']:.2f}, "
        f"최근6회차_승률: {row['최근6회차_승률']}, 최근6회차_연대율2: {row['최근6회차_연대율2']}, "
        f"최근6회차_연대율3: {row['최근6회차_연대율3']}, 최근6회차_평균ST: {row['최근6회차_평균ST']}, "
        f"연간성적_평균착순점: {row['연간성적_평균착순점']}, 연간성적_연대율: {row['연간성적_연대율']}, "
        f"평균사고점: {row['평균사고점']}, 전일성적: {row['전일성적']}, 출주횟수: {row['출주횟수']}, "
        f"모터_평균착순점: {row['모터_평균착순점']:.2f}, 모터_연대율2: {row['모터_연대율2']}, "
        f"모터_연대율3: {row['모터_연대율3']}, 보트_평균착순점: {row['보트_평균착순점']:.2f}, "
        f"보트_연대율: {row['보트_연대율']}, 중량부과여부: {row['중량부과여부']}, 부과된중량: {row['부과된중량']}, "
        f"PCA_코스성적_1: {row['PCA_코스성적_1']:.2f}, PCA_코스성적_2: {row['PCA_코스성적_2']:.2f}, "
        f"코스_성적: {row['코스_성적']:.2f}, 최근3경기_평균: {row['최근3경기_평균']:.2f}"
        for i, (_, row) in enumerate(group.iterrows())
    ])
    
    jsonl_entry = {
        "messages": [
            {"role": "system", "content": "당신은 유능한 순위예측 전문가입니다."},
            {"role": "user", "content": user_input.strip()},
        ]
    }
    return jsonl_entry

In [107]:
grouped_data_test = [X_test.iloc[i:i+6] for i in range(0, len(X_test), 6)]

In [108]:
jsonl_data_grouped_test = [create_jsonl_format_grouped_test(group) for group in grouped_data_test if len(group) == 6]

In [109]:
jsonl_data_grouped_test[0]['messages']

[{'role': 'system', 'content': '당신은 유능한 순위예측 전문가입니다.'},
 {'role': 'user',
  'content': '선수1: 등급: B1, 성별: 0, 나이: 30-40, 최근6회차_평균착순점: 4.35, 최근6회차_평균득점: 5.24, 최근6회차_승률: 17.6, 최근6회차_연대율2: 35.3, 최근6회차_연대율3: 64.7, 최근6회차_평균ST: 0.2, 연간성적_평균착순점: 10.67, 연간성적_연대율: 0.0, 평균사고점: 0.0, 전일성적: 1.0, 출주횟수: 30, 모터_평균착순점: 5.69, 모터_연대율2: 29.7, 모터_연대율3: 47.9, 보트_평균착순점: 5.17, 보트_연대율: 39.1, 중량부과여부: 0, 부과된중량: 0.0, PCA_코스성적_1: -5.40, PCA_코스성적_2: -27.50, 코스_성적: 64.41, 최근3경기_평균: 2.67 선수2: 등급: B1, 성별: 0, 나이: 50+, 최근6회차_평균착순점: 4.77, 최근6회차_평균득점: 4.89, 최근6회차_승률: 14.3, 최근6회차_연대율2: 35.7, 최근6회차_연대율3: 42.9, 최근6회차_평균ST: 0.33, 연간성적_평균착순점: 10.67, 연간성적_연대율: 0.0, 평균사고점: 0.0, 전일성적: 1.0, 출주횟수: 35, 모터_평균착순점: 5.18, 모터_연대율2: 38.7, 모터_연대율3: 54.0, 보트_평균착순점: 6.40, 보트_연대율: 22.6, 중량부과여부: 1, 부과된중량: 1.0, PCA_코스성적_1: -4.34, PCA_코스성적_2: -5.20, 코스_성적: 48.61, 최근3경기_평균: 1.33 선수3: 등급: B1, 성별: 0, 나이: 40-50, 최근6회차_평균착순점: 5.85, 최근6회차_평균득점: 5.95, 최근6회차_승률: 5.3, 최근6회차_연대율2: 10.5, 최근6회차_연대율3: 36.8, 최근6회차_평균ST: 0.2, 연간성적_평균착순점: 10.67, 연간성적_연대율: 0.0, 평균

In [110]:
output_file_path_grouped = 'data/test_double.jsonl'
with open(output_file_path_grouped, 'w', encoding='utf-8') as f:
    for entry in jsonl_data_grouped_test:
        json.dump(entry, f, ensure_ascii=False)
        f.write('\n')

### batch6으로 하기 위한 프롬프트

In [111]:
def create_jsonl_format_single_player(player):
    # Constructing the user input prompt with relevant player information for a single player
    user_input = (
        f"선수: 등급: {player['등급']}, 성별: {player['성별']}, 나이: {player['나이']}, "
        f"최근6회차_평균착순점: {player['최근6회차_평균착순점']:.2f}, 최근6회차_평균득점: {player['최근6회차_평균득점']:.2f}, "
        f"최근6회차_승률: {player['최근6회차_승률']}, 최근6회차_연대율2: {player['최근6회차_연대율2']}, "
        f"최근6회차_연대율3: {player['최근6회차_연대율3']}, 최근6회차_평균ST: {player['최근6회차_평균ST']}, "
        f"연간성적_평균착순점: {player['연간성적_평균착순점']}, 연간성적_연대율: {player['연간성적_연대율']}, "
        f"평균사고점: {player['평균사고점']}, 전일성적: {player['전일성적']}, 출주횟수: {player['출주횟수']}, "
        f"모터_평균착순점: {player['모터_평균착순점']:.2f}, 모터_연대율2: {player['모터_연대율2']}, "
        f"모터_연대율3: {player['모터_연대율3']}, 보트_평균착순점: {player['보트_평균착순점']:.2f}, "
        f"보트_연대율: {player['보트_연대율']}, 중량부과여부: {player['중량부과여부']}, 부과된중량: {player['부과된중량']}, "
        f"PCA_코스성적_1: {player['PCA_코스성적_1']:.2f}, PCA_코스성적_2: {player['PCA_코스성적_2']:.2f}, "
        f"코스_성적: {player['코스_성적']:.2f}, 최근3경기_평균: {player['최근3경기_평균']:.2f}"
    )

    # Determining if the player won or lost based on 'result'
    if player['result'] == 1:
        assistant_output = "우승"
    else:
        assistant_output = "실패"
    
    # Creating the jsonl entry
    jsonl_entry = {
        "messages": [
            {"role": "system", "content": "당신은 유능한 순위예측 전문가입니다."},
            {"role": "user", "content": user_input.strip()},
            {"role": "assistant", "content": assistant_output.strip()}
        ]
    }
    return jsonl_entry


In [122]:
jsonl_data_single_player_val_1 = [create_jsonl_format_single_player(X_train.iloc[i]) for i in range(66000, len(X_train))]

output_file_path_single_player_corrected = 'data/train_double_smaller_v2.jsonl'

# Writing the JSONL entries to the output file
with open(output_file_path_single_player_corrected, 'w', encoding='utf-8') as f:
    for entry in jsonl_data_single_player_val:
        json.dump(entry, f, ensure_ascii=False)
        f.write('\n')

In [121]:
jsonl_data_single_player_val_2 = [create_jsonl_format_single_player(X_val.iloc[i]) for i in range(6000, len(X_val))]

output_file_path_single_player_corrected = 'data/val_double_smaller_v2.jsonl'

# Writing the JSONL entries to the output file
with open(output_file_path_single_player_corrected, 'w', encoding='utf-8') as f:
    for entry in jsonl_data_single_player_val:
        json.dump(entry, f, ensure_ascii=False)
        f.write('\n')

In [120]:
jsonl_data_single_player_val_3 = [create_jsonl_format_single_player(X_test.iloc[i]) for i in range(len(X_test))]

output_file_path_single_player_corrected = 'data/test_double_smaller_v2.jsonl'

# Writing the JSONL entries to the output file
with open(output_file_path_single_player_corrected, 'w', encoding='utf-8') as f:
    for entry in jsonl_data_single_player_val:
        json.dump(entry, f, ensure_ascii=False)
        f.write('\n')

IndexError: single positional indexer is out-of-bounds