In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Load the training data.
df = pd.read_csv('train.csv')
# Rename columns: 'id' -> 'idx', 'label' -> 'target'.
df = df.rename(columns={'id': 'idx', 'label': 'target'})

# Select the input text and the labels.
X = df['conversation']
y = df['target']

# Stratified train/validation split (20% validation) with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Vectorize: the TF-IDF vocabulary is fitted on the training split only and
# then applied unchanged to the validation split.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# Train the model.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Predict on the validation split and evaluate.
y_pred = model.predict(X_val_vec)

# Accuracy
print('Validation Accuracy:', accuracy_score(y_val, y_pred))

# F1 score (macro average)
f1_macro = f1_score(y_val, y_pred, average='macro')
print('Validation F1 Macro:', f1_macro)

# Per-class F1 score.
# Pin the label order explicitly and print the real class labels: a bare
# positional index is only correct when the labels happen to be 0..n-1.
class_labels = model.classes_
f1_per_class = f1_score(y_val, y_pred, labels=class_labels, average=None)
print('F1 per class:')
for label, score in zip(class_labels, f1_per_class):
    print(f'Class {label}: {score:.4f}')


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, GlobalMaxPool1D
from tensorflow.keras.models import Model

# Prepare the data.
df = pd.read_csv('train.csv')

# Rename columns: 'id' -> 'idx', 'label' -> 'target'.
df = df.rename(columns={'id': 'idx', 'label': 'target'})

X = df['conversation']
y = df['target']
# Fixed random_state so the split is reproducible across kernel restarts and
# matches the seeded split used by the TF-IDF baseline cell.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 1. Text vectorization: map each string to a fixed-length sequence of
#    integer token ids.
VOCAB_SIZE = 20000
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=128
)
# Build the vocabulary from the training split only.
vectorize_layer.adapt(X_train)

# 2. Transformer block
def transformer_encoder(inputs, head_size=64, num_heads=4, ff_dim=128, dropout=0.3):
    """Apply one Transformer-encoder block to a Keras tensor.

    The block is self-attention followed by a two-layer feed-forward network.
    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        inputs: Keras tensor to encode. Its last dimension is read via
            ``inputs.shape[-1]`` and must be statically known.
        head_size: ``key_dim`` passed to ``MultiHeadAttention``.
        num_heads: Number of attention heads.
        ff_dim: Units in the hidden feed-forward ``Dense`` layer.
        dropout: Dropout rate applied after the attention output and after
            the feed-forward output. Defaults to the previously hard-coded 0.3.

    Returns:
        A Keras tensor whose last dimension equals ``inputs.shape[-1]``.
    """
    # Multi-head self-attention: `inputs` is passed as both query and value.
    attn = tf.keras.layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads
    )(inputs, inputs)
    attn = tf.keras.layers.Dropout(dropout)(attn)
    # Residual connection around attention, then normalization.
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attn + inputs)

    # Feed forward: expand to ff_dim, then project back to the input width so
    # the second residual addition is shape-compatible.
    ff = tf.keras.layers.Dense(ff_dim, activation="relu")(x)
    ff = tf.keras.layers.Dense(inputs.shape[-1])(ff)
    ff = tf.keras.layers.Dropout(dropout)(ff)
    return tf.keras.layers.LayerNormalization(epsilon=1e-6)(x + ff)

# 3. Build the model
# The network takes one raw string per example; vectorization happens inside
# the graph, so the model can be fed text directly.
inputs = Input(shape=(1,), dtype=tf.string)
x = vectorize_layer(inputs)
x = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=128)(x)
x = transformer_encoder(x)
# Collapse the sequence axis by taking the per-feature maximum.
x = GlobalMaxPool1D()(x)
# Four softmax outputs, one per class.
outputs = Dense(units=4, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)
# Sparse categorical cross-entropy: targets are given as integer class ids.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# 4. F1-score callback
class F1Callback(tf.keras.callbacks.Callback):
    """Print (and log) the macro-averaged validation F1 after every epoch.

    Args:
        val_inputs: Validation inputs to predict on. When omitted, the
            notebook-level ``X_val`` is used.
        val_targets: True validation labels. When omitted, the notebook-level
            ``y_val`` is used.
    """

    def __init__(self, val_inputs=None, val_targets=None):
        super().__init__()
        self.val_inputs = val_inputs
        self.val_targets = val_targets

    def on_epoch_end(self, epoch, logs=None):
        # Fall back to the notebook globals so `F1Callback()` keeps working.
        val_inputs = X_val if self.val_inputs is None else self.val_inputs
        val_targets = y_val if self.val_targets is None else self.val_targets

        # Use the model this callback is attached to (set by Keras) rather
        # than whatever the global name `model` currently points at.
        # verbose=0 keeps the per-epoch output free of a second progress bar.
        y_pred = self.model.predict(val_inputs, verbose=0).argmax(axis=1)
        f1 = f1_score(val_targets, y_pred, average='macro')

        # Expose the metric through the epoch logs as well as printing it.
        if logs is not None:
            logs['val_f1'] = f1
        print(f'\nVal F1: {f1:.4f}')

# 5. Training
# Train for 5 epochs with mini-batches of 32; Keras evaluates on the
# validation split after each epoch, and the callback adds macro F1.
f1_callback = F1Callback()
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_val, y_val),
    callbacks=[f1_callback],
)


In [None]:
import pandas as pd
import glob

# 1. Collect the path of every .tsv file inside the train folder.
#    glob returns matches in arbitrary order, so sort them to keep the row
#    order of the merged file stable across runs and machines.
file_list = sorted(glob.glob('train/*.tsv'))  # or './train/*.tsv'
if not file_list:
    # Fail early with a clear message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError("No .tsv files matched 'train/*.tsv'")

# 2. Read each file into a DataFrame (tab-separated).
dfs = [pd.read_csv(f, sep='\t') for f in file_list]

# 3. Concatenate the DataFrames, renumbering the index from 0.
merged = pd.concat(dfs, ignore_index=True)

# 4. Save as CSV without the index column.
merged.to_csv('train_merged.csv', index=False, encoding='utf-8')
