In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Load the training data and rename columns ('id' -> 'idx', 'label' -> 'target').
df = pd.read_csv('train.csv').rename(columns={'id': 'idx', 'label': 'target'})

# Select the model input ('conversation') and the label column ('target').
X, y = df['conversation'], df['target']

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified -- consider stratify=y if the
# classes are imbalanced; confirm against the label distribution.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Vectorize: TF-IDF with the vocabulary capped at 10,000 features. The
# vectorizer is fitted on the training split only and then applied, unchanged,
# to the validation split.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# Train the model (fit() returns the fitted estimator itself).
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Predict on the validation split.
y_pred = model.predict(X_val_vec)

# Accuracy on the validation split.
print('Validation Accuracy:', accuracy_score(y_true=y_val, y_pred=y_pred))

# Macro-averaged F1 score on the validation split.
f1_macro = f1_score(y_true=y_val, y_pred=y_pred, average='macro')
print('Validation F1 Macro:', f1_macro)

# Per-class F1 scores.
# With average=None, f1_score returns one score per label in sorted label
# order, so a bare enumerate() index only names the right class when the
# labels happen to be exactly 0..n-1. Build the label list explicitly (the
# sorted set of labels seen in y_val or y_pred, which is what f1_score uses by
# default), pass it to f1_score, and print each score next to its real label.
class_labels = sorted(set(y_val) | set(y_pred))
f1_per_class = f1_score(y_val, y_pred, labels=class_labels, average=None)
print('F1 per class:')
for class_label, score in zip(class_labels, f1_per_class):
    print(f'Class {class_label}: {score:.4f}')
