<a href="https://colab.research.google.com/github/VictoryBeforeFight/KOSA_ML_Project/blob/main/3%EC%B0%A8_%EC%A4%91%EC%95%99_%EC%B5%9C%EB%B9%88_%EB%AA%A8%EB%8D%B8%EB%A7%81.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import lightgbm as lgb
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the training data
train = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/머신러닝 프로젝트/광고 클릭/train.parquet')

# Drop the identifier column and the F11 / F27 / F29 feature columns.
# NOTE(review): presumably these features are dropped because they are mostly
# missing -- confirm against the exploratory analysis.
train.drop(columns=['ID', 'F11', 'F27', 'F29'], inplace=True)

# Impute missing values column by column.
# The result is assigned back to the frame instead of calling
# `train[column].fillna(..., inplace=True)`: that chained form operates on a
# temporary object, is deprecated by pandas, and leaves the frame unchanged
# when copy-on-write is enabled.
for column in train.columns:
    if not train[column].isna().any():
        continue  # nothing to fill in this column

    if train[column].dtype == 'object':
        # Categorical columns: fill with the most frequent value (mode)
        modes = train[column].mode()
        if modes.empty:
            continue  # column is entirely missing, so there is no mode to use
        fill_value = modes[0]
    else:
        # Numeric columns: fill with the median
        fill_value = train[column].median()

    train[column] = train[column].fillna(fill_value)

# Standardize the numeric feature columns.
# The target column 'Click' is excluded from scaling: standardizing it would
# turn the class labels into non-integer floats, which the samplers and a
# binary classifier cannot treat as classes.
# NOTE(review): the scaler is fitted on the full data set before any
# train/test split, so test-set statistics leak into the transform.
sc = StandardScaler()
numeric_features = train.select_dtypes(include=[np.number]).columns.drop('Click', errors='ignore')
if len(numeric_features) > 0:
    train[numeric_features] = sc.fit_transform(train[numeric_features])

categorical_columns = train.select_dtypes(include=['object']).columns

# Cast categorical columns to str so each column holds a single, sortable type
train[categorical_columns] = train[categorical_columns].astype(str)

# Label-encode the categorical columns.
# The fitted encoders are kept per column so the same mapping can be reused
# (or inverted) later instead of being lost after each loop iteration.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    label_encoders[col] = le

# Reduce memory usage: downcast float64 columns to float32 in one pass
float64_columns = train.select_dtypes(include=['float64']).columns
train = train.astype({col: 'float32' for col in float64_columns})

# Separate the features from the target variable
X = train.drop('Click', axis=1)
y = train['Click']

# Hold out the test set BEFORE any resampling, so it keeps the original class
# distribution and never contains resampled or synthetic rows. Stratifying
# keeps the Click ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Under-sample the majority class (training partition only)
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Over-sample with SMOTE (training partition only).
# NOTE(review): with the default sampling strategies the under-sampler has
# already balanced the classes, so SMOTE is expected to add no rows here --
# confirm whether a custom sampling_strategy was intended.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_resampled, y_resampled)

# The resampled data is what the model trains on
X_train, y_train = X_resampled, y_resampled

# Persist the partitions. The targets are Series, which have no `to_parquet`
# method, so they are wrapped in single-column DataFrames first.
X_train.to_parquet('X_train.parquet')
X_test.to_parquet('X_test.parquet')
y_train.to_frame().to_parquet('y_train.parquet')
y_test.to_frame().to_parquet('y_test.parquet')

# Training parameters.
# NOTE(review): `params` was referenced but never defined in this notebook;
# these are minimal settings for a binary classifier (the predictions below
# are thresholded at 0.5). Replace with the tuned hyperparameters if they
# exist elsewhere.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'seed': 42,
}

# Build the LightGBM datasets; the label-encoded columns are declared categorical
categorical_feature_names = categorical_columns.tolist()
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_feature_names)
test_data = lgb.Dataset(
    X_test,
    label=y_test,
    categorical_feature=categorical_feature_names,
    reference=train_data,
)

# Train the model.
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments are no longer
# accepted by `lgb.train` in LightGBM 4.x.
# NOTE(review): early stopping monitors the held-out test set, so the metrics
# reported on that same set afterwards are optimistic.
model = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[train_data, test_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
    ],
)

# Save the final model
model.save_model('lightgbm_model.txt')

# Predict click probabilities and threshold them at 0.5 to get class labels
y_pred = model.predict(X_test)
y_pred_binary = (y_pred >= 0.5).astype(int)


In [None]:
# Build the per-class precision / recall / F1 summary and print it under a heading
report = classification_report(y_true=y_test, y_pred=y_pred_binary)
print("Classification Report:", report, sep="\n")

In [None]:
# Compute the confusion matrix (true labels vs. predicted labels) and print it under a heading
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_binary)
print("Confusion Matrix:", cm, sep="\n")