<a href="https://colab.research.google.com/github/VictoryBeforeFight/KOSA_ML_Project/blob/main/ver1_%E1%84%8B%E1%85%B0%E1%86%B8_%E1%84%80%E1%85%AA%E1%86%BC%E1%84%80%E1%85%A9_%E1%84%8F%E1%85%B3%E1%86%AF%E1%84%85%E1%85%B5%E1%86%A8%E1%84%85%E1%85%B2%E1%86%AF_%E1%84%8B%E1%85%A8%E1%84%8E%E1%85%B3%E1%86%A8_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 웹 광고 클릭률 예측 AI

- 일시 : 0614-0614
- 훈련 소요 시간 : 4h 17m
- 결측치 : 연속형 = 중앙값 / 범주형 = -1
- 삭제한 열 : 'ID', 'F11', 'F27', 'F29'
- 인코더 : X
- 언더 샘플링 사용
- CatBoostClassifier : iterations=1000, depth=6, learning_rate=0.1
- F1 Score : 0.71
- AUC : 0.7807

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score

In [None]:
# Load the raw training set from Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive; the mount
# step is not shown in this notebook.
train_parquet_path = '/content/drive/MyDrive/Colab Notebooks/머신러닝 프로젝트/광고 클릭/train.parquet'
train = pd.read_parquet(train_parquet_path)

In [None]:
# Drop the 'ID' column together with features F11, F27 and F29.
# NOTE(review): the original note labels this step "remove missing values", so
# these are presumably high-missing columns (ID being an identifier) - confirm.
columns_to_drop = ['ID', 'F11', 'F27', 'F29']
train.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Checkpoint: persist the frame with 'ID', 'F11', 'F27' and 'F29' removed.
# The path is relative, so the file lands in the current working directory.
checkpoint_path = 'drop_id_11_27_29.parquet'
train.to_parquet(checkpoint_path)

In [None]:
# Impute missing values column by column:
#   - object (categorical) columns -> sentinel value -1
#   - every other column           -> that column's median
# This runs before the target is split off, so every column of `train` is visited.
#
# The result is assigned back to the frame instead of calling
# `train[column].fillna(..., inplace=True)`. The in-place form is chained
# assignment on a temporary Series: pandas 2.2 emits a FutureWarning for it, and
# under Copy-on-Write (the default from pandas 3.0) it leaves `train` unchanged.
for column in train.columns:
    if train[column].dtype == 'object':
        # Categorical: mark missing entries with -1.
        train[column] = train[column].fillna(-1)
    else:
        # Continuous: replace missing entries with the column median.
        train[column] = train[column].fillna(train[column].median())

In [None]:
# Separate the target ('Click') from the feature matrix.
target_column = 'Click'
y = train[target_column]
X = train.drop(columns=target_column)

In [None]:
# Identify categorical (object-dtype) feature columns and their positional
# indices in X; the indices are what gets passed to CatBoost as `cat_features`.
categorical_columns = X.select_dtypes(include=['object']).columns
categorical_name_set = set(categorical_columns)
categorical_features_indices = [
    position
    for position, name in enumerate(X.columns)
    if name in categorical_name_set
]

In [None]:
# Random under-sampling with the sampler's default strategy and a fixed seed
# so the drawn subset is reproducible.
rus = RandomUnderSampler(random_state=42)
resampled_pair = rus.fit_resample(X, y)
X_resampled, y_resampled = resampled_pair

In [None]:
# Hold out 20% of the resampled data for evaluation (fixed seed).
# Note: the split is taken from the already under-sampled data, so the hold-out
# set is under-sampled as well and the metrics below are measured on it, not on
# the original class distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Configure and train the CatBoost classifier.
# The recorded run of this cell took about 4h 17m for 1000 iterations.
catboost_params = dict(
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    loss_function='Logloss',
    random_seed=42,
)
model = CatBoostClassifier(**catboost_params)
# `verbose=100` prints a progress line every 100 iterations; fit() is left as
# the final expression so the fitted model is echoed as the cell output.
model.fit(X_train, y_train, cat_features=categorical_features_indices, verbose=100)

0:	learn: 0.6787517	total: 10.2s	remaining: 2h 49m 55s
100:	learn: 0.5792618	total: 23m 47s	remaining: 3h 31m 42s
200:	learn: 0.5744319	total: 48m 46s	remaining: 3h 13m 54s
300:	learn: 0.5719345	total: 1h 15m 18s	remaining: 2h 54m 52s
400:	learn: 0.5703033	total: 1h 40m 24s	remaining: 2h 29m 59s
500:	learn: 0.5691203	total: 2h 6m 18s	remaining: 2h 5m 48s
600:	learn: 0.5681432	total: 2h 32m 6s	remaining: 1h 40m 58s
700:	learn: 0.5673783	total: 2h 58m 30s	remaining: 1h 16m 8s
800:	learn: 0.5667088	total: 3h 24m 41s	remaining: 50m 51s
900:	learn: 0.5661483	total: 3h 50m 58s	remaining: 25m 22s
999:	learn: 0.5656657	total: 4h 17m 20s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7e0a11440520>

In [None]:
# Predict hard class labels for the hold-out set ('Class' is CatBoost's default
# prediction type, spelled out here for clarity).
y_pred = model.predict(X_test, prediction_type='Class')

In [None]:
# Build and print the per-class precision / recall / F1 report for the
# hold-out predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.71      0.71      0.71   1114620
           1       0.71      0.71      0.71   1113324

    accuracy                           0.71   2227944
   macro avg       0.71      0.71      0.71   2227944
weighted avg       0.71      0.71      0.71   2227944



In [None]:
# Compute ROC AUC on the hold-out set from the predicted probability of the
# positive class. `roc_auc_score` is already imported in the notebook's import
# cell, so it is not re-imported here.
y_true = y_test  # ground-truth target values

# The probabilities get their own name so that `y_pred` keeps holding the class
# labels from the prediction cell. Overwriting it with probabilities would make
# the classification-report cell fail if it were re-run after this one.
y_proba = model.predict_proba(X_test)[:, 1]

auc = roc_auc_score(y_true, y_proba)
print(f'AUC: {auc:.4f}')

AUC: 0.7807
