<a href="https://colab.research.google.com/github/VictoryBeforeFight/KOSA_ML_Project/blob/main/ver0_%E1%84%8B%E1%85%B0%E1%86%B8_%E1%84%80%E1%85%AA%E1%86%BC%E1%84%80%E1%85%A9_%E1%84%8F%E1%85%B3%E1%86%AF%E1%84%85%E1%85%B5%E1%86%A8%E1%84%8B%E1%85%B2%E1%86%AF_%E1%84%8B%E1%85%A8%E1%84%8E%E1%85%B3%E1%86%A8_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 클릭률 예측 AI

- 일시 : 0613-0614
- 훈련 소요 시간 : 1h 33m
- 결측치 : 연속형 = 0 / 범주형 = 1
- 삭제한 열 : 'ID', 'F11', 'F27', 'F29'
- 인코더 : LabelEncoder
- 언더 샘플링 사용
- RandomForestClassifier : n_estimators=100
- F1 Score : 0.6744817443248701

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [None]:
# Read the training data (parquet) from the Colab Drive path below.
train_path = '/content/drive/MyDrive/Colab Notebooks/머신러닝 프로젝트/광고 클릭/train.parquet'
train = pd.read_parquet(train_path)

In [None]:
# Count missing values per column and print only the columns that have any.
missing_values = train.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

F01     1234711
F02     1234711
F03    10543986
F04     5742331
F05     1234711
F10     1234711
F11     2955564
F12     1234711
F15    10543986
F18     7324999
F19     2588853
F20    10543986
F24     8994270
F26    10543986
F27    11063877
F29    11063877
F32      251142
F33     2588853
F34     1234711
F36     7324999
F38      800058
dtype: int64


In [None]:
# Remove the 'ID' column and three features (F11, F27, F29) that contain
# missing values. Note that this drops whole columns, not individual rows,
# and it mutates `train` in place.
dropped_columns = ['ID', 'F11', 'F27', 'F29']
train.drop(labels=dropped_columns, axis=1, inplace=True)

In [None]:
# Impute missing values: object-dtype (categorical) columns get the
# placeholder 1, every other column gets 0.
#
# The result is assigned back to the column instead of calling
# `train[column].fillna(..., inplace=True)`. That chained in-place form
# operates on an intermediate Series; recent pandas versions emit a
# FutureWarning for it, and under Copy-on-Write (the default from pandas 3.0)
# it no longer modifies `train` at all.
for column in train.columns:
    fill_value = 1 if train[column].dtype == 'object' else 0
    train[column] = train[column].fillna(fill_value)

In [None]:
# Standardize the numeric feature columns (zero mean, unit variance).
#
# The target column 'Click' is explicitly excluded: it is numeric, so
# selecting every numeric column would standardize the class labels too and
# turn them into arbitrary floats that have to be re-binarized later.
# `errors='ignore'` keeps the cell working if 'Click' is not numeric/present.
#
# NOTE(review): the scaler is fitted on the whole frame before any train/test
# split, so held-out rows contribute to the scaling statistics. Consider
# fitting on the training split only.
sc = StandardScaler()
numeric_features = (
    train.select_dtypes(include=[np.number]).columns.drop('Click', errors='ignore')
)
train[numeric_features] = sc.fit_transform(train[numeric_features])

In [None]:
# Integer-encode every object-dtype (categorical) column.
categorical_columns = train.select_dtypes(include=['object']).columns

# Cast to str first so each column holds a single, comparable type
# (the imputation step fills object columns with the integer 1, so values
# may otherwise be a mix of str and int).
train[categorical_columns] = train[categorical_columns].astype(str)

# Fit one LabelEncoder per column and keep every fitted encoder, keyed by
# column name. Without this the encoders are overwritten on each iteration
# and the learned category -> integer mapping cannot be re-applied to other
# data (e.g. a test set) with `label_encoders[col].transform(...)`.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    label_encoders[col] = le

In [None]:
# Separate the target vector ('Click') from the feature matrix (all other columns).
y = train['Click']
X = train.drop(columns='Click')

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Map the target to {0, 1}: the smallest value becomes 0, anything above it 1.
# Thresholding on the minimum (rather than the median) gives the same labels
# when the positive class is the minority, and stays correct when positives
# make up half or more of the rows -- in that case the median equals the
# positive value and `y > y.median()` would collapse every label to 0.
y = (y > y.min()).astype(int)

# Random under-sampling with a fixed seed for reproducibility.
# NOTE(review): resampling happens before the train/test split, so the
# held-out split later drawn from X_resampled is balanced as well and its
# metrics do not reflect the original class distribution. Consider splitting
# first and resampling only the training portion.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

In [None]:
# Hold out 20% of the resampled rows for evaluation; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build and fit a 100-tree random forest.
# n_jobs=-1 builds the trees in parallel on all available cores. With a fixed
# random_state the fitted forest should be the same as in a single-threaded
# run; only wall-clock training time changes.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out split.
y_pred = model.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 summary for the held-out split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.67      0.68      0.67   1114620
           1       0.67      0.66      0.67   1113324

    accuracy                           0.67   2227944
   macro avg       0.67      0.67      0.67   2227944
weighted avg       0.67      0.67      0.67   2227944



In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the held-out split.
# The probabilities are stored under their own name (`y_proba`) instead of
# overwriting `y_pred`: `y_pred` holds the hard class labels used by the
# classification report, and replacing it with continuous scores would make
# that cell fail if it is re-run after this one.
y_true = y_test  # ground-truth labels
# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (label 1, given the 0/1 target).
y_proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_true, y_proba)
print(f'AUC: {auc:.4f}')

AUC: 0.7357
