<a href="https://colab.research.google.com/github/VictoryBeforeFight/KOSA_ML_Project/blob/main/1st_no.4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Extra imports needed by this cell. classification_report in particular must
# be available here; it was previously only imported by a later cell.
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score

# Google Drive folder holding the raw data and the saved splits.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/머신러닝 프로젝트/광고 클릭'

train = pd.read_parquet(f'{DATA_DIR}/train.parquet')

# Report the columns that contain missing values.
missing_values = train.isnull().sum()
print(missing_values[missing_values > 0])

# Drop the ID column together with F11, F27 and F28.
train.drop(columns=['ID', 'F11', 'F27', 'F28'], inplace=True)

# Impute missing values: 1 for object columns, 0 for everything else.
# The result is assigned back to the frame because a chained
# `train[col].fillna(..., inplace=True)` operates on a temporary object and
# is not guaranteed to modify `train` (it does nothing under copy-on-write).
for column in train.columns:
    fill_value = 1 if train[column].dtype == 'object' else 0
    train[column] = train[column].fillna(fill_value)

# Standardise the numeric feature columns. The target column is left out so
# that it keeps its original values instead of being z-scored.
sc = StandardScaler()
numeric_features = train.select_dtypes(include=[np.number]).columns
numeric_features = numeric_features[numeric_features != 'Click']
train[numeric_features] = sc.fit_transform(train[numeric_features])

categorical_columns = train.select_dtypes(include=['object']).columns

# Cast to str so every categorical column holds a single, sortable type.
train[categorical_columns] = train[categorical_columns].astype(str)

# Label-encode each categorical column with its own encoder.
for col in categorical_columns:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])

# Separate features and target.
X = train.drop('Click', axis=1)
y = train['Click']

# Binarise the target around its median. The median threshold is unaffected
# by the positive rescaling the target used to receive, so the labels are the
# same as before.
# NOTE(review): presumably Click is already 0/1 - confirm whether this step
# is still needed.
y = (y > y.median()).astype(int)

# Undersample the majority class.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

# Persist the splits. A Series has no to_parquet method, so the targets are
# written as single-column frames.
X_train.to_parquet(f'{DATA_DIR}/X_train.parquet')
X_test.to_parquet(f'{DATA_DIR}/X_test.parquet')
y_train.to_frame().to_parquet(f'{DATA_DIR}/y_train.parquet')
y_test.to_frame().to_parquet(f'{DATA_DIR}/y_test.parquet')

# Initialise and fit the model.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Hard class predictions, used by the classification report.
y_pred = model.predict(X_test)

# Classification report.
report = classification_report(y_test, y_pred)
print(report)

# AUC from the positive-class probabilities. They get their own name so that
# y_pred keeps holding class labels for the cells that follow.
y_true = y_test  # ground-truth labels
y_proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_true, y_proba)
print(f'AUC: {auc:.4f}')

In [None]:
# Show how many rows carry each distinct value of the target y.
class_counts = y.value_counts()
print(class_counts)

In [None]:
# Imported here so this cell does not depend on a later cell having been run.
from sklearn.metrics import confusion_matrix

# Build the matrix from hard class predictions taken straight from the model:
# the global y_pred may hold probabilities at this point, and
# confusion_matrix does not accept continuous scores.
cm = confusion_matrix(y_test, model.predict(X_test))
print(cm)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score

# Hard class predictions, recomputed here so the report and the confusion
# matrix never see a stale y_pred that holds probabilities.
y_pred = model.predict(X_test)

# Classification report.
report = classification_report(y_test, y_pred)
print(report)

# AUC from the positive-class probabilities. They are kept under a separate
# name so that y_pred still holds class labels below.
y_true = y_test  # ground-truth labels
y_proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_true, y_proba)
print(f'AUC: {auc:.4f}')

# Confusion matrix (requires class labels, not probabilities).
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Feature importance via the project's helper module.
from feature_importance import feature_importance
feature_importance(model, X_train, y_train)

In [None]:
# model_evaluation.py

import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_model(model, X_test, y_test):
    """Print and return a classification report, AUC and confusion matrix.

    NOTE(review): assumes ``model.predict`` returns positive-class
    probabilities, as a LightGBM ``Booster`` trained with a binary objective
    does - confirm against the caller.

    Args:
        model: Fitted model exposing ``predict(X)``.
        X_test: Feature matrix to score.
        y_test: True binary labels for ``X_test``.

    Returns:
        Tuple ``(report, auc, cm)``: the text report from
        ``classification_report``, the ROC AUC, and the confusion matrix.
    """
    # Local import: the import block of this module does not bring numpy in.
    import numpy as np

    # Score once and reuse the result for every metric.
    y_pred_proba = np.asarray(model.predict(X_test))
    if y_pred_proba.ndim == 2:
        # Two-dimensional output: keep the positive-class column only.
        y_pred_proba = y_pred_proba[:, 1]

    # Threshold the probabilities at 0.5 to obtain class labels.
    y_pred_binary = (y_pred_proba >= 0.5).astype(int)

    # Classification report.
    report = classification_report(y_test, y_pred_binary)
    print(report)

    # AUC is computed on the probabilities, not on the thresholded labels.
    auc = roc_auc_score(y_test, y_pred_proba)
    print(f'AUC: {auc:.4f}')

    # Confusion matrix.
    cm = confusion_matrix(y_test, y_pred_binary)
    print(cm)

    return report, auc, cm

def plot_feature_importance(model, X_train):
    """Draw a bar chart of the model's feature importances.

    Args:
        model: Fitted model exposing ``feature_importance()``.
        X_train: DataFrame whose columns name the features.

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted by
        importance in descending order.
    """
    # Pair each column name with its importance score, highest first.
    scores = {
        'feature': X_train.columns,
        'importance': model.feature_importance(),
    }
    ranking = pd.DataFrame(scores)
    ranking = ranking.sort_values(by='importance', ascending=False)

    # One horizontal bar per feature.
    plt.figure(figsize=(10, 8))
    sns.barplot(data=ranking, x='importance', y='feature')
    plt.title('Feature Importance')
    plt.show()

    return ranking
