In [1]:
# Mount Google Drive inside the Colab runtime at /content/gdrive/.
# The later cells read the dataset and read/write the model artifacts
# through paths under this mount point, so this cell must run first.
from google.colab import drive
drive.mount('/content/gdrive/')


Mounted at /content/gdrive/


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load the used-car listings dataset from the mounted Drive folder.
df = pd.read_csv('/content/gdrive/MyDrive/Simcar-AI-Modeling/data/used_cars_dataset.csv')

# Binary flag: 1 when the listing has an image URL, 0 when it is missing.
df['has_image'] = df['imageUrl'].notna().astype(int)

# Integer-encode the categorical columns. Each fitted encoder is kept in
# `label_encoders` (keyed by column name) instead of being thrown away, so the
# exact category -> code mapping can be inspected through `classes_` or
# re-applied to new rows with `transform`. Without this, the codes fed to the
# model at prediction time can only be guessed.
categorical_columns = ['brand', 'model', 'region']
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

features = ['brand', 'model', 'price', 'productionYear', 'mileage',
            'has_image', 'insuranceHistory', 'inspectionHistory', 'region']
X = df[features]
y = df['isFraud']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The frames returned by train_test_split are selections of X. Assigning
# columns into them can raise pandas' SettingWithCopyWarning, so take explicit
# copies before the in-place column assignments below.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize the numeric columns. The scaler is fitted on the training rows
# only and then applied unchanged to the test rows.
scaler = StandardScaler()
numeric_features = ['price', 'productionYear', 'mileage', 'insuranceHistory', 'inspectionHistory']
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Train the classifier.
model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# Predict hard labels and the probability of the positive class (column 1)
# for the held-out rows.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate on the held-out rows.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

# Rank the input features by the forest's importance scores, highest first.
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)


Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.93      0.86       214
           1       0.90      0.74      0.81       186

    accuracy                           0.84       400
   macro avg       0.85      0.83      0.84       400
weighted avg       0.85      0.84      0.84       400


Confusion Matrix:
[[198  16]
 [ 48 138]]

ROC-AUC Score: 0.9053361471208923

Feature Importance:
             feature  importance
2              price    0.290696
6   insuranceHistory    0.238457
4            mileage    0.162030
5          has_image    0.097595
3     productionYear    0.067970
1              model    0.043315
8             region    0.039536
7  inspectionHistory    0.031793
0              brand    0.028608


In [None]:
import joblib

# Persist the trained classifier and the fitted scaler to Drive.
# The mapping is iterated in insertion order, so the model is written first
# and the scaler second.
artifacts_by_path = {
    '/content/gdrive/MyDrive/Simcar-AI-Modeling/model/car_fraud_model.joblib': model,
    '/content/gdrive/MyDrive/Simcar-AI-Modeling/model/car_fraud_scaler.joblib': scaler,
}
for artifact_path, artifact in artifacts_by_path.items():
    joblib.dump(artifact, artifact_path)

print("\nModel and scaler saved successfully.")


Model and scaler saved successfully.


In [None]:
import joblib
import pandas as pd

# Reload the persisted classifier and scaler from Drive.
# NOTE(review): joblib files are pickle-based; only load artifacts from a trusted location.
model = joblib.load('/content/gdrive/MyDrive/Simcar-AI-Modeling/model/car_fraud_model.joblib')
scaler = joblib.load('/content/gdrive/MyDrive/Simcar-AI-Modeling/model/car_fraud_scaler.joblib')

# Columns that go through the scaler before prediction.
numeric_features = ['price', 'productionYear', 'mileage', 'insuranceHistory', 'inspectionHistory']

# A single hand-written listing to score. The categorical fields are given as
# integer codes; the codes below are assumptions, not looked up from any encoder.
new_data = {
    'brand': 0,  # assumed to be the code for Hyundai
    'model': 0,  # assumed to be the code for Tucson
    'price': 35000,
    'productionYear': 2022,
    'mileage': 15000,
    'has_image': 1,  # listing has an image
    'insuranceHistory': 5,
    'inspectionHistory': 2,
    'region': 0  # assumed to be the code for Seoul
}

# One-row DataFrame whose columns follow the dict's key order.
input_df = pd.DataFrame([new_data])

# Scale the numeric columns with the reloaded scaler and write them back.
scaled_numeric_values = scaler.transform(input_df[numeric_features])
input_df[numeric_features] = scaled_numeric_values

# Column 1 of predict_proba for the single row is reported as the fraud probability.
class_probabilities = model.predict_proba(input_df)
fraud_probability = class_probabilities[0][1]

print(f"사기 확률: {fraud_probability:.2%}")

사기 확률: 95.40%
