In [4]:
# Mount Google Drive into the Colab filesystem so that later cells can read and
# write files under /content/gdrive/ (dataset CSV, saved model artifacts).
from google.colab import drive
drive.mount('/content/gdrive/')


Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [10]:
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load the used-car listings dataset from Google Drive
df = pd.read_csv('/content/gdrive/MyDrive/Simcar-AI-Modeling/data/new_used_cars_dataset.csv')

# Binary flag: 1 when the listing has an image URL, 0 when the URL is missing
df['has_image'] = df['imageUrl'].notna().astype(int)

# Integer-encode the categorical columns. Each fitted encoder is kept (instead of
# being discarded) because it holds the label -> code mapping in `classes_`, which
# is required to encode new listings consistently at inference time.
categorical_columns = ['brand', 'model', 'region']
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

features = ['brand', 'model', 'price', 'productionYear', 'mileage',
            'has_image', 'insuranceHistory', 'inspectionHistory', 'region']
X = df[features]
y = df['isFraud']

# Train/test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take explicit copies before writing scaled values back: the split results are
# subsets of X, and assigning into them can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize numeric columns; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
numeric_features = ['price', 'productionYear', 'mileage', 'insuranceHistory', 'inspectionHistory']
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Train the classifier
model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# Persist every artifact needed for inference: the model, the scaler, and the
# categorical encoders (a dict keyed by column name).
joblib.dump(model, '/content/gdrive/MyDrive/Simcar-AI-Modeling/model/car_fraud_model.joblib')
joblib.dump(scaler, '/content/gdrive/MyDrive/Simcar-AI-Modeling/model/car_fraud_scaler.joblib')
joblib.dump(label_encoders, '/content/gdrive/MyDrive/Simcar-AI-Modeling/model/car_fraud_label_encoders.joblib')

# Score the held-out split: hard class labels plus the probability of class 1
y_pred = model.predict(X_test)
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Compute the evaluation metrics up front, then report them
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

print("\nClassification Report:")
print(report_text)

print("\nConfusion Matrix:")
print(conf_matrix)

print("\nROC-AUC Score:", auc_score)

# Pair each input feature with the forest's importance score, highest first
importance_table = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)



Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.95      0.86       233
           1       0.90      0.65      0.76       167

    accuracy                           0.82       400
   macro avg       0.85      0.80      0.81       400
weighted avg       0.84      0.82      0.82       400


Confusion Matrix:
[[221  12]
 [ 58 109]]

ROC-AUC Score: 0.8884377168409962

Feature Importance:
             feature  importance
2              price    0.274198
6   insuranceHistory    0.216167
4            mileage    0.152422
5          has_image    0.091659
3     productionYear    0.086591
1              model    0.051359
8             region    0.048018
0              brand    0.043988
7  inspectionHistory    0.035599


In [None]:
import joblib

# Write the fitted model and scaler to Google Drive, one joblib file each
artifacts = [
    (model, '/content/gdrive/MyDrive/Simcar-AI-Modeling/model/car_fraud_model.joblib'),
    (scaler, '/content/gdrive/MyDrive/Simcar-AI-Modeling/model/car_fraud_scaler.joblib'),
]
for fitted_object, target_path in artifacts:
    joblib.dump(fitted_object, target_path)

print("\nModel and scaler saved successfully.")


Model and scaler saved successfully.


In [15]:
import joblib
import pandas as pd

# Restore the persisted classifier and scaler from Google Drive
model = joblib.load('/content/gdrive/MyDrive/Simcar-AI-Modeling/model/car_fraud_model.joblib')
scaler = joblib.load('/content/gdrive/MyDrive/Simcar-AI-Modeling/model/car_fraud_scaler.joblib')

# Hand-built listings to score (one expected fraud, one expected normal).
# Categorical fields are supplied as integer codes; the labels in the trailing
# comments are the author's notes — NOTE(review): confirm they match the
# encoding used at training time.
likely_fraud_case = dict(
    brand=0,  # genesis g90 (expected: high fraud likelihood)
    model=0,
    price=105509788,
    productionYear=2018,
    mileage=49068,
    has_image=0,
    insuranceHistory=6,
    inspectionHistory=0,
    region=1,  # Ulsan
)
likely_normal_case = dict(
    brand=1,  # kia sorento (expected: likely legitimate)
    model=1,
    price=40918420,
    productionYear=2018,
    mileage=22142,
    has_image=1,
    insuranceHistory=1,
    inspectionHistory=0,
    region=2,  # Daegu
)
test_cases = [likely_fraud_case, likely_normal_case]

# One row per test case
input_df = pd.DataFrame(test_cases)

# Apply the loaded scaler to the numeric columns
numeric_features = ['price', 'productionYear', 'mileage', 'insuranceHistory', 'inspectionHistory']
scaled_numeric = scaler.transform(input_df[numeric_features])
input_df[numeric_features] = scaled_numeric

# Column 1 of predict_proba is the probability of class 1 (fraud)
class_probabilities = model.predict_proba(input_df)
fraud_probabilities = class_probabilities[:, 1]

# Report each case's fraud probability as a percentage
for i in range(len(fraud_probabilities)):
    prob = fraud_probabilities[i]
    print(f"테스트 케이스 {i+1} 사기 확률: {prob:.2%}")


테스트 케이스 1 사기 확률: 90.60%
