## 테러 사건의 고위험성 분류 분석  
## Classification analysis of high-risk terrorism incidents


### 전처리된 테러 데이터 로드 및 분류 문제 정의  
### Load the cleaned terrorism dataset and define the classification task


In [3]:
import pandas as pd

df = pd.read_parquet("data/interim/gtd_clean.parquet")
df.shape


(139872, 20)

### 인명 피해 규모를 기준으로 한 이진 분류 변수 생성  
### Create a binary target variable based on casualty severity


In [5]:
# 타깃 정의: 사망+부상자 10명 이상이면 고위험 사건
df["high_casualty"] = (df["casualty"] >= 10).astype(int)

df["high_casualty"].value_counts(normalize=True)


high_casualty
0    0.871954
1    0.128046
Name: proportion, dtype: float64

### 분류 모델 학습을 위한 입력 변수 선택  
### Select input features and split the dataset into training and test sets


In [7]:
from sklearn.model_selection import train_test_split

features = [
    "region_txt",
    "attacktype1_txt",
    "weaptype1_txt",
    "targtype1_txt"
]

X = df[features].copy()
y = df["high_casualty"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape


((111897, 4), (27975, 4))

### 범주형 변수 인코딩 및 로지스틱 회귀 모델 구축  
### Encode categorical variables and build a logistic regression model


In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), features)
    ]
)

model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", LogisticRegression(max_iter=1000))
])

model


### 로지스틱 회귀 모델 학습 및 예측 수행  
### Train the logistic regression model and generate predictions


In [13]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


### 분류 모델 성능 평가  
### Evaluate classification performance using standard metrics


In [15]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred))
print()
print(classification_report(y_test, y_pred, digits=3))


[[24379    14]
 [ 3572    10]]

              precision    recall  f1-score   support

           0      0.872     0.999     0.931     24393
           1      0.417     0.003     0.006      3582

    accuracy                          0.872     27975
   macro avg      0.644     0.501     0.469     27975
weighted avg      0.814     0.872     0.813     27975



### 분류 결과 요약  
### Summary of classification results


In [17]:
result_summary = pd.DataFrame({
    "Metric": ["Accuracy", "Positive Class Share"],
    "Value": [
        (y_test == y_pred).mean(),
        y.mean()
    ]
})

result_summary


Unnamed: 0,Metric,Value
0,Accuracy,0.871814
1,Positive Class Share,0.128046
