# 데이터 2차 전처리

## 호출

In [None]:
import gdown

# Google Drive file IDs, keyed by the local filename each file is saved under.
file_ids = {"data_visualization.csv": "1-1Wchi6E5S92U5-iSeOYdqHF8ucRx5tg"}

# Download every listed file through Drive's direct-download URL.
for filename in file_ids:
    file_id = file_ids[filename]
    url = "https://drive.google.com/uc?export=download&id=" + file_id
    gdown.download(url, filename, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1-1Wchi6E5S92U5-iSeOYdqHF8ucRx5tg
From (redirected): https://drive.google.com/uc?export=download&id=1-1Wchi6E5S92U5-iSeOYdqHF8ucRx5tg&confirm=t&uuid=45ce2a73-95e1-4773-a267-5b333619c3c7
To: /content/data_visualization.csv
100%|██████████| 1.46G/1.46G [00:09<00:00, 152MB/s]


In [None]:
import pandas as pd

# Load the downloaded CSV into a DataFrame and print its column/dtype summary.
# low_memory=False asks pandas not to parse the file in chunks, which avoids
# mixed-type inference within a column (see the pandas read_csv documentation).
df = pd.read_csv('data_visualization.csv', low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6170813 entries, 0 to 6170812
Data columns (total 21 columns):
 #   Column                Dtype 
---  ------                ----- 
 0   ID                    int64 
 1   Case Number           object
 2   Date                  object
 3   Block                 object
 4   IUCR                  object
 5   Primary Type          object
 6   Description           object
 7   Location Description  object
 8   Arrest                bool  
 9   Domestic              bool  
 10  Beat                  int64 
 11  District              object
 12  Ward                  object
 13  FBI Code              object
 14  X Coordinate          object
 15  Y Coordinate          object
 16  Year                  int64 
 17  Latitude              object
 18  Longitude             object
 19  Location              object
 20  Community Area Code   int64 
dtypes: bool(2), int64(4), object(15)
memory usage: 906.3+ MB


## 데이터 타입 정돈
- Date : 시간 정보니까 datetime으로
- XY Coordinate : 원래 숫자였으니 숫자로
- year : 잘못 적힌 41년을 2001년으로
- ward, community area : 수량적 의미가 없어 카테고리로 변환

→ 결측치 부활

In [None]:
# Convert Date to datetime (currently disabled)
# df['Date'] = pd.to_datetime(df['Date'])


  df['Date'] = pd.to_datetime(df['Date'])


In [None]:
# Split the Date column into year/month/day/etc. components for modelling (currently disabled)
# df['year'] = df['Date'].dt.year
# df['month'] = df['Date'].dt.month
# df['day'] = df['Date'].dt.day
# df['weekday'] = df['Date'].dt.weekday
# df['hour'] = df['Date'].dt.hour

In [None]:
# Remove 'unknown' entries from the numeric columns: every value that cannot
# be parsed as a number is coerced to NaN.
for col in ('X Coordinate', 'Y Coordinate', 'Ward', 'Community Area Code'):
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [None]:
# Turn 'unknown' placeholders into missing values so the columns become
# usable in numeric computations.

import numpy as np

location_cols = ['Latitude', 'Longitude', 'Beat', 'District', 'Ward', 'Community Area Code']

# Step 1: replace the placeholder in every column.
for col in location_cols:
    cleaned = df[col].replace('unknown', np.nan)
    if col == 'Community Area Code':
        # For this column a code of 0 is treated as missing as well.
        cleaned = cleaned.replace(0, np.nan)
    df[col] = cleaned

# Step 2: cast every cleaned column to float.
for col in location_cols:
    df[col] = df[col].astype('float')

In [None]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0,0
ID,0
Case Number,0
Date,0
Block,0
IUCR,0
Primary Type,0
Description,0
Location Description,0
Arrest,0
Domestic,0


In [None]:
# Outliers (correcting wrong values): rows with a negative X coordinate get
# all four coordinate fields blanked out.
negative_x = df['X Coordinate'] < 0
coordinate_cols = ['X Coordinate', 'Y Coordinate', 'Longitude', 'Latitude']
df.loc[negative_x, coordinate_cols] = np.nan

In [None]:
# Re-count the missing values per column after blanking the invalid coordinates.
df.isna().sum()

Unnamed: 0,0
ID,0
Case Number,0
Date,0
Block,0
IUCR,0
Primary Type,0
Description,0
Location Description,0
Arrest,0
Domestic,0


## 결측치 문제
- 6개 칼럼에 결측치 존재
- 결측치가 3000개 가까이 있는데 어떻게 할 것인가

  □ 결측치는 위치 정보에만 집중돼 있음
        △ 데이터 크기가 617만 개로 충분히 크니 모델 왜곡을 막기 위해 삭제하고 진행

- 카테고리로 지정한 칼럼 외의 문자형 데이터는 추후 독립변수로 쓸 때 레이블로 변환 → 모델 특성에 따라 인코딩 방법을 차별화

# 머신러닝 절차

In [None]:
# Add the target column for a model that predicts whether a crime is a
# narcotics offence.

narcotics_types = ['NARCOTICS', 'OTHER NARCOTIC VIOLATION']
df['Is_narcotics'] = df['Primary Type'].isin(narcotics_types)

df['Is_narcotics']

Unnamed: 0,Is_narcotics
0,False
1,False
2,False
3,False
4,False
...,...
6170808,False
6170809,False
6170810,False
6170811,False


## 단독 모델 진행

### 로지스틱 회귀 단독 모델 진행

In [None]:
# Keep only the columns needed for modelling.
# Dropped: ID and Case Number (unrelated to the crime details); FBI Code, Primary Type and IUCR
# (direct hints at the answer); Location (merely a derived column); and similar columns.
# Planar X/Y coordinates are used instead of latitude/longitude: they avoid spatial distortion and
# suit machine learning better, so the collinear Latitude/Longitude columns are dropped.
# One-hot encoding is the better fit for logistic regression, so Block (about 50,000 categories,
# impractical to one-hot encode) is dropped as well.

# DataFrame.drop returns a new frame and leaves df untouched, so the separate df.copy() that used
# to precede it only duplicated the whole frame in memory and has been removed.
dfl = df.drop(columns=['ID', 'Case Number', 'IUCR', 'Location', 'Latitude', 'Longitude', 'Primary Type', 'FBI Code', 'Description', 'Block', 'Date'])

# Drop rows with missing values (about 2,700 of roughly 6 million rows, so removing them is harmless).

dfl = dfl.dropna(axis=0)

dfl.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3796494 entries, 0 to 3799676
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Location Description  object 
 1   Arrest                object 
 2   Domestic              object 
 3   Beat                  float64
 4   District              float64
 5   Ward                  float64
 6   X Coordinate          float64
 7   Y Coordinate          float64
 8   Year                  float64
 9   Community Area Code   float64
 10  Is_narcotics          bool   
dtypes: bool(1), float64(7), object(3)
memory usage: 322.2+ MB


In [None]:
# First-level split into training data and test data (80/20, stratified on the target).

from sklearn.model_selection import train_test_split

# pop() removes the target column from dfl, leaving only the features behind.
target = dfl.pop('Is_narcotics')

x_train_log, x_test_log, y_train_log, y_test_log = train_test_split(
    dfl,
    target,
    test_size=0.2,
    stratify=target,
    random_state=0,
)

print(x_train_log.shape, x_test_log.shape, y_train_log.shape, y_test_log.shape)

(3037195, 10) (759299, 10) (3037195,) (759299,)


In [None]:
# Encode string and categorical features (one-hot encoding, because logistic regression is a linear model).
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

cols = ['Beat', 'District', 'Ward', 'Community Area Code', 'Location Description']
# Continuous features: standardised so that features on very different scales do not hamper
# the logistic-regression solver.
num_cols = ['X Coordinate', 'Y Coordinate', 'Year']

ct = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), cols),
        ('scale', StandardScaler(), num_cols),
    ],
    # Every column not listed above (Arrest, Domestic) is forwarded unchanged.
    # NOTE(review): passthrough columns must be boolean/numeric to be stacked with the
    # sparse one-hot output - confirm the dtype of Arrest/Domestic.
    remainder='passthrough')

# Fit on the FULL feature frame. The previous code passed only x_train_log[cols], which left
# nothing for remainder='passthrough' to forward, so Arrest, Domestic, the X/Y coordinates
# and Year never reached the model.
x_train_ohe = ct.fit_transform(x_train_log)
x_test_ohe = ct.transform(x_test_log)

In [None]:
# Split the encoded training data once more into training and validation parts.
# Note: x_train_log and y_train_log are rebound here, so the first-level split
# and the encoding cell must be re-run before this cell is executed again.

x_train_log, x_val_log, y_train_log, y_val_log = train_test_split(
    x_train_ohe,
    y_train_log,
    test_size=0.2,
    stratify=y_train_log,
    random_state=0,
)

In [None]:
# Narcotics cases are far fewer than non-narcotics cases (class imbalance), so oversampling the
# narcotics class up to the non-narcotics count was planned here (currently disabled).
# (author's note, incomplete) when there are many categorical features
# from imblearn.over_sampling import SMOTE

# smote = SMOTE(random_state=0)
# x_smote, y_smote = smote.fit_resample(x_train_log, y_train_log)

In [None]:
# Instantiate and train the logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Default hyperparameters; the grid search below is kept for reference but disabled.
lr = LogisticRegression()

# NOTE(review): the grid lists penalty 'l1', which the default 'lbfgs' solver does not
# support - set a compatible solver before re-enabling the search.
# param_grid_lr = {
#     'C': [0.01, 0.1, 0.5],
#     'max_iter': [500, 1000, 1500],
#     'penalty' : ['l1', 'l2']
# }

# gs_lr = GridSearchCV(lr, param_grid_lr, scoring='precision')
# gs_lr.fit(x_train_log, y_train_log)

# print("Best parameter :", gs_lr.best_params_)
# print("Best score:", gs_lr.best_score_)

# Train on the training part of the train/validation split.
lr.fit(x_train_log, y_train_log)

In [None]:
# First-pass check of model fit on the validation data.
# Explicit imports replace the former wildcard import of sklearn.metrics.
# classification_report is imported here as well because the later report cells rely on it.
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score

pred = lr.predict(x_val_log)

accuracy_vallog = accuracy_score(y_val_log, pred)
precision_vallog = precision_score(y_val_log, pred)
f1_score_vallog = f1_score(y_val_log, pred)

print('accuracy :', accuracy_vallog, 'precision :', precision_vallog, 'f1_score :', f1_score_vallog)

# With default hyperparameters (no tuning): precision 0.6, f1_score 0.22


accuracy : 0.8912055366876345 precision : 0.6162595562185345 f1_score : 0.23080683458261558


In [None]:
# Evaluate on the test data to check for overfitting and overall model fit.

prediction = lr.predict(x_test_ohe)

accuracy_testlog, precision_testlog, f1_score_testlog = (
    score(y_test_log, prediction)
    for score in (accuracy_score, precision_score, f1_score)
)

print(
    'accuracy :', accuracy_testlog,
    'precision :', precision_testlog,
    'f1_score :', f1_score_testlog,
)

accuracy : 0.8910639945528704 precision : 0.6156181993015841 f1_score : 0.2272731519109143


In [None]:
# Per-class predictive performance on the test data.

report_lr = classification_report(
    y_test_log,
    prediction,
    target_names=['non-narcotic', 'narcotic'],
    output_dict=True,
)

# Transpose the report dict, then keep the three score columns rounded to two decimals.
report_df_lr = pd.DataFrame(report_lr).T
report_df_lr = report_df_lr.loc[:, ['precision', 'recall', 'f1-score']].round(2)
report_df_lr

Unnamed: 0,precision,recall,f1-score
non-narcotic,0.9,0.99,0.94
narcotic,0.62,0.14,0.23
accuracy,0.89,0.89,0.89
macro avg,0.76,0.56,0.58
weighted avg,0.87,0.89,0.86


### 1. 로지스틱 회귀 모델 해석
- importance 확인

## 앙상블 모델 진행

In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


### 랜덤 포레스트 모델 진행

In [None]:
# Keep only the columns needed for modelling.
# Dropped: ID and Case Number (unrelated to the crime details); FBI Code, Primary Type and IUCR
# (direct hints at the answer); Location (merely a derived column); and similar columns.
# Planar X/Y coordinates are used instead of latitude/longitude: they avoid spatial distortion and
# suit machine learning better, so the collinear Latitude/Longitude columns are dropped.
# A random forest copes well with other encodings, so binary encoding (which can handle a very
# large number of categories) is applied later; Block is therefore kept.

# DataFrame.drop returns a new frame and leaves df untouched, so the separate df.copy() that used
# to precede it only duplicated the whole frame in memory and has been removed.
dfr = df.drop(columns=['ID', 'Case Number', 'IUCR', 'Location', 'Latitude', 'Longitude', 'Primary Type', 'FBI Code', 'Description', 'Date'])

# Drop rows with missing values.

dfr = dfr.dropna(axis=0)

dfr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3796494 entries, 0 to 3799676
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Block                 object 
 1   Location Description  object 
 2   Arrest                object 
 3   Domestic              object 
 4   Beat                  float64
 5   District              float64
 6   Ward                  float64
 7   X Coordinate          float64
 8   Y Coordinate          float64
 9   Year                  float64
 10  Community Area Code   float64
 11  Is_narcotics          bool   
dtypes: bool(1), float64(7), object(4)
memory usage: 351.2+ MB


In [None]:
# First-level split into training data and test data (80/20, stratified on the target).

from sklearn.model_selection import train_test_split

# pop() removes the target column from dfr, leaving only the features behind.
target = dfr.pop('Is_narcotics')

x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(
    dfr,
    target,
    test_size=0.2,
    stratify=target,
    random_state=0,
)

print(x_train_rf.shape, x_test_rf.shape, y_train_rf.shape, y_test_rf.shape)

(3037195, 11) (759299, 11) (3037195,) (759299,)


In [None]:
# Apply binary encoding to the categorical columns.
import category_encoders as ce

cols = [
    'Block',
    'Location Description',
    'Beat',
    'District',
    'Ward',
    'Community Area Code',
]
bc = ce.BinaryEncoder(cols=cols)

# The encoder is fitted on the training features only and then reused for the test features.
x_train_bc = bc.fit_transform(x_train_rf)
x_test_bc = bc.transform(x_test_rf)

In [None]:
# Split the encoded training data into training and validation parts (80/20, stratified).

x_train_rfc, x_val_rfc, y_train_rfc, y_val_rfc = train_test_split(
    x_train_bc,
    y_train_rf,
    test_size=0.2,
    stratify=y_train_rf,
    random_state=0,
)

In [None]:
# Instantiate and train the random forest model
# (the previous header comment mistakenly said "logistic regression").
from sklearn.ensemble import RandomForestClassifier

# n_jobs=-1 trains the trees on all CPU cores, matching the XGBoost and LightGBM cells;
# the fixed random_state keeps the fitted forest reproducible.
rfc = RandomForestClassifier(random_state=0, n_jobs=-1)

# Grid search kept for reference (disabled).
# param_grid_rfc = {'max_depth' : [5, 7, 9],
#               'n_estimators' : [300, 500, 700],
#               'ccp_alpha' : [0.01, 0.03, 0.05]}

# gs_rfc = GridSearchCV(rfc, param_grid_rfc, scoring='precision')
# gs_rfc.fit(x_train_rfc, y_train_rfc)

# print("Best parameter:", gs_rfc.best_params_)
# print("Best CV score:", gs_rfc.best_score_)

rfc.fit(x_train_rfc, y_train_rfc)

In [None]:
# First-pass check of model fit on the validation data.
pred = rfc.predict(x_val_rfc)

accuracy_valrfc, precision_valrfc, f1_score_valrfc = (
    score(y_val_rfc, pred)
    for score in (accuracy_score, precision_score, f1_score)
)

print(
    'accuracy :', accuracy_valrfc,
    'precision :', precision_valrfc,
    'f1_score :', f1_score_valrfc,
)

# With default hyperparameters (no tuning): accuracy 0.93, precision 0.69, f1 0.67

accuracy : 0.930628802834315 precision : 0.6937544218221838 f1_score : 0.6735000500818003


In [None]:
# Second evaluation, on the held-out test data.
# Explicit imports replace the redundant wildcard re-import of sklearn.metrics;
# classification_report stays available for the report cell further down.
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score

# prediction = gs_rfc.predict(x_test_bc)

prediction = rfc.predict(x_test_bc)

accuracy_testrfc = accuracy_score(y_test_rf, prediction)
precision_testrfc = precision_score(y_test_rf, prediction)
f1_score_testrfc = f1_score(y_test_rf, prediction)

print('accuracy :', accuracy_testrfc, 'precision :', precision_testrfc, 'f1_score :', f1_score_testrfc)

accuracy : 0.9311657093286744 precision : 0.6962777284478355 f1_score : 0.676089759730202


In [None]:
# Inspect the random forest's feature importances, most important first.

feature_names = x_train_rfc.columns
importance_rfc = rfc.feature_importances_

# One row per encoded feature, then sort by importance in descending order.
importance_df_rfc = pd.DataFrame({'Feature': feature_names, 'Importance': importance_rfc})
importance_df_rfc = importance_df_rfc.sort_values('Importance', ascending=False)

importance_df_rfc

Unnamed: 0,Feature,Importance
24,Arrest,0.331642
48,Year,0.117813
47,Y Coordinate,0.110896
46,X Coordinate,0.106881
23,Location Description_7,0.046656
25,Domestic,0.029561
22,Location Description_6,0.019523
20,Location Description_4,0.017133
21,Location Description_5,0.013628
19,Location Description_3,0.01258


In [None]:
# Per-class predictive performance on the test data.

report_rfc = classification_report(
    y_test_rf,
    prediction,
    target_names=['non-narcotic', 'narcotic'],
    output_dict=True,
)

# Transpose the report dict, then keep the three score columns rounded to two decimals.
report_df_rfc = pd.DataFrame(report_rfc).T
report_df_rfc = report_df_rfc.loc[:, ['precision', 'recall', 'f1-score']].round(2)
report_df_rfc

Unnamed: 0,precision,recall,f1-score
non-narcotic,0.96,0.96,0.96
narcotic,0.7,0.66,0.68
accuracy,0.93,0.93,0.93
macro avg,0.83,0.81,0.82
weighted avg,0.93,0.93,0.93


### 부스팅 모델 진행

### xgboost

In [None]:
# Feature frame for XGBoost: drop identifiers, label-leaking columns, derived/collinear location
# columns and Date. DataFrame.drop returns a new frame and leaves df untouched, so the separate
# df.copy() that used to precede it only duplicated the whole frame in memory and has been removed.
dfx = df.drop(columns=['ID', 'Case Number', 'IUCR', 'Location', 'Latitude', 'Longitude', 'Primary Type', 'FBI Code', 'Description', 'Date'])

# Drop rows with missing values.

dfx = dfx.dropna(axis=0)

dfx.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6167273 entries, 0 to 6170812
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Block                 object 
 1   Location Description  object 
 2   Arrest                bool   
 3   Domestic              bool   
 4   Beat                  float64
 5   District              float64
 6   Ward                  float64
 7   X Coordinate          float64
 8   Y Coordinate          float64
 9   Year                  int64  
 10  Community Area Code   float64
 11  Is_narcotics          bool   
dtypes: bool(3), float64(6), int64(1), object(2)
memory usage: 488.2+ MB


In [None]:
# First-level split into training data and test data (80/20, stratified on the target).

from sklearn.model_selection import train_test_split

# pop() removes the target column from dfx, leaving only the features behind.
target = dfx.pop('Is_narcotics')

x_train_xg, x_test_xg, y_train_xg, y_test_xg = train_test_split(
    dfx,
    target,
    test_size=0.2,
    stratify=target,
    random_state=0,
)

print(x_train_xg.shape, x_test_xg.shape, y_train_xg.shape, y_test_xg.shape)

(4933818, 11) (1233455, 11) (4933818,) (1233455,)


In [None]:
import category_encoders as ce

# Binary-encode the categorical columns for XGBoost.
cols = [
    'Block',
    'Location Description',
    'Beat',
    'District',
    'Ward',
    'Community Area Code',
]
bc = ce.BinaryEncoder(cols=cols)

# The encoder is fitted on the training features only and then reused for the test features.
x_train_bc2 = bc.fit_transform(x_train_xg)
x_test_bc2 = bc.transform(x_test_xg)

In [None]:
# Split the encoded training data into training and validation parts (80/20, stratified).

x_train_xgb, x_val_xgb, y_train_xgb, y_val_xgb = train_test_split(
    x_train_bc2,
    y_train_xg,
    test_size=0.2,
    stratify=y_train_xg,
    random_state=0,
)

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with a fixed seed; n_jobs=-1 requests all available CPU cores.
xgbc = XGBClassifier(random_state=0, n_jobs = -1)

# Grid search kept for reference (disabled).
# param_grid_xgbc = {'booster' : ['gblinear', 'gbtree'],
#                    'learning_rate' : [0.01, 0.05, 0.1]}

# gs_xgbc = GridSearchCV(xgbc, param_grid_xgbc, scoring='precision')
# gs_xgbc.fit(x_train_xgb, y_train_xgb)

# print("Best parameter:", gs_xgbc.best_params_)
# print("Best CV score:", gs_xgbc.best_score_)

# Train on the training part of the train/validation split.
xgbc.fit(x_train_xgb, y_train_xgb)

In [None]:
# First-pass check of model fit on the validation data.
pred = xgbc.predict(x_val_xgb)

accuracy_valxgb, precision_valxgb, f1_score_valxgb = (
    score(y_val_xgb, pred)
    for score in (accuracy_score, precision_score, f1_score)
)

print(
    'accuracy :', accuracy_valxgb,
    'precision :', precision_valxgb,
    'f1_score :', f1_score_valxgb,
)

# With default hyperparameters (no tuning): accuracy 0.93, precision 0.70, f1 0.70

accuracy : 0.9348820994685659 precision : 0.7035595987870306 f1_score : 0.7012238217460849


In [None]:
# Second evaluation, on the held-out test data.
# Explicit imports replace the redundant wildcard re-import of sklearn.metrics;
# classification_report stays available for the report cell further down.
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score

# Disabled grid-search variant (names corrected: the search object is gs_xgbc and the
# encoded XGBoost test features are x_test_bc2).
# prediction = gs_xgbc.predict(x_test_bc2)

prediction = xgbc.predict(x_test_bc2)

accuracy_testxgb = accuracy_score(y_test_xg, prediction)
precision_testxgb = precision_score(y_test_xg, prediction)
f1_score_testxgb = f1_score(y_test_xg, prediction)

print('accuracy :', accuracy_testxgb, 'precision :', precision_testxgb, 'f1_score :', f1_score_testxgb)

accuracy : 0.9353815096618847 precision : 0.7058665452401036 f1_score : 0.7035086153022052


In [None]:
# Inspect the XGBoost model's feature importances, most important first.

feature_names = x_train_xgb.columns
importance_xgb = xgbc.feature_importances_

# One row per encoded feature, then sort by importance in descending order.
importance_df_xgb = pd.DataFrame({'Feature': feature_names, 'Importance': importance_xgb})
importance_df_xgb = importance_df_xgb.sort_values('Importance', ascending=False)

importance_df_xgb

Unnamed: 0,Feature,Importance
24,Arrest,0.737531
25,Domestic,0.105164
23,Location Description_7,0.029814
20,Location Description_4,0.019714
36,District_1,0.01446
22,Location Description_6,0.01116
19,Location Description_3,0.009196
21,Location Description_5,0.006868
18,Location Description_2,0.005415
35,District_0,0.005171


In [None]:
# Per-class predictive performance on the test data.

report_xgbc = classification_report(
    y_test_xg,
    prediction,
    target_names=['non-narcotic', 'narcotic'],
    output_dict=True,
)

# Transpose the report dict, then keep the three score columns rounded to two decimals.
report_df_xgbc = pd.DataFrame(report_xgbc).T
report_df_xgbc = report_df_xgbc.loc[:, ['precision', 'recall', 'f1-score']].round(2)
report_df_xgbc

Unnamed: 0,precision,recall,f1-score
non-narcotic,0.96,0.96,0.96
narcotic,0.71,0.7,0.7
accuracy,0.94,0.94,0.94
macro avg,0.83,0.83,0.83
weighted avg,0.94,0.94,0.94


### lightGBM

In [None]:
# Feature frame for LightGBM: drop identifiers, label-leaking columns, derived/collinear location
# columns and Date. DataFrame.drop returns a new frame and leaves df untouched, so the separate
# df.copy() that used to precede it only duplicated the whole frame in memory and has been removed.
dfm = df.drop(columns=['ID', 'Case Number', 'IUCR', 'Location', 'Latitude', 'Longitude', 'Primary Type', 'FBI Code', 'Description', 'Date'])

# Drop rows with missing values.

dfm = dfm.dropna(axis=0)

dfm.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6167273 entries, 0 to 6170812
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Block                 object 
 1   Location Description  object 
 2   Arrest                bool   
 3   Domestic              bool   
 4   Beat                  float64
 5   District              float64
 6   Ward                  float64
 7   X Coordinate          float64
 8   Y Coordinate          float64
 9   Year                  int64  
 10  Community Area Code   float64
 11  Is_narcotics          bool   
dtypes: bool(3), float64(6), int64(1), object(2)
memory usage: 488.2+ MB


In [None]:
# Convert Ward, Community Area Code and Beat (which carry no quantitative meaning) to category dtype (currently disabled)

# dfm['Ward'] = dfm['Ward'].astype('category')
# dfm['Community Area Code'] = dfm['Community Area Code'].astype('category')
# dfm['Beat'] = dfm['Beat'].astype('category')

In [None]:
# First-level split into training data and test data (80/20, stratified on the target).

from sklearn.model_selection import train_test_split

# pop() removes the target column from dfm, leaving only the features behind.
target = dfm.pop('Is_narcotics')

x_train_lgb, x_test_lgb, y_train_lgb, y_test_lgb = train_test_split(
    dfm,
    target,
    test_size=0.2,
    stratify=target,
    random_state=0,
)

print(x_train_lgb.shape, x_test_lgb.shape, y_train_lgb.shape, y_test_lgb.shape)

(4933818, 11) (1233455, 11) (4933818,) (1233455,)


In [None]:
import category_encoders as ce

# Binary-encode the categorical columns for LightGBM.
cols = [
    'Block',
    'Location Description',
    'Beat',
    'District',
    'Ward',
    'Community Area Code',
]
bc = ce.BinaryEncoder(cols=cols)

# The encoder is fitted on the training features only and then reused for the test features.
x_train_bc3 = bc.fit_transform(x_train_lgb)
x_test_bc3 = bc.transform(x_test_lgb)

In [None]:
# Split the encoded training data into training and validation parts (80/20, stratified).
# Note: x_train_lgb and y_train_lgb are rebound here, so the first-level split
# and the encoding cell must be re-run before this cell is executed again.

x_train_lgb, x_val_lgb, y_train_lgb, y_val_lgb = train_test_split(
    x_train_bc3,
    y_train_lgb,
    test_size=0.2,
    stratify=y_train_lgb,
    random_state=0,
)

In [None]:
import lightgbm as lgbm

# random_state is fixed for reproducibility, consistent with the random forest and
# XGBoost models; n_jobs=-1 requests all available CPU cores.
lgb = lgbm.LGBMClassifier(random_state=0, n_jobs=-1)

# Grid search kept for reference (disabled). The estimator passed to GridSearchCV is now
# lgb; the earlier draft passed the XGBoost model xgbc by mistake.
# param_grid_lgb = {'n_estimators' : [100, 200],
#                    'learning_rate' : [0.01, 0.05, 0.1]}

# gs_lgb = GridSearchCV(lgb, param_grid_lgb, scoring='precision')
# gs_lgb.fit(x_train_lgb, y_train_lgb)

# print("Best parameter:", gs_lgb.best_params_)
# print("Best CV score:", gs_lgb.best_score_)

# Train on the training part of the train/validation split.
lgb.fit(x_train_lgb, y_train_lgb)

[LightGBM] [Info] Number of positive: 431557, number of negative: 3515497
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.275568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 634
[LightGBM] [Info] Number of data points in the train set: 3947054, number of used features: 56
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.109336 -> initscore=-2.097537
[LightGBM] [Info] Start training from score -2.097537


In [None]:
# First-pass check of model fit on the validation data.
pred = lgb.predict(x_val_lgb)

accuracy_vallgb, precision_vallgb, f1_score_vallgb = (
    score(y_val_lgb, pred)
    for score in (accuracy_score, precision_score, f1_score)
)

print(
    'accuracy :', accuracy_vallgb,
    'precision :', precision_vallgb,
    'f1_score :', f1_score_vallgb,
)

# With default hyperparameters (no tuning): accuracy 0.93, precision 0.7, f1 0.69

accuracy : 0.932524899570718 precision : 0.6965408954655755 f1_score : 0.687370289331092


In [None]:
# Second evaluation, on the held-out test data.
# Explicit imports replace the redundant wildcard re-import of sklearn.metrics;
# classification_report stays available for the report cell further down.
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score

# Disabled grid-search variant (name corrected: the encoded LightGBM test features
# are x_test_bc3).
# prediction = gs_lgb.predict(x_test_bc3)

prediction = lgb.predict(x_test_bc3)

accuracy_testlgb = accuracy_score(y_test_lgb, prediction)
precision_testlgb = precision_score(y_test_lgb, prediction)
f1_score_testlgb = f1_score(y_test_lgb, prediction)

print('accuracy :', accuracy_testlgb, 'precision :', precision_testlgb, 'f1_score :', f1_score_testlgb)

accuracy : 0.9329233737752898 precision : 0.6985094180103739 f1_score : 0.689140040277736


In [None]:
# Per-class predictive performance of the LightGBM model on the test data.
report_lgb = classification_report(
    y_test_lgb,
    prediction,
    target_names=['non-narcotic', 'narcotic'],
    output_dict=True,
)

# Transpose the report dict, then keep the three score columns rounded to two decimals.
report_df_lgb = pd.DataFrame(report_lgb).T
report_df_lgb = report_df_lgb.loc[:, ['precision', 'recall', 'f1-score']].round(2)
report_df_lgb

Unnamed: 0,precision,recall,f1-score
non-narcotic,0.96,0.96,0.96
narcotic,0.7,0.68,0.69
accuracy,0.93,0.93,0.93
macro avg,0.83,0.82,0.83
weighted avg,0.93,0.93,0.93
