## 유기동물 데이터를 이용한 입양률 예측

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn import metrics, preprocessing
from scipy.stats import itemfreq
import mglearn

### 1. Data Loading

In [None]:
# Load the 2015 lost-animal records (the CSV is read as EUC-KR).
# Raw strings keep the Windows backslashes literal: "\p" and "\l" are invalid
# escape sequences and trigger a warning on recent Python versions.
# Other years, currently disabled:
#df4=pd.read_csv(r"D:\project_imsi\lostAnimal_20180101_20181231_vol3.csv", encoding="euc-kr")
#df3=pd.read_csv(r"D:\project_imsi\lostAnimal_20170101_20171231_vol3.csv", encoding="euc-kr")
#df2=pd.read_csv(r"D:\project_imsi\lostAnimal_20160101_20161231_vol3.csv", encoding="euc-kr")
df1 = pd.read_csv(r"D:\project_imsi\lostAnimal_20150101_20151231_vol3.csv", encoding="euc-kr")


In [None]:
# Show the last row of the loaded frame.
df1.tail(1)

In [None]:
# List the column names of the raw frame.
df1.columns

### 2. Data PreProcessing

#### - 필요없는 컬럼 삭제

In [None]:
# Columns that are not used for modelling and are removed from the raw frame.
unused_columns = [
    'age(before)', 'careAddr', 'careNm', 'careTel', 'chargeNm',
    'desertionNo', 'filename', 'happenDt', 'happenPlace', 'kindCd',
    'noticeComment', 'noticeNo', 'noticeSdt', 'officetel', 'popfile',
    'processState', 'weight(before)', 'specialMark', 'breed',
]
df = df1.drop(columns=unused_columns)
df.head(3)
# Previously also listed for removal: 'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1',

In [None]:
# Report how many columns remain after the drop and what they are called.
print("사용할 컬럼 갯수 : ",len(df.columns))
print("컬럼 이름 : ", df.columns)

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

#### - 결측치 처리

In [None]:
# Earlier exploration of missing values, kept for reference:
#print(df.isnull().sum()) # number of missing values per column
#df.isnull().any()
#null_columns=df.columns[df.isnull().any()]
#df[null_columns].isnull().sum()
#print(df[df["breed_Pre"].isnull()][null_columns])        
# Count the missing values in the weight and size columns.
print(df['weight(after)'].isnull().sum())       
print(df['size'].isnull().sum())                

In [None]:
# weight(after) : 나이별로 묶어서 중위값넣기
df["weight(after)"].fillna(df.groupby("age_u")["weight(after)"].transform("median"), inplace=True)
df.loc[pd.isnull(df["weight(after)"])]

In [None]:
# size : 몸무게가 비어서 전처리 되지 않은 행 재전처리
df.loc[df['weight(after)'] <= 3, 'size'] = '초소형'
df.loc[(df['weight(after)'] > 3 ) & (df['weight(after)'] <=9 ), 'size'] = '소형'
df.loc[(df['weight(after)'] > 9 ) & (df['weight(after)'] <=25 ), 'size'] = '중형'
df.loc[df['weight(after)'] > 25, 'size'] = '대형'

df.loc[pd.isnull(df["size"])]

In [None]:
# 동물 종 : 몇개 안되니 기타로 입력
df['breed_Pre'].fillna("기타",inplace=True)
df.loc[pd.isnull(df["breed_Pre"])]

In [None]:
# Missing-value count per column after the fills above.
df.isnull().sum()

#### - colorCd : 대표적 색상으로 분류

#### - neuterYn : 문자 -> 숫자

In [None]:
# Encode the neuter flag (Y/N/U) as an integer code; any value that is not a
# key of the mapping becomes NaN.
neuter_mapping = {"Y":0,"N":1,"U":2}
df['neuterYn'] = df['neuterYn'].map(neuter_mapping)
# Spot-check two encoded rows.
df.neuterYn[1:3]

#### - sexCd : 문자 -> 숫자

In [None]:
# Encode the sex code (M/F/Q) as an integer code; any value that is not a key
# of the mapping becomes NaN.
sex_mapping = {"M":0,"F":1,"Q":2}
df['sexCd'] = df['sexCd'].map(sex_mapping)
# Spot-check two encoded rows.
df.sexCd[1:3]

#### - orgNm(담당지역주소) : 두분류로 나눈뒤, 숫자 mapping

In [None]:
# Take the first space-separated token of orgNm as the province/city ("sido")
# and show how often each one occurs.
df['sido'] = df['orgNm'].str.split(" ").str[0]
df['sido'].value_counts()

In [None]:
# Province/city names in code order: the position in this list is the integer
# code assigned to the name.
sido_order = [
    "경기도", "서울특별시", "부산광역시", "경상남도",
    "인천광역시", "충청남도", "강원도", "대구광역시",
    "전라북도", "경상북도", "대전광역시", "울산광역시",
    "충청북도", "전라남도", "제주특별자치도", "광주광역시",
    "세종특별자치시",
]
sido_mapping = {name: code for code, name in enumerate(sido_order)}
# Names that are not in the mapping become NaN.
df['sido'] = df['sido'].map(sido_mapping)
df['sido'].head(3)

#### - breed_Pre(유기동물종류) : 

#### - happenWd(발견요일) : 문자 -> 숫자 mapping 

In [None]:
# Encode the weekday on which the animal was found as a contiguous 0..6 code
# (Monday=0 ... Sunday=6). The earlier mapping jumped from 0 to 2, leaving a
# gap that made Monday look twice as far from Tuesday as any other adjacent
# pair of days.
week_mapping = {"Monday":0, "Tuesday":1, "Wednesday":2,
                "Thursday":3, "Friday":4, "Saturday":5, "Sunday":6}
df['happenWd'] = df['happenWd'].map(week_mapping)
df.tail(3)

#### - size : 문자 -> 숫자 mapping

In [None]:
# Rows that still have no size default to '중형' (medium) before encoding.
# Assigned back rather than filled in place on the column selection, because
# chained fillna(inplace=True) is deprecated and a no-op under copy-on-write.
df['size'] = df['size'].fillna('중형')
# Encode the size label as an integer code.
size_mapping = {"대형":0,"소형":1,"중형":2,"초소형":3}
df['size'] = df['size'].map(size_mapping)

df.head(3)

#### - age : 문자 -> 숫자 mapping 

In [None]:
# Encode the age-group label as an integer code; labels that are not keys of
# the mapping become NaN.
age_mapping = {"노견기":0,"성견기":1,"유견기":2}
df['age_u'] = df['age_u'].map(age_mapping)

df.head(2)

#### - processState_Pre : 문자 -> 숫자 mapping 

In [None]:
# Encode the outcome letter (C/A/D/R/E) as the integer class 0..4; the
# human-readable names of these classes are listed in target_Pre further down.
proc_mapping = {"C":0, "A":1, "D":2, "R":3, "E":4}
df['processState_Pre'] = df['processState_Pre'].map(proc_mapping)

df.head(2)

In [None]:
# Preview the fully encoded frame.
df.head()

### 3. Data Statistical Analysis

In [None]:
# Descriptive statistics of the encoded frame.
df.describe()

In [None]:
# Column names available for the analysis below.
df.columns

In [None]:
# Correlation of every numeric column with the outcome (processState) columns.
# Only numeric/bool columns are kept before calling corr(): df still contains
# text columns (e.g. orgNm), and DataFrame.corr() raises on non-numeric data
# from pandas 2.0 onwards instead of silently skipping it.
numeric_df = df.select_dtypes(include=["number", "bool"])
cor_t = np.round(numeric_df.corr(), 3)
outcome_columns = ['processState_Pre', 'processState_C', 'processState_A',
                   'processState_D', 'processState_R', 'processState_E']
cor1 = cor_t.loc[:, outcome_columns]
cor1

### 4. Data Learning

In [None]:
# null값 처리
# df.isnull().any()
#null_columns=df.columns[df.isnull().any()]
#df[null_columns].isnull().sum()
#print(df[df["breed_Pre"].isnull()][null_columns])

#### - feature 선택

In [None]:
# feature_name = ['age(after)','neuterYn','sexCd','weight(after)', 'kind', 'happenWd', 'happenMth', 
#                 'size', 'processState_Pre', 'sido']
# df_Pre = df[feature_name]

# feature_name2 = ['age(after)','neuterYn','sexCd','weight(after)', 'kind', 'happenWd', 'happenMth', 
#                 'size', 'processState_A', 'sido']
# df_A = df[feature_name2]

# df_Pre.head(3)
# #df_A.head(3)

In [None]:
# Predictor columns shared by both modelling frames.
# NOTE(review): 'sido' ends up in each list twice (once in the shared part and
# once after the target), so both frames carry a duplicated 'sido' column and
# 19 predictor columns — confirm whether that is intended.
shared_features = [
    'kind', 'happenWd', 'happenMth', 'size', 'age_u',
    'sexCd_M', 'sexCd_F', 'sexCd_Q',
    'neuterYn_Y', 'neuterYn_N', 'neuterYn_U',
    'careNm_ETC', 'careNm_H', 'careNm_C', 'careNm_O', 'careNm_AD', 'careNm_CM',
    'sido',
]

# Frame whose target column is the multi-class processState_Pre.
feature_name = shared_features + ['processState_Pre', 'sido']
df_Pre = df[feature_name]

# Frame whose target column is processState_A.
feature_name2 = shared_features + ['processState_A', 'sido']
df_A = df[feature_name2]

df_Pre.head(3)
df_A.head(3)

#### - 종속변수 독립변수 추출

###### 1) processState_Pre 기준

In [None]:
# Split df_Pre into the model inputs and the label to predict.
X=np.array(df_Pre.drop(columns='processState_Pre')) # feature matrix (independent variables)
Y=np.array(df_Pre.processState_Pre) # target labels (dependent variable)

In [None]:
# Class distribution of the processState_Pre target.
print(df['processState_Pre'].value_counts())
# np.unique(..., return_counts=True) replaces scipy.stats.itemfreq, which was
# removed in SciPy 1.3. The stacked result has the same layout itemfreq
# produced: column 0 is the class value, column 1 its count.
table = np.column_stack(np.unique(Y, return_counts=True))
x_ticks = ['0','1','2','3','4']
# 0: in care, 1: adopted/donated, 2: natural death/euthanised, 3: returned, 4: released/not captured
plt.bar(x_ticks, table[:,1], color='skyblue')
table[:,1]

###### 2) processState_A 기준

In [None]:
# Split df_A into the model inputs and the label to predict.
Z=np.array(df_A.drop(columns='processState_A')) # feature matrix (independent variables)
Q=np.array(df_A.processState_A) # target labels (dependent variable)

In [None]:
# Class distribution of the processState_A target.
print(df_A['processState_A'].value_counts())
# np.unique(..., return_counts=True) replaces scipy.stats.itemfreq, which was
# removed in SciPy 1.3. The stacked result has the same layout itemfreq
# produced: column 0 is the class value, column 1 its count.
table = np.column_stack(np.unique(Q, return_counts=True))
x_ticks = ['0','1'] # 0: not adopted, 1: adopted
plt.bar(x_ticks, table[:,1], color='pink')
table[:,1]

##### - 데이터셋 나누기

In [None]:
# Hold out 30% of each dataset for testing; the same seed is used for both splits.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.3, random_state=5)
Z_train, Z_test, Q_train, Q_test = train_test_split(Z,Q,test_size=0.3, random_state=5)

# train_test_split
# : splits the data into a train set and a test set
# : the inputs and labels are shuffled together, then divided by the test_size ratio
# test_size=0.3 means 70% of the shuffled rows go to train and 30% to test
# random_state : seed for the random number generator used for shuffling

In [None]:
# Shapes of the processState_Pre train/test splits.
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

In [None]:
# Shapes of the processState_A train/test splits.
print(Z_train.shape)
print(Z_test.shape)
print(Q_train.shape)
print(Q_test.shape)

#### - 모형 적용 : SVM

In [None]:
# Hyper-parameter grid for the RBF-kernel SVM.
C_grid = [0.001, 0.01, 0.1, 1, 10]
gamma_grid = [0.001, 0.01, 0.1, 1]
parameters = {'C': C_grid, 'gamma' : gamma_grid}

# 10-fold grid search for the multi-class target (processState_Pre).
gridCV_Pre = GridSearchCV(SVC(kernel='rbf'), parameters, cv=10)
gridCV_Pre.fit(X_train, Y_train)

# Same search for the adoption target (processState_A). It has to be tuned on
# its own split (Z_train, Q_train); it was previously fitted on
# X_train/Y_train, which merely repeated the first search.
gridCV_A = GridSearchCV(SVC(kernel='rbf'), parameters, cv=10)
gridCV_A.fit(Z_train, Q_train)

# Best parameters found for each target.
best_C_Pre = gridCV_Pre.best_params_['C']
best_gamma_Pre = gridCV_Pre.best_params_['gamma']

best_C_A = gridCV_A.best_params_['C']
best_gamma_A = gridCV_A.best_params_['gamma']

In [None]:
# Best parameters for the processState_Pre model.
print("SVM best C : " + str(best_C_Pre))
print("SVM best gamma : " + str(best_gamma_Pre))

# Best parameters for the processState_A model.
print("SVM best C : " + str(best_C_A))
print("SVM best gamma : " + str(best_gamma_A))

In [None]:
# Refit one SVM per target with the parameters selected by its own grid search.
# The earlier code referenced best_C / best_gamma, which are never defined.
SVM_best_Pre = SVC(C=best_C_Pre, gamma=best_gamma_Pre)
SVM_best_Pre.fit(X_train, Y_train)

SVM_best_A = SVC(C=best_C_A, gamma=best_gamma_A)
# The trailing semicolon suppresses the estimator repr in the notebook output.
SVM_best_A.fit(Z_train, Q_train);

#### - 예측

In [None]:
# X_new = np.array([[5,1,0,4.5,0,1,11,2,12]])
# Z_new = np.array([[5,1,0,4.5,0,1,11,2,12]])
# print("X_new.shape : {}".format(X_new.shape))
# print("Z_new.shape : {}".format(Z_new.shape))

In [None]:
# One hand-written sample with 19 feature values, used as input for both models.
sample_features = [0, 3, 11, 2, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 4, 3]
# Each model gets its own (1, 19) array built from the same values.
X_new = np.array([sample_features])
Z_new = np.array([sample_features])
print("X_new.shape : {}".format(X_new.shape))
print("Z_new.shape : {}".format(Z_new.shape))

In [None]:
# Human-readable names of the predicted classes; the list position is the class code.
target_Pre = dict(enumerate(['보호중', '입양기증', '자연사안락사', '반환', '방사미포획']))
target_A = dict(enumerate(['입양X', '입양O']))

In [None]:
# Predict the outcome class of the hand-written sample with the tuned SVMs.
# This section trains SVM_best_Pre / SVM_best_A; the RF_best_* names used here
# before are not defined in this notebook.
prediction_Pre = SVM_best_Pre.predict(X_new)
print("예측:{}".format(prediction_Pre))
# predict() returns a one-element array; index it explicitly because calling
# int() on a whole array is deprecated in NumPy.
p_Pre = int(prediction_Pre[0])
print("예측한 타깃의 이름 : {}".format(target_Pre[p_Pre]))

# The adoption model gets its own sample array (Z_new).
prediction_A = SVM_best_A.predict(Z_new)
print("예측:{}".format(prediction_A))
p_A = int(prediction_A[0])
print("예측한 타깃의 이름 : {}".format(target_A[p_A]))

In [None]:
# Predict on the held-out test sets: these rows were not used for training and
# their true outcome is known, so they can be used to check the models.
# The tuned SVMs are used; the RF_best_* names referenced here before are not
# defined in this notebook.
Y_pred = SVM_best_Pre.predict(X_test)
print("테스트 셋에 대한 예측값: {}".format(Y_pred))

Q_pred = SVM_best_A.predict(Z_test)
print("테스트 셋에 대한 예측값: {}".format(Q_pred))

##### - 검증

In [None]:
# 예측값 y_pred와 기존의 정답인 y_test가 맞는지 확인
print("테스트 셋에 대한 정확도 : {:.2f}".format(np.mean(Y_pred == Y_test)))
print("테스트 셋에 대한 정확도 : {:.2f}".format(np.mean(Q_pred == Q_test)))

In [None]:
# Test-set accuracy as reported by each estimator's own score() method.
# The tuned SVMs are used; the RF_best_* names referenced here before are not
# defined in this notebook.
print("테스트 셋트에 대한 정확도 : {:.2f}".format(SVM_best_Pre.score(X_test, Y_test)))
print("테스트 셋트에 대한 정확도 : {:.2f}".format(SVM_best_A.score(Z_test, Q_test)))