In [22]:
# 모델 분석에 필요한 라이브러리 import

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rcParams
import platform

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest

# 모듈로 구현한 데이터 분석용 클래스 import 
from modules.DataAnalysis import DataCheck
from modules.DataAnalysis import DataPreprocessing
from modules.DataAnalysis import DataVisualize
import modules.DataModify as DataModify

In [23]:
### 데이터 로드

input_file_path = './data/Tree_data.csv'

df = pd.read_csv(input_file_path)

### 객체 인스턴스 생성
dc = DataCheck(df)
dp = DataPreprocessing(df)
dv = DataVisualize(df)

In [24]:
drop_cols = ['No', 'Plot', 'Adult', 'Subplot', 'Core', 'Census', 'Time']
dp.set_drop_cols(drop_cols)

encode = dp.run(encoding='label', return_anomaly=False)

In [25]:
alive_data = encode[encode['Alive'] != 1]
censored_data = encode[encode['Alive'] == 1]

train_set, test_set = DataModify.train_test_split_ignore_censored(alive_data, censored_data)

# (train_set[0] : X_train, train_set[1] : y_train, test_set[0] : X_test, test_set[1] : y_test)

print(train_set[1].value_counts())

Alive
0    1644
1     679
Name: count, dtype: int64


In [26]:
train_set[0]

Unnamed: 0,Species,Light_ISF,Light_Cat,Soil,Sterile,Conspecific,Myco,SoilMyco,PlantDate,AMF,EMF,Phenolics,Lignin,NSC
2630,3,0.079,0,5,0,0,0,0,0,37.60,0.00,2.09,7.83,11.34
1546,3,0.087,0,6,0,0,0,1,0,28.46,0.00,2.00,10.38,13.66
586,1,0.073,1,0,0,0,1,0,0,0.00,17.24,5.22,21.52,17.49
1785,2,0.032,1,2,0,0,1,0,1,12.68,29.10,4.93,23.42,16.56
1522,0,0.106,0,0,0,0,0,0,0,22.00,0.00,0.79,13.86,12.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,1,0.061,1,0,0,0,1,0,1,18.67,28.69,5.05,18.98,20.78
494,2,0.079,0,0,0,0,1,0,1,18.09,31.83,4.49,26.05,12.98
2267,3,0.110,0,4,1,1,0,2,0,11.97,0.00,1.35,8.43,11.83
1923,1,0.086,0,4,1,1,1,2,1,5.33,5.07,4.57,18.69,20.69


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

X_train, y_train = train_set
X_test, y_test = test_set

rf = RandomForestClassifier(n_estimators=100, random_state=42)

# 학습
rf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
from sklearn.metrics import accuracy_score, confusion_matrix

# 예측
y_pred = rf.predict(X_test)

# 정확도
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

# 혼동 행렬
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9237472766884531
Confusion Matrix:
 [[315   3]
 [ 32 109]]


In [29]:
lr_clf = LogisticRegression()

lr_clf.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [30]:
# 예측
y_pred = lr_clf.predict(X_test)

# 정확도
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

# 혼동 행렬
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7603485838779956
Confusion Matrix:
 [[306  12]
 [ 98  43]]
