## Model Learning
- bagging 방법을 이용한 Random Forest
- Boosting 방법을 이용

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

#UnderSampling library
from imblearn.under_sampling import RandomUnderSampler, TomekLinks

import pandas as pd
import numpy as np

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
plt.rcParams["figure.figsize"] = (8, 6)

In [17]:
header_list = ["age","workclass","fnlwgt","education","education_num","marital-status","occupation",
               "relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country",
               "income"] #15 columns (14 features, 1 target).
dataset = pd.read_csv("C:/Users/ksmin/ds_teamproject/Dataset/Adult Data.csv", names=header_list)

In [18]:
#y값을 변형하는 과정
dataset['income'] = dataset['income'].map(lambda x: 1 if x==' >50K' else 0)

In [6]:
#Random UnderSampling
X = dataset.iloc[:,0:-1]
y = dataset['income']
rus = RandomUnderSampler(random_state = 123)
X_res, y_res = rus.fit_resample(X, y)

print(X_res.shape)
print(y_res.shape)

dataset = pd.concat([X_res,y_res], axis=1)

(15682, 14)
(15682,)


In [24]:
# 특성과 타겟 분리
X = dataset.drop('income', axis=1)
y = dataset['income']

#LabelEncoder를 진행. 가중치가 없는 것으로 이용
String_index = []
for i, col in enumerate (X.columns):
    if X[col].dtype==object:
        String_index.append(col)

encoder = LabelEncoder()
for col in String_index:
    X[col] = encoder.fit_transform(X[col])

print(X.head())

dataset = pd.concat([X,y], axis=1)

   age  workclass  fnlwgt  education  education_num  marital-status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  sex  capital-gain  capital-loss  \
0           1             1     4    1          2174             0   
1           4             0     4    1             0             0   
2           6             1     4    1             0             0   
3           6             0     2    1             0             0   
4          10             5     2    0             0             0   

   hours-per-week  native-country  
0              40              39  
1              13              39  
2              40              39  
3       

In [None]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=dataset.income)

# 특성 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
#train 데이터를 살펴보자
train, test = train_test_split(dataset, test_size=0.2, random_state=2024)

print(train.shape)

(12545, 15)


In [10]:
# BaggingClassifier 모델 학습
bagging_model = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    random_state=42
)
bagging_model.fit(X_train, y_train)

# 예측 및 평가
y_pred_bagging = bagging_model.predict(X_test)
print("Bagging Classifier Accuracy:", accuracy_score(y_test, y_pred_bagging))
print("Bagging Classifier Classification Report:\n", classification_report(y_test, y_pred_bagging))

Bagging Classifier Accuracy: 0.8100095632770162
Bagging Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.80      0.81      1569
           1       0.80      0.82      0.81      1568

    accuracy                           0.81      3137
   macro avg       0.81      0.81      0.81      3137
weighted avg       0.81      0.81      0.81      3137



In [11]:
# AdaBoostClassifier 모델 학습
boosting_model = AdaBoostClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    random_state=42
)
boosting_model.fit(X_train, y_train)

# 예측 및 평가
y_pred_boosting = boosting_model.predict(X_test)
print("AdaBoost Classifier Accuracy:", accuracy_score(y_test, y_pred_boosting))
print("AdaBoost Classifier Classification Report:\n", classification_report(y_test, y_pred_boosting))

AdaBoost Classifier Accuracy: 0.7688874721071087
AdaBoost Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.77      0.77      1569
           1       0.77      0.77      0.77      1568

    accuracy                           0.77      3137
   macro avg       0.77      0.77      0.77      3137
weighted avg       0.77      0.77      0.77      3137





In [12]:
# 랜덤포레스트 모델 학습
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# 예측
y_pred = rf_model.predict(X_test)

# 평가
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.8198916161938158
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.81      0.82      1569
           1       0.81      0.83      0.82      1568

    accuracy                           0.82      3137
   macro avg       0.82      0.82      0.82      3137
weighted avg       0.82      0.82      0.82      3137



In [13]:
# XGBoost 모델 학습
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# 예측
y_pred = xgb_model.predict(X_test)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.8371055148230794
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.80      0.83      1569
           1       0.82      0.87      0.84      1568

    accuracy                           0.84      3137
   macro avg       0.84      0.84      0.84      3137
weighted avg       0.84      0.84      0.84      3137



In [14]:
# 로지스틱 회귀 모델 학습
lr_model = LogisticRegression(max_iter=1000)  # max_iter는 수렴까지의 반복 횟수, 필요에 따라 조정 가능
lr_model.fit(X_train_scaled, y_train)

# 예측
y_pred = lr_model.predict(X_test_scaled)

# 평가
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.7590054191903092
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.77      0.76      1569
           1       0.76      0.75      0.76      1568

    accuracy                           0.76      3137
   macro avg       0.76      0.76      0.76      3137
weighted avg       0.76      0.76      0.76      3137



In [15]:
# Decision Tree 모델 학습
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# 예측
y_pred = dt_model.predict(X_test)

# 평가
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.771437679311444
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.77      0.77      1569
           1       0.77      0.78      0.77      1568

    accuracy                           0.77      3137
   macro avg       0.77      0.77      0.77      3137
weighted avg       0.77      0.77      0.77      3137

