In [79]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_absolute_error
# import seaborn as sns 


In [80]:
# Load the dataset; the first CSV column becomes the row index and a
# UTF-8 byte-order mark, if present, is stripped by the 'utf-8-sig' codec.
df = pd.read_csv(
    'data.csv',
    index_col=0,
    encoding='utf-8-sig',
)

In [81]:
# Plain-text listing of every column's dtype.
print(df.dtypes)
# Summary statistics; kept as the last expression so the notebook renders the table.
df.describe()

region            int64
education         int64
jobtype           int64
cptype            int64
sales           float64
employees       float64
aversalary      float64
capital         float64
pros_encoded    float64
dtype: object


Unnamed: 0,region,education,jobtype,cptype,sales,employees,aversalary,capital,pros_encoded
count,3015.0,3015.0,3015.0,3015.0,2253.0,2482.0,2149.0,2303.0,3015.0
mean,0.764842,0.836153,0.482587,0.670315,2.887175,1.121039,0.154087,24.992462,0.153018
std,0.808932,1.170736,0.49978,0.817098,7.976932,3.190629,0.750271,162.426437,0.609906
min,0.0,0.0,0.0,0.0,-0.1302,-0.2092,-1.7826,-0.113,-0.5
25%,0.0,0.0,0.0,0.0,-0.1252,-0.1876,-0.3043,-0.10655,-0.35
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,2.0,1.0,1.0,0.8748,0.812475,0.6957,0.8935,0.65
max,2.0,5.0,1.0,2.0,95.8569,24.7907,2.3913,5966.2584,2.8


### VIF 확인 & Lasso 모델을 통한 feature selection

In [82]:
# Handle NaNs: keep only rows where every column is present, so the Lasso fit
# and the VIF regressions below all see complete rows.
df_droppped_na = df.dropna()
# Column 0 is the target ('region'); every remaining column is a candidate feature.
X = df_droppped_na.iloc[:, 1:]
y = df_droppped_na['region']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=747)

# L1-regularised linear fit; features whose coefficient comes out as 0 are discarded.
# NOTE(review): the features are not standardised before fitting, and the L1
# penalty acts on raw coefficient sizes, so which features survive depends on
# each column's scale -- confirm this is intended.
linear = Lasso(alpha=0.2)
linear.fit(X_train, y_train)

# feature selection -> sales, employees, capital (the non-zero coefficients)
print(linear.coef_)

# scikit-learn metrics take their arguments as (y_true, y_pred).
print("MAE (train set): ", np.round(mean_absolute_error(y_train, linear.predict(X_train)), 4))
print("MAE: ", np.round(mean_absolute_error(y_test, linear.predict(X_test)), 4))

# One VIF per feature column. The loop bound is taken from the matrix that is
# actually passed in, not from the unfiltered `df`, so the two cannot drift apart.
# NOTE(review): no intercept column is added to the design matrix, so these are
# VIFs without a constant term -- consider sm.add_constant if centred VIFs are wanted.
vif_exog = X_train.to_numpy()
for feature in range(vif_exog.shape[1]):
    print(np.round(variance_inflation_factor(vif_exog, feature), 4))

[0.         0.         0.         0.00197754 0.01219089 0.
 0.00697038 0.        ]
MAE (train set):  0.7305
MAE:  0.7509
1.7492
1.6546
3.0912
3.7792
3.7302
1.5755
2.4854
2.286


In [105]:
# Baseline classifier: predict 'region' from every other column,
# using only the rows that have no missing values.
df_droppped_na = df.dropna()

# Class balance of the rows that survived dropna().
print(df_droppped_na['region'].value_counts())

y = df_droppped_na['region']
X = df_droppped_na.iloc[:, 1:]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=747,
    test_size=0.2,
)

# Depth-1 trees (decision stumps) boosted for 100 rounds.
clf = GradientBoostingClassifier(
    random_state=505,
    max_depth=1,
    learning_rate=0.2,
    n_estimators=100,
)
clf.fit(X_train, y_train)

# Mean accuracy on the held-out 20 %.
print(np.round(clf.score(X_test, y_test), 4))

region
0    780
1    440
2    432
Name: count, dtype: int64
0.4773


### Training with only the selected features (sales, employees, capital)

In [104]:
# Same classifier as before, restricted to the target plus the three selected features.
# NOTE(review): dropna() is applied to these four columns only, so the set of
# rows (and hence the train/test split) is not the same as in the all-feature
# cell -- the two accuracy figures are not measured on identical samples.
df_reduced = df[['region', 'sales', 'employees', 'capital']].dropna()

# Class balance of the rows that survived dropna().
print(df_reduced['region'].value_counts())

y = df_reduced['region']
X = df_reduced.iloc[:, 1:]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=747,
    test_size=0.2,
)

# Depth-1 trees (decision stumps) boosted for 100 rounds.
clf = GradientBoostingClassifier(
    random_state=505,
    max_depth=1,
    learning_rate=0.2,
    n_estimators=100,
)
clf.fit(X_train, y_train)

# score() reports mean accuracy on the supplied test data and labels.
print(np.round(clf.score(X_test, y_test), 4))
# Last expression: show the full hyper-parameter set of the fitted model.
clf.get_params()

region
0    964
1    531
2    513
Name: count, dtype: int64
0.5224


{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.2,
 'loss': 'log_loss',
 'max_depth': 1,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 505,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

### 할 일
- 데이터 불균형을 해결한 뒤 다시 학습하고 점수 확인 -> `sklearn.utils.random.sample_without_replacement`
- NaN 값을 그대로 둔 채 xgboost로 학습하고 점수 확인
- 피쳐 스케일을 최대한 비슷하게 맞춘 다음 클러스터링하기 -> 로그 함수 적용 (음수 값이 있는 피쳐는 로그를 적용하기 전에 값을 이동시켜야 함)