In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy import stats
from scipy.stats import shapiro
from scipy.stats.mstats import winsorize
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

In [None]:
df = pd.read_csv('./dataset/label지정.csv', encoding='CP949')                        ## 성휘 파일 경로
# df = pd.read_excel('../eeez/전처리파일/label지정.xlsx', index_col=0)                ## 재인 파일 경로
# df = pd.read_csv('../데이터자료/label지정.csv', encoding='CP949')                   ## 의영 파일 경로
# df = pd.read_excel('./csv/dateset2_label최종.xlsx', index_col=0)                   ## 경록 파일 경로
df

## Winsorizing

In [4]:
df_1 = df.copy()
df_1.drop(['회사명', '거래소코드', '회계년도_x', '상장일', '상장폐지일', '산업명', '소속코드','year', 'month'],axis=1,inplace=True)
for col in df_1.columns:
    df_1[col] = winsorize(df_1[col], limits= 0.01)

# Ridge

In [45]:

feature = df_1.drop(columns='label')
target = df_1['label']

#릿지 클래서
ridge = Ridge(alpha = 10)

neg_mse_scores = cross_val_score(ridge, feature, target, scoring="neg_mean_squared_error", cv=5)
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)
print(neg_mse_scores,rmse_scores,avg_rmse)

ridge_alphas =[1000,2000,3000,4000,5000]
for alpha in ridge_alphas :
    ridge = Ridge(alpha = alpha)
    neg_mse_scores = cross_val_score(ridge, feature, target, scoring="neg_mean_squared_error", cv=5)
    avg_rmse = np.mean(np.sqrt(-1*neg_mse_scores))
    print('alpha {0} 일 떄 5folds 의 평균 RMSE : {1:.10f}' .format(alpha,avg_rmse))

# fig, axs = plt.subplots(figsize = (18,6), nrows = 1, ncols=5)

coeff_df = pd.DataFrame()

for pos, alpha in enumerate(ridge_alphas):
    ridge = Ridge(alpha= alpha)
    ridge.fit(feature,target)
    coeff = pd.Series(data=ridge.coef_, index=feature.columns)
    colname = 'alpha:' + str(alpha)
    coeff_df[colname] = coeff

    coeff = coeff.sort_values(ascending=False)
#     axs[pos].set_title(colname)
#     axs[pos].set_xlim(-3,6)
#     sns.barplot(x=coeff.values, y=coeff.index, ax =axs[pos])

# plt.show()


[-0.01082428 -0.01583049 -0.01147362 -0.01493988 -0.01184755] [0.10403978 0.12581929 0.10711499 0.12222879 0.10884643] 0.11360985508773096
alpha 1000 일 떄 5folds 의 평균 RMSE : 0.1136160039
alpha 2000 일 떄 5folds 의 평균 RMSE : 0.1136430915
alpha 3000 일 떄 5folds 의 평균 RMSE : 0.1136578420
alpha 4000 일 떄 5folds 의 평균 RMSE : 0.1136674002
alpha 5000 일 떄 5folds 의 평균 RMSE : 0.1136740411


## Alpha 값 조정하면서 최적의 Alpha값 찾기

In [46]:
ridge_alphas =[1000,2000,3000,4000,5000]
sort_column = 'alpha:'+str(ridge_alphas[0])
coeff_df.sort_values(by=sort_column, ascending=False)

Unnamed: 0,alpha:1000,alpha:2000,alpha:3000,alpha:4000,alpha:5000
[제조]순운전자본회전률,0.006977861,0.004014931,0.002757899,0.002065798,0.001630485
[제조]총자본회전률,0.005419411,0.002789613,0.001781359,0.001260533,0.0009483412
[제조]비유동자산회전률,0.00182172,0.001709879,0.001558423,0.001417368,0.001293263
[제조]운전자본회전률,0.001546809,0.001201184,0.0009796866,0.0008266811,0.0007147191
[제조]유동자산회전률,0.000966433,0.0007523543,0.0004811121,0.0002892155,0.0001565565
[제조]매출채권회전률,0.0007203055,0.0007190718,0.0007181074,0.0007169244,0.000715556
[제조]유동자산구성비율,0.0006964391,0.0006344446,0.0005993786,0.0005766739,0.0005606557
[제조]총자본사업이익률,0.000428135,0.0004462992,0.0004536193,0.0004567984,0.0004579161
[제조]자기자본구성비율,0.0003568564,0.0003701757,0.000374549,0.0003759478,0.0003760159
[제조]1회전기간,0.0001486242,7.881659e-05,5.380568e-05,4.084241e-05,3.291937e-05


# Lasso

In [47]:
lasso = Lasso(alpha = 10)
lasso_alphas =[0,0.1,1,10,100]

neg_mse_scores = cross_val_score(lasso, feature, target, scoring="neg_mean_squared_error", cv=5)
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)
print(neg_mse_scores,rmse_scores,avg_rmse)

# lasso_alphas =[100,10000,10000]
for alpha in lasso_alphas :
    lasso = Lasso(alpha = alpha)
    neg_mse_scores = cross_val_score(lasso, feature, target, scoring="neg_mean_squared_error", cv=5)
    avg_rmse = np.mean(np.sqrt(-1*neg_mse_scores))
    print('alpha {0} 일 떄 5folds 의 평균 RMSE : {1:10f}' .format(alpha,avg_rmse))

# fig, axs = plt.subplots(figsize = (18,6), nrows = 1, ncols=5)

coeff_df = pd.DataFrame()

for pos, alpha in enumerate(lasso_alphas):
    lasso = Lasso(alpha = alpha)
    lasso.fit(feature,target)
    coeff = pd.Series(data=lasso.coef_, index=feature.columns)
    colname = 'alpha:' + str(alpha)
    coeff_df[colname] = coeff

    coeff = coeff.sort_values(ascending=False)

[-0.01123924 -0.01719316 -0.01216996 -0.01538518 -0.01266459] [0.10601528 0.13112268 0.11031756 0.124037   0.11253707] 0.1168059155628568


  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


alpha 0 일 떄 5folds 의 평균 RMSE :   0.113651
alpha 0.1 일 떄 5folds 의 평균 RMSE :   0.113772
alpha 1 일 떄 5folds 의 평균 RMSE :   0.115911
alpha 10 일 떄 5folds 의 평균 RMSE :   0.116806
alpha 100 일 떄 5folds 의 평균 RMSE :   0.116822


  lasso.fit(feature,target)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


## Alpha 값 조정하면서 최적의 Alpha값 찾기

In [48]:
lasso_alphas =[0,0.1,1,10,100]

sort_column = 'alpha:'+str(lasso_alphas[0])
coeff_df.sort_values(by=sort_column, ascending=False)


Unnamed: 0,alpha:0,alpha:0.1,alpha:1,alpha:10,alpha:100
[제조]총자본회전률,0.03144116,-0.0,-0.0,-0.0,-0.0
[제조]순운전자본회전률,0.02115825,-0.0,-0.0,-0.0,-0.0
[제조]1회전기간,0.01614939,0.0,0.0,0.0,0.0
[제조]운전자본회전률,0.001999664,0.0,0.0,0.0,0.0
[제조]비유동자산회전률,0.0009711364,-0.0,-0.0,-0.0,-0.0
[제조]유동자산구성비율,0.0007638032,-0.0,-0.0,-0.0,-0.0
[제조]매출채권회전률,0.0007263543,0.0,0.0,0.0,0.0
[제조]총자본사업이익률,0.0003477668,-0.0,-0.0,-0.0,-0.0
[제조]자기자본구성비율,0.0002814893,-0.0,-0.0,-0.0,-0.0
[제조]부채비율,0.0001728272,8.067027e-05,5.189188e-05,0.0,0.0


In [50]:

# a = coeff_df[coeff_df["alpha:0.25"]<-0.0000000001] 
# b = coeff_df[coeff_df["alpha:0.25"]>0.00000001]

# a.info()

In [None]:
# feature = pd.concat([a,b])
# feature = feature["alpha:0.25"]
# feature = pd.DataFrame(feature)
# feature

Unnamed: 0,alpha:0.25
[제조]비유동자산증가율,-1.101468e-05
[제조]재고자산증가율,-1.642654e-05
[제조]매출액순이익률,-0.0002140773
[제조]자기자본순이익률,-0.000147727
[제조]1주당매출액,-6.825699e-08
[제조]유동비율,-5.829955e-06
[제조]자기자본배율,-2.152229e-06
[제조]부가가치,-1.20822e-07
[제조]설비투자효율,-1.39324e-06
[제조]기계투자효율,-4.364865e-09


# ElasticNet

In [20]:
from sklearn.linear_model import ElasticNet
elasticNet = ElasticNet(alpha = 10)
alphas =[0.07,0.1,0.5,1,3]


# fig, axs = plt.subplots(figsize = (18,6), nrows = 1, ncols=5)

coeff_df = pd.DataFrame()

for pos, alpha in enumerate(alphas):
    
    elasticNet = ElasticNet(alpha = alpha, l1_ratio=0.07)
    elasticNet.fit(feature,target)
    coeff = pd.Series(data=elasticNet.coef_, index=feature.columns)
    colname = 'alpha:' + str(alpha)
    coeff_df[colname] = coeff

    coeff = coeff.sort_values(ascending=False)

## Alpha 값 조정하면서 최적의 Alpha값 찾기

In [49]:
elasticNet_alphas =[0,0.1,1,10,100]

sort_column = 'alpha:'+str(elasticNet_alphas[0])
coeff_df.sort_values(by=sort_column, ascending=False)


Unnamed: 0,alpha:0,alpha:0.1,alpha:1,alpha:10,alpha:100
[제조]총자본회전률,0.03144116,-0.0,-0.0,-0.0,-0.0
[제조]순운전자본회전률,0.02115825,-0.0,-0.0,-0.0,-0.0
[제조]1회전기간,0.01614939,0.0,0.0,0.0,0.0
[제조]운전자본회전률,0.001999664,0.0,0.0,0.0,0.0
[제조]비유동자산회전률,0.0009711364,-0.0,-0.0,-0.0,-0.0
[제조]유동자산구성비율,0.0007638032,-0.0,-0.0,-0.0,-0.0
[제조]매출채권회전률,0.0007263543,0.0,0.0,0.0,0.0
[제조]총자본사업이익률,0.0003477668,-0.0,-0.0,-0.0,-0.0
[제조]자기자본구성비율,0.0002814893,-0.0,-0.0,-0.0,-0.0
[제조]부채비율,0.0001728272,8.067027e-05,5.189188e-05,0.0,0.0


In [None]:
# ridge_alphas =[0,0.1,0.2,0.3,0.4,0.5]
# sort_column = 'alpha:'+str(ridge_alphas)
# coeff_df.sort_values(by=sort_column, ascending=False)

In [None]:
# import matplotlib.pyplot as plt
# import numpy as np              
# from sklearn.metrics import mean_squared_error

# plt.scatter(Y_train,Y_pred_train, label = '(실제값, 예측값)', alpha=.5)
# plt.plot(np.linspace(0,20,200), np.linspace(0,50,200), color = 'green', label = "45°(실제값 = 예측값)")
# plt.xlabel("실제 Price: $Y_i$")
# plt.ylabel("예측 Price : $\hat{Y}_i$")
# plt.legend()
# plt.title("예측된 Price of train set vs 실제 Price ($Y_i$ vs $\hat{Y}_i$), " + f"alpha={alpha_value}")
# plt.text(y=0,x=35, s=f'MSE 값 : {mean_squared_error(Y_train,Y_pred_train) : .4f}')
# plt.show()

In [None]:
# import matplotlib.pyplot as plt
# import numpy as np
# from sklearn.metrics import mean_squared_error

# plt.scatter(Y_test,Y_pred, label = '(실제값, 예측값)', alpha=1)
# plt.plot(np.linspace(0,50,200), np.linspace(0,50,200), color = 'green', label = "45°(실제값 = 예측값)")
# plt.xlabel("실제 Price: $Y_i$")
# plt.ylabel("예측 Price: $\hat{Y}_i$")
# plt.legend()
# plt.title("예측 Price of test set vs 실제 Price ($Y_i$ vs $\hat{Y}_i$), " + f"alpha={alpha_value}")
# plt.text(y=0,x=35, s=f'MSE 값 : {mean_squared_error(Y_test,Y_pred) : .4f}')
# plt.show()

In [None]:
# # 잔차 분석
# resid = Y_pred_train - Y_train

# plt.scatter(Y_pred_train, resid, c='b')
# plt.hlines(y=0, xmin= -10, xmax=50)
# plt.title('Residual plot')
# plt.show()