In [3]:
import pandas as pd
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer


korean_air_data = pd.read_csv("textmining/korean_air_data/\
korean_air_2020_01.csv")

korean_air_data 

korean_air_data.columns = ["a","title","text","editor","times","date"]

del korean_air_data["a"]

# na값 제거
data = korean_air_data.dropna()

# 불용어 지정
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','을'\
             ,'것','를','등','으로','자','에','등','와','한','하다''것'\
             ,'\n','로']

# 한글 외의 모든 데이터 제거 : 기업명이 영어로 되어있는 경우를 어떻게 처리 해야할까?
data = data.text.str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")

from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer

okt = Okt()
X_train = []

for sen in data :
    temp_X = []
    
    # 토큰화
    temp_X = okt.morphs(sen, stem = True) 
    
    # 불용어 제거
    temp_X = [word for word in temp_X if not word in stopwords] 
    
    X_train.append(temp_X)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

threshold = 3
# 단어의 수
total_cnt = len(tokenizer.word_index) 
# 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
rare_cnt = 0 
# 훈련 데이터의 전체 단어 빈도수 총 합
total_freq = 0
# 등장 빈도수가 threshold보다 작은 단어의 등장 빈도수의 총 합
rare_freq = 0 

# 단어와 빈도수의 쌍(pair)을 key와 value로 받는다.
for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value

    # 단어의 등장 빈도수가 threshold보다 작으면
    if(value < threshold):
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:",\
      (rare_freq / total_freq)*100)

# 전체 단어 개수 중 빈도수 2이하인 단어 개수는 제거.
vocab_size = total_cnt - rare_cnt
print('단어 집합의 크기 :',vocab_size)
    

단어 집합(vocabulary)의 크기 : 8648
등장 빈도가 2번 이하인 희귀 단어의 수: 4170
단어 집합에서 희귀 단어의 비율: 48.21924144310824
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 3.497673817523908
단어 집합의 크기 : 4478


In [3]:
nouns = []
str = ''

for k in range(0,len(X_train)):
    for i in range(0,len(X_train[k])):
        str = str + X_train[k][i] + ' '
    nouns.append(str)

In [4]:
tfidf_df = pd.DataFrame()

tfIdfVectorizer=TfidfVectorizer(use_idf=True)
tfidf = tfIdfVectorizer.fit_transform(nouns)

for i in range(0,len(X_train)):
    df = pd.DataFrame(tfidf[i].T.toarray(), \
                      index=tfIdfVectorizer.get_feature_names(),\
                      columns=["text{}".format(i)])
    tfidf_df = pd.concat([tfidf_df,df],axis = 1)

In [None]:
nouns = []
str = ''

for k in range(0,len(X_train)):
    for i in range(0,len(X_train_pre[k])):
        str = str + X_train_pre[k][i] + ' '
    nouns.append(str)

In [None]:
tfidf_df = pd.DataFrame()

tfIdfVectorizer = TfidfVectorizer(use_idf=True)
tfidf = tfIdfVectorizer.fit_transform(nouns)

for i in range(0,len(X_train)):
    df = pd.DataFrame(tfidf[i].T.toarray(),\
                      index=tfIdfVectorizer.get_feature_names(),\
                      columns=["text{}".format(i)])
    tfidf_df = pd.concat([tfidf_df,df],axis = 1)

In [7]:
tfidf_df

Unnamed: 0,text0,text1,text2,text3,text4,text5,text6,text7,text8,text9,...,text422,text423,text424,text425,text426,text427,text428,text429,text430,text431
가가,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.002120,0.002115,0.002109,0.002107,0.002104,0.002098,0.002092,0.002090,0.002074,0.002071
가게,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000145,0.000145,0.000144,0.000144,0.000144,0.000144,0.000143,0.000143,0.000142,0.000142
가격,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.004429,0.003859,0.003508,0.002946,...,0.008832,0.008809,0.008788,0.008778,0.008764,0.008740,0.008717,0.008708,0.008640,0.008628
가공,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000465,0.000464,0.000463,0.000463,0.000462,0.000461,0.000459,0.000459,0.000455,0.000455
가구,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.001095,0.001092,0.001089,0.001088,0.001086,0.001083,0.001081,0.001079,0.001071,0.001070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
힐링,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000354,0.000353,0.000353,0.000352,0.000352,0.000351,0.000350,0.000349,0.000347,0.000346
힘겹다,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000329,0.000328,0.000327,0.000327,0.000326,0.000325,0.000325,0.000324,0.000322,0.000321
힘드다,0.000000,0.000000,0.00000,0.000000,0.000000,0.005538,0.004419,0.003850,0.003500,0.002939,...,0.000953,0.000950,0.000948,0.000947,0.000945,0.000943,0.000940,0.000939,0.000932,0.000931
힘들다,0.024992,0.016402,0.02454,0.018793,0.015784,0.010948,0.008736,0.007611,0.006920,0.008716,...,0.001177,0.001174,0.001171,0.001170,0.001168,0.001165,0.001162,0.001161,0.001151,0.001150


In [5]:
X_train_tfidf = []

for i in range(0,len(X_train)):
    idx = tfidf_df['text{}'.format(i)] > 0.01
    result = tfidf_df[idx].index.tolist()
    X_train_tfidf.append(result)

len(X_train_tfidf)

432

In [15]:
tfidf_df.to_excel(r'korean_air_01_tfidf.xlsx',encoding='UTF8')