In [68]:
import numpy as np
import pandas as pd
from konlpy.tag import Mecab
from tqdm import tqdm
from collections import Counter
import itertools # 2차원 배열 1차원 배열로
from sklearn.feature_extraction.text import TfidfVectorizer
import os #폴더 이름 가져오기

class Preprocessor:
    def __init__(self, filepath, stopwords_file):
        self.filepath = filepath
        self.stopwords_file = stopwords_file
        self.stopwords = self.load_stopwords()
        self.file_list = self.choice_file()
        self.files = self.load_files()

    def load_stopwords(self):
        with open(self.stopwords_file, encoding='utf-8') as f:
            stopwords = [line.strip() for line in f]
        return stopwords

    def choice_file(self):
        file_list = os.listdir(self.filepath)
        cnt=0
        for i in file_list:
            i=self.filepath+"/"+i
            file_list[cnt]=i
            cnt+=1
        return file_list

    def load_files(self):
        files=[]
        for i in self.file_list:
            try:
                df=pd.read_csv(i,encoding="UTF-8") # 크롤링한 파일 가져오기
                files.append(df)
            except:
                pass #.ipynb_checkpoints 파일때문
        return self.delete(files)

    def delete(self, files):
        result=[]
        for df in files:
            df=df.replace("- dc official App","") # 글 내용없을때 들어가는 "- dc official App" 대체
            df=df.fillna('') # 결측치 제거
            df['data']=df['title']+ " " +df['content'] # 제목+내용을 하나로 합쳐서 'data'로 저장 
            del df['title'],df['content'],df['no'] # 제목,내용,글번호 삭제
            data=df['data']
            result.append(df)
        return result

    def data_processing(self):
        for file in self.files:
            file['data']=file['data'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","") # 정규화
            file['data']=file['data'].str.replace('^ +', "") # 공백은 empty값으로 변경
            file['data'].replace('', np.nan, inplace=True) # 공백은 Null 값으로 변경
            file['data'].dropna(how='any',inplace=True) # Null 값 제거

    def data_processing_stopword(self):
        m=Mecab()
        result=[]
        print("모든 파일 불용어 제거중...")
        for file in self.files:
            rs=[]
            for i in tqdm(file['data']):
                tokenized_sentence =m.morphs(str(i))
                stopwords_removed_sentence=[word for word in tokenized_sentence if not word in self.stopwords]
                rs.append(stopwords_removed_sentence)
            result.append(rs)
        return self.remove_one_word(result)

    def remove_one_word(self, result):
        one_word_remove=[]
        for res in result:
            temp=[]
            for word in res:
                tmp=[]
                for i in word:
                    if len(i)>1:
                        tmp.append(i)
                temp.append(tmp)
            one_word_remove.append(temp)
        print("한 글자 제외 완료")
        return self.filter_pos(one_word_remove)
    
    def filter_pos(self, one_word_remove):
        print("Mecab 실행중...")
        result=[]
        m=Mecab()
        for onefile in tqdm(one_word_remove):
            rs=[]
            for word in onefile:
                tmp=[]
                for i in word:
                    if m.pos(i)[0][1]=='NNG' or m.pos(i)[0][1]=='NNB' or m.pos(i)[0][1]=='NNBC' or m.pos(i)[0][1]=='NR' or m.pos(i)[0][1]=='NP' or m.pos(i)[0][1]=='VX' or m.pos(i)[0][1]=='VCP' or m.pos(i)[0][1]=='VCN' or m.pos(i)[0][1]=='MAJ'or m.pos(i)[0][1]=='VV' or m.pos(i)[0][1]=='VA' or m.pos(i)[0][1]=='MMG':
                        tmp.append(m.pos(i)[0][0])
                rs.append(tmp)
            result.append(rs)
        return result

In [69]:
import numpy as np

# 데이터가 저장된 폴더 경로
filepath = '/Users/home/jp/1일전일봉고점'

stopwords_file=('/Users/home/jp/stopwords.txt')

# 클래스 인스턴스화
preprocessor = Preprocessor(filepath, stopwords_file)

# 데이터 전처리
preprocessor.data_processing()

# 결과
result=preprocessor.data_processing_stopword()



  file['data']=file['data'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","") # 정규화
  file['data']=file['data'].str.replace('^ +', "") # 공백은 empty값으로 변경


불용어 제거중...


100%|████████████████████████████████████| 5691/5691 [00:00<00:00, 16754.11it/s]
100%|████████████████████████████████████| 4796/4796 [00:00<00:00, 20116.77it/s]
100%|████████████████████████████████████| 2760/2760 [00:00<00:00, 18634.03it/s]
100%|██████████████████████████████████████| 994/994 [00:00<00:00, 15893.51it/s]
100%|████████████████████████████████████| 5044/5044 [00:00<00:00, 14949.05it/s]
100%|████████████████████████████████████| 4737/4737 [00:00<00:00, 17475.44it/s]
100%|████████████████████████████████████| 4882/4882 [00:00<00:00, 17116.32it/s]
100%|██████████████████████████████████| 10535/10535 [00:00<00:00, 18235.22it/s]
100%|████████████████████████████████████| 2794/2794 [00:00<00:00, 18562.95it/s]
100%|████████████████████████████████████| 1505/1505 [00:00<00:00, 14644.75it/s]
100%|████████████████████████████████████| 2609/2609 [00:00<00:00, 17527.60it/s]


한 글자 제외 완료
Mecab 실행중...


100%|███████████████████████████████████████████| 11/11 [00:09<00:00,  1.11it/s]


In [None]:
result

[[['무섭'],
  ['보너스', '보너스', '올리', '근데', '차트', '미치'],
  [],
  ['발표', '어치', '여여'],
  ['천만'],
  ['오늘', '급등', '대장', '누구'],
  [],
  ['시발'],
  ['아무'],
  [],
  ['기회'],
  ['메타', '자연재해'],
  ['조금', '슈팅', '손절'],
  [],
  ['속보', '음전자', '망토', '구입'],
  ['시아', '시아', '준수', '탑승'],
  ['오늘'],
  [],
  ['멸망'],
  ['출발'],
  ['시바', '발표'],
  ['척하'],
  ['발아'],
  ['제발'],
  ['블록'],
  ['다음', '기후', '코인', '다음', '보'],
  ['상장', '상장', '지갑', '연동', '상장'],
  ['썸씽'],
  ['양반', '출발', '부자'],
  ['블록'],
  ['코어', '운지'],
  ['제단', '실시간', '발표', '영상'],
  ['썸씽'],
  ['제발'],
  ['개자'],
  ['정조대', '무엇', '절대', '뚫리', '강철', '정조대'],
  ['세력', '코인', '이동', '폭락'],
  ['버러지', '코인'],
  ['폭풍', '저그'],
  ['엑스', '기사', '고마', '역시', '내촉', '근데'],
  ['블록', '고점', '이거'],
  ['쿼크', '기후', '변화', '지리', '다음'],
  ['시즌'],
  [],
  ['경주마', '새끼', '필독', '나락'],
  ['쿼크'],
  ['블록', '고점'],
  [],
  [],
  ['방구', '손절', '가망'],
  ['파워', '대가리', '뜨겁'],
  ['개추'],
  ['사라', '개저', '선동'],
  ['세력', '악물', '개미', '털기', '발표', '나기', '이렇'],
  [],
  ['블록'],
  ['쿼크', '어제', '계속', '가지'],
  ['고맙', '덕분