## Load data using Pandas 

In [1]:
import pandas as pd

In [16]:
# Training data that carries a sentiment (positive/negative) label
# header=0  : the first line of the file holds the column names (id, sentiment, review)
# quoting=3 : csv.QUOTE_NONE - double quotes get no special treatment and stay in the values as ordinary characters
train = pd.read_csv('data/labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

In [4]:
test = pd.read_csv('data/testData.tsv', header=0, delimiter='\t', quoting=3)

## Check data information

데이터 속성(개수, null값, 타입) 확인

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
id        25000 non-null object
review    25000 non-null object
dtypes: object(2)
memory usage: 390.7+ KB


데이터 값 확인

In [17]:
train.head(5)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [9]:
test.head(3)

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."


Shape

In [12]:
print('train shape : {} \ntest shape : {}'.format(train.shape, test.shape))

train shape : (25000, 3) 
test shape : (25000, 2)


데이터 column 확인

In [18]:
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [19]:
test.columns.values

array(['id', 'review'], dtype=object)

In [21]:
# By default describe() summarizes only the numeric columns (here: sentiment)
train.describe()

Unnamed: 0,sentiment
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [23]:
train['review'][0][:1000]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

## Data Cleaning and Text Preprocessing
- 기계가 이해할 수 있도록 텍스트를 정제
- BeautifulSoup : html 태그 제거
- Regular Expression : 알파벳 이외의 문자를 공백으로 치환 
- NLTK 데이터를 사용하여 불용어(Stopword)제거
    * 불용어 : 단어 출현횟수는 높지만, 별로 의미가 없는 것 the, i, is 
- 어간 추출(stemming)과 표제어 추출(lemmatization) 개념 이해 및 SnowballStemmer를 통한 어간 추출
- 한국어
![image](https://user-images.githubusercontent.com/33097467/45873687-033c5b80-bdce-11e8-8ced-0e77e5733951.png)

Html Tag 제거

In [24]:
from bs4 import BeautifulSoup as bs

In [26]:
example1 = bs(train['review'][0], "html5lib")
print(train['review'][0][:700])
example1.get_text()[:700]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely lik


'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyw'

정규표현식을 이용하여 특수문자 제거

In [27]:
import re
letters_only = re.sub('[^a-zA-Z]', ' ', example1.get_text())

In [30]:
letters_only[:1000]

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    m

소문자 변환
- 같은 단어라도 대문자/소문자에 따라 서로 다른 단어로 인식될 수 있으므로 모두 소문자로 통일한다

In [31]:
lower_cas = letters_only.lower()

토큰화

In [49]:
words = lower_cas.split()

In [50]:
print(len(words))
print(words[:100], end=' ')

437
['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again', 'maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent', 'moonwalker', 'is', 'part', 'biography', 'part', 'feature', 'film', 'which', 'i', 'remember', 'going', 'to', 'see', 'at', 'the', 'cinema', 'when', 'it', 'was', 'originally', 'released', 'some', 'of', 'it', 'has', 'subtle', 'messages', 'about', 'mj', 's', 'feeling', 'towards', 'the', 'press'] 

불용어 제거(Stopword Removal)
- NLTK의 stopwords 사용
- 일반적으로 Corpus에 자주 나타나는 단어는 모델로서 학습이나 예측 프로세스에 기여하지 않는다 <br>
the, this, are, is 등의 단어는 빈번하게 등장하지만 실제 의미를 찾는데 큰 기여를 하지 않는다 <br>
NLTK에는 영어 불용어 목록이 미리 정의되어 있다(작성 당시 기준 153개이며, NLTK 버전에 따라 개수가 다를 수 있다). 17개의 언어에 대해 정의되어 있고 한국어는 없다 :(

In [36]:
import nltk
from nltk.corpus import stopwords
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [51]:
print('원래 tokenizing 개수 : {}'.format(len(words)))
# Build the stopword set once. Calling stopwords.words('english') inside the
# comprehension re-evaluated it for every token, and membership tests against
# a list are linear, whereas a set lookup is constant time.
english_stops = set(stopwords.words('english'))
words = [w for w in words if w not in english_stops]
print('stopword를 적용하고 난 후의 tokenizing 개수 : {}'.format(len(words)))

원래 tokenizing 개수 : 437
stopword를 적용하고 난 후의 tokenizing 개수 : 219


In [52]:
print(words[:100], end=' ')

['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mj', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'kay', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psychopathic', 'powerful'] 

어간 추출(Stemming)
- NLTK : PorterStemmer(보수적), LancasterStemmer(적극적)
- 어형이 변형된 단어로부터 접사 등을 제거하고 그 단어의 어간을 분리해 내는 것을 의미
- cats, catty, catlike --> cat
- walks, walking, walked --> walk

PorterStemmer

In [43]:
stemmer = nltk.stem.PorterStemmer()
print(stemmer.stem('maximum'))
print('Porter - running : {}'.format(stemmer.stem('running')))
print('Porter - runs : {}'.format(stemmer.stem('runs')))
print('Porter - run : {}'.format(stemmer.stem('run')))

maximum
Porter - running : run
Porter - runs : run
Porter - run : run


LancasterStemmer

In [42]:
stemmer = nltk.stem.LancasterStemmer()
print(stemmer.stem('maximum'))
print('Lancaster - running : {}'.format(stemmer.stem('running')))
print('Lancaster - runs : {}'.format(stemmer.stem('runs')))
print('Lancaster - run : {}'.format(stemmer.stem('run')))

maxim
Lancaster - running : run
Lancaster - runs : run
Lancaster - run : run


In [53]:
words[:10]

['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary']

In [54]:
# Stem every remaining token with NLTK's English SnowballStemmer
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

words_stemmer = [stemmer.stem(w) for w in words]
words_stemmer[:10]

['stuff',
 'go',
 'moment',
 'mj',
 'start',
 'listen',
 'music',
 'watch',
 'odd',
 'documentari']

표제어 추출(Lemmatization)
- 굴절된 형태의 단어들을 표제어(lemma, 사전에 실리는 기본형)로 묶어 하나의 항목으로 분석할 수 있도록 하는 과정
- 앞뒤 문맥을 보고 단어의 의미를 식별
- meeting : 회의 / meet : 만나다 --> 명사/동사 인지에 따라 적합한 의미를 갖도록

In [48]:
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

print(lemma.lemmatize('fly'))
print(lemma.lemmatize('flies'))

fly
fly


In [55]:
words[:10]

['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary']

In [56]:
words_lemma = [lemma.lemmatize(w) for w in words]
words_lemma[:10]

['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary']

### 문자열 처리 함수

In [60]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, built once and reused."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _snowball_stemmer():
    """Return one shared English SnowballStemmer instance."""
    return SnowballStemmer('english')


def review_to_words(raw_review):
    """Turn one raw review into a cleaned, space-separated string of stemmed tokens.

    The function no longer reads the notebook-level ``stemmer`` variable, which
    other cells rebind to Porter/Lancaster stemmers; it always uses the English
    Snowball stemmer, and the stopword set is built only once per process
    instead of once per review.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML markup.

    Returns
    -------
    str
        Lowercased tokens with non-letters and English stopwords removed,
        each stemmed, joined by single spaces.
    """
    # 1. Strip HTML tags
    review_text = bs(raw_review, 'html.parser').get_text()
    # 2. Regex - replace every non-letter character with a space
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    # 3. Lowercase and tokenize on whitespace
    words = letters_only.lower().split()
    # 4. Drop stopwords (set membership is constant time)
    stops = _english_stopwords()
    meaningful_words = [w for w in words if w not in stops]
    # 5. Stem each remaining token
    stem = _snowball_stemmer().stem
    stemming_words = [stem(w) for w in meaningful_words]
    # 6. Join the tokens back into one space-separated string
    return ' '.join(stemming_words)

In [61]:
clean_review = review_to_words(train['review'][0])
clean_review

'stuff go moment mj start listen music watch odd documentari watch wiz watch moonwalk mayb want get certain insight guy thought realli cool eighti mayb make mind whether guilti innoc moonwalk part biographi part featur film rememb go see cinema origin releas subtl messag mj feel toward press also obvious messag drug bad kay visual impress cours michael jackson unless remot like mj anyway go hate find bore may call mj egotist consent make movi mj fan would say made fan true realli nice actual featur film bit final start minut exclud smooth crimin sequenc joe pesci convinc psychopath power drug lord want mj dead bad beyond mj overheard plan nah joe pesci charact rant want peopl know suppli drug etc dunno mayb hate mj music lot cool thing like mj turn car robot whole speed demon sequenc also director must patienc saint came film kiddi bad sequenc usual director hate work one kid let alon whole bunch perform complex danc scene bottom line movi peopl like mj one level anoth think peopl stay

## Running the code on the entire training set

In [64]:
num_reviews = len(train)
print(num_reviews)

25000


In [65]:
clean_train_reviews = []
# Iterate over the review values directly. train['review'][i] is a label-based
# lookup repeated for every row and only works when the index is exactly 0..n-1.
for count, raw_review in enumerate(train['review'], start=1):
    clean_train_reviews.append(review_to_words(raw_review))
    # Report progress every 5000 reviews, after the review has been processed.
    if count % 5000 == 0:
        print('{}개 문자열 처리 완료'.format(count))

5000개 문자열 처리 완료
10000개 문자열 처리 완료
15000개 문자열 처리 완료
20000개 문자열 처리 완료
25000개 문자열 처리 완료


In [66]:
apply_clean_review = []
%time apply_clean_review = train['review'].apply(review_to_words)

Wall time: 1h 8min 43s


## Multiprocessing

In [None]:
from multiprocessing import Pool
import numpy as np

def _apply_df(args):
    df, func, kwargs = args
    return df.apply(func, **kwargs)

def apply_by_multiprocessing(df, func, **kwargs):
    """Apply ``func`` to ``df`` in parallel, one chunk per worker process.

    Parameters
    ----------
    df : pandas Series or DataFrame
        Data to process; it is split row-wise into ``workers`` chunks.
    func : callable
        Passed to ``apply`` on every chunk. It is sent to the worker
        processes by ``pool.map``, so it must be picklable.
    **kwargs
        Must contain ``workers`` (number of processes, also the number of
        chunks). It is removed here; the remaining keyword arguments are
        forwarded to ``apply``.

    Returns
    -------
    The per-chunk results concatenated with ``pd.concat`` in chunk order.

    Raises
    ------
    KeyError
        If ``workers`` is not supplied.
    """
    workers = kwargs.pop('workers')
    # Split row positions rather than the frame itself: np.array_split applied
    # directly to a DataFrame goes through DataFrame.swapaxes, which recent
    # pandas releases deprecate. Positional slicing yields the same chunks.
    position_chunks = np.array_split(np.arange(len(df)), workers)
    tasks = [(df.iloc[positions], func, kwargs) for positions in position_chunks]

    pool = Pool(processes=workers)
    try:
        result = pool.map(_apply_df, tasks)
    finally:
        # Always release the worker processes, even when a task raised.
        pool.close()
        pool.join()

    return pd.concat(result)