In [1]:
# Requires pandas (pip install pandas); its read_csv function loads the data.
# data/labeledTrainData.tsv is the labelled training set, i.e. every review
# already comes with its sentiment label.

import pandas as pd  # conventional short alias for pandas

# read_csv options used below:
#   header=0   -> the first row holds the column names
#   sep="\t"   -> fields are separated by tab characters
#   quoting=3  -> csv.QUOTE_NONE: double quotes are kept as ordinary characters
train = pd.read_csv(
    "data/labeledTrainData.tsv",
    header=0,
    sep="\t",
    quoting=3,
)

In [2]:
# Preview the first five rows to confirm the columns were parsed as expected.
train.head(n=5)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [3]:
# (number of rows, number of columns) of the training frame.
train.shape

(25000, 3)

In [4]:
# Inspect the raw text of the first review (row label 0, column "review").
print(train.loc[0, "review"])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [5]:
# The reviews contain HTML markup (e.g. <br /> tags), so BeautifulSoup4 is
# used to extract the plain text.
from bs4 import BeautifulSoup

# Post-process a single review.
# Text scraped straight from the web usually still carries HTML tags.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and emits a warning), so results could vary
# between environments. "html.parser" ships with the Python standard library.
example1 = BeautifulSoup(train["review"][0], "html.parser")
print(example1.text)  # review text with the HTML tags removed

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 20 mi

In [6]:
import re

# Keep letters only: every character that is not a-z or A-Z is replaced by a
# single space. This drops punctuation and digits alike.
letters_only = re.compile("[^a-zA-Z]").sub(" ", example1.get_text())
print(letters_only)

 With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    mi

In [7]:
# Convert every letter to lower case so that e.g. "The" and "the" become the
# same token.
lower_case = letters_only.lower()
# Split the string into a list of tokens. With no argument, split() separates
# on runs of whitespace.
words = lower_case.split()

print(words)

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again', 'maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent', 'moonwalker', 'is', 'part', 'biography', 'part', 'feature', 'film', 'which', 'i', 'remember', 'going', 'to', 'see', 'at', 'the', 'cinema', 'when', 'it', 'was', 'originally', 'released', 'some', 'of', 'it', 'has', 'subtle', 'messages', 'about', 'mj', 's', 'feeling', 'towards', 'the', 'press', 'and', 'also', 'the', 'obvious', 'message', 'of', 'drugs', 'are', 'bad', 'm', 'kay', 'visually', 'impressive', 'but', 'of', 'course', 'this', 'is', 'all', 'about', 

In [8]:
# Requires NLTK (pip install nltk).
# NOTE(review): the stop-word corpus presumably has to be downloaded once
# beforehand (nltk.download("stopwords")) - confirm for your environment.
from nltk.corpus import stopwords  # stop-word lists from the NLTK natural-language toolkit
print(stopwords.words("english"))

# Stop words are very common words (e.g. "the", "is", "and") that are treated
# here as carrying little information about the content of a text.

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
# Remove the stop words from the `words` list.
# The stop-word list is loaded once and stored in a set: calling
# stopwords.words("english") inside the comprehension would re-read the list
# for every single token and search it linearly.
english_stops = set(stopwords.words("english"))
words = [w for w in words if w not in english_stops]
print(words)

['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mj', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'kay', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psychopathic', 'powerful', 'drug', 'lord', 

In [10]:
# Cache for the English stop-word set; filled on first use so that the NLTK
# corpus is read once instead of once per review.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words(raw_review, remove_stopwords=True):
    """Normalise one raw review into a space-separated string of words.

    Steps: strip HTML tags, replace every non-letter character with a space,
    lower-case, split on whitespace, optionally drop English stop words, and
    join the remaining words with single spaces.

    Args:
        raw_review: str, the review text as collected (may contain HTML).
        remove_stopwords: when True (the default, matching the previous
            behaviour) English stop words are removed; pass False to keep them.

    Returns:
        str: the normalised words separated by single spaces ("" if nothing
        is left).
    """
    # 1. Remove HTML tags. The parser is named explicitly so the result does
    #    not depend on which optional parsers are installed, and so that
    #    BeautifulSoup does not warn about having to guess one.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Replace everything that is not an ASCII letter (punctuation, digits,
    #    ...) with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3./4. Lower-case, then split on whitespace into a list of words.
    words = letters_only.lower().split()

    # 5./6. Drop stop words. Membership tests on a set are faster than on a
    #       list, and the set itself is cached across calls.
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    # 7. Join the remaining words back into one space-separated string.
    return " ".join(words)

In [11]:
# Run the cleaning function on the first review and show the result.
clean_review = review_to_words(train.loc[0, "review"])
print(clean_review)

stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working

In [12]:
# Number of reviews to process (length of the "review" column).
num_review = len(train["review"])

# Collects the normalised text of every review, in row order.
clean_train_review = []

# Normalise every review, walking the column in order.
for i, raw_review in enumerate(train["review"]):
    # Report progress once every 1,000 reviews.
    if (i + 1) % 1000 == 0:
        print("Review {0} of {1}".format(i + 1, num_review))
    clean_train_review.append(review_to_words(raw_review))

Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Review 25000 of 25000


In [13]:
print("Creating the bag of words...")
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer is scikit-learn's bag-of-words implementation.
# max_features caps the vocabulary size used for the analysis; a very large
# vocabulary increases the risk of overfitting.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform() does two things in one call:
#   1. learns the vocabulary from the input, and
#   2. turns the training data into feature vectors.
# Its input must be a list of strings.
# The result is then converted to a dense NumPy array with toarray() so that
# later cells can work on it with NumPy.
# NOTE(review): a dense array of this size can use a lot of memory - confirm
# it fits on the target machine.
train_data_features = vectorizer.fit_transform(clean_train_review).toarray()

Creating the bag of words...


In [14]:
print(train_data_features.shape)
# Rows are the 25,000 texts; columns are the 5,000 vocabulary words.

(25000, 5000)


In [15]:
# Inspect the features (words) the vectorizer learned.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only
# on versions that do not have the new one. tolist() keeps `vocab` a plain
# list of Python strings, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(vocab)



In [17]:
import numpy as np

# Total number of occurrences of each vocabulary word across all training
# texts: sum the feature matrix down its rows (axis 0).
dist = train_data_features.sum(axis=0)

# Pair every vocabulary word with its total count in one table and display it.
feature_cnt = pd.DataFrame({"vocab": vocab, "count": dist})
feature_cnt

Unnamed: 0,vocab,count
0,abandoned,187
1,abc,125
2,abilities,108
3,ability,454
4,able,1259
5,abraham,85
6,absence,116
7,absent,83
8,absolute,352
9,absolutely,1485


In [None]:
# Classify the reviews with a random forest model.
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Build a random forest of 100 trees.
# random_state pins the randomness used while growing the trees so that
# re-running the notebook gives the same model and predictions.
# n_jobs=-1 trains the trees on all available CPU cores; it only affects
# speed, not the result.
forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Fit the forest on the training set: the bag-of-words matrix provides the
# features and the "sentiment" column is the target label.

# This step takes a while to run.
forest = forest.fit(train_data_features, train["sentiment"])
print("Complete Learning")

Training the random forest...


In [None]:
# Load the test data, i.e. the reviews whose sentiment is to be predicted.
# The options are the same as for the training file: header row, tab-separated,
# quoting=3 (csv.QUOTE_NONE) so double quotes stay ordinary characters.
test = pd.read_csv(
    "data/testData.tsv",
    header=0,
    sep="\t",
    quoting=3,
)

# Check the dimensions of the test frame.
print(test.shape)

In [None]:
# Start from an empty list that will hold the cleaned test reviews.
num_reviews = test["review"].size
clean_test_reviews = []

# Normalise the test reviews the same way as the training reviews.
print("Cleaning and parsing the test set movie reviews...\n")

for i, raw_review in enumerate(test["review"]):
    # Report progress once every 1,000 reviews.
    if (i + 1) % 1000 == 0:
        print("Review {} of {}".format(i + 1, num_reviews))

    clean_review = review_to_words(raw_review)
    clean_test_reviews.append(clean_review)

# Bag of words for the test set, as a dense array. transform() (not
# fit_transform()) is used, so the vectorizer is applied as already fitted.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Predict the sentiment label of every test review with the random forest.
print("Prediting with the random forest...")
result = forest.predict(test_data_features)
print("Complete Prediction")

# Collect the predictions in a pandas frame with two columns: id, sentiment.
output = pd.DataFrame({"id": test["id"], "sentiment": result})

# Write the predictions to a CSV file. index=False leaves out the row index;
# quoting=3 (csv.QUOTE_NONE) writes the values without adding quote characters.
output.to_csv("Bag_of_Words_model.csv", index=False, quoting=3)
print("--END--")