In [9]:
import pandas as pd
import numpy as np

"""
header = 0 : 파일의 첫 번째 줄에 열 이름이 있음을 표시
delimiter = '\t' : 필드가 탭으로 구분되어 있음을 의미
quoting = 3 : 큰따옴표("")를 무시하도록 설정
"""

train = pd.read_csv('labeledTrainData.tsv', header=0,
                   delimiter = '\t', quoting = 3)

test = pd.read_csv('testData.tsv', header = 0, delimiter = '\t', quoting = 3)

print(train.shape)
print(test.shape)

(25000, 3)
(25000, 2)


In [3]:
# sentiment : 1 긍정 / 0 부정
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [5]:
sample = pd.read_csv('labeledTrainData.tsv', header = 0, delimiter = '\t')
sample.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [7]:
train.describe()

Unnamed: 0,sentiment
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [8]:
train['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [10]:
train.describe(exclude=[np.number]) # 문자 타입 describe하는 방법

Unnamed: 0,id,review
count,25000,25000
unique,25000,24904
top,"""7270_8""","""This show comes up with interesting locations..."
freq,1,3


### 데이터 정제

In [16]:
train['review'][0][:700]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely lik'

In [19]:
from bs4 import BeautifulSoup

examplel = BeautifulSoup(train['review'][0], "html5lib")
examplel.get_text()[:700]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyw'

In [20]:
import re

letters_only = re.sub('[^a-zA-Z]', ' ', examplel.get_text())
letters_only

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    m

In [21]:
lower_case = letters_only.lower()

words = lower_case.split()
words[:10]

['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with']

In [22]:
import nltk
from nltk.corpus import stopwords
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [23]:
words = [w for w in words if not w in stopwords.words('english')]
words[:10]

['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary']

In [24]:
# nltk.stem.PorterStemmer() : 어간 추출(특정한 규칙을 가지고)
stemmer = nltk.stem.PorterStemmer()

print(stemmer.stem('maximum'))
print("The stemmed form of running is : {}".format(stemmer.stem("running")))
print("The stemmed form of runs is : {}".format(stemmer.stem("runs")))
print("The stemmed form of run is : {}".format(stemmer.stem("run")))

maximum
The stemmed form of running is : run
The stemmed form of runs is : run
The stemmed form of run is : run


In [25]:
Lancaster_stemmer = nltk.stem.LancasterStemmer()

print(Lancaster_stemmer.stem('maximum'))
print("The stemmed form of running is : {}".format(Lancaster_stemmer.stem("running")))
print("The stemmed form of runs is : {}".format(Lancaster_stemmer.stem("runs")))
print("The stemmed form of run is : {}".format(Lancaster_stemmer.stem("run")))

maxim
The stemmed form of running is : run
The stemmed form of runs is : run
The stemmed form of run is : run


In [26]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')
words = [stemmer.stem(w) for w in words]
words[:10]

['stuff',
 'go',
 'moment',
 'mj',
 'start',
 'listen',
 'music',
 'watch',
 'odd',
 'documentari']

In [28]:
def review_to_words(review):
    review_text = BeautifulSoup(review, 'html.parser').get_text()
    letters_only = re.sub('^[a-zA-Z]', ' ', review_text)
    words = letters_only.lower().split()
    stops = set(stopwords.words('english'))
    stop_words = [w for w in words if not w in stops]
    stem_words = [stemmer.stem(w) for w in stop_words]
    return (''.join(stem_words))

In [29]:
clean_review = review_to_words(train['review'][0])
clean_review

'"withstuffgomomentmji\'vstartlistenmusic,watchodddocumentarithere,watchwizwatchmoonwalkagain.maybwantgetcertaininsightguythoughtreallicooleightimaybmakemindwhetherguiltiinnocent.moonwalkpartbiography,partfeaturfilmremembgoseecinemaoriginreleased.subtlmessagmjfeeltowardpressalsoobviousmessagdrugbadm\'kay.visuimpresscoursmichaeljacksonunlessremotlikemjanywaygohatefindboring.maycallmjegotistconsentmakemovimjfanwouldsaymadefantruereallinicehim.thactualfeaturfilmbitfinalstart20minutexcludsmoothcriminsequencjoepesciconvincpsychopathpowerdruglord.wantmjdeadbadbeyondme.mjoverheardplans?nah,joepescicharactrantwantpeoplknowsupplidrugetcdunno,maybhatemjmusic.lotcoolthinglikemjturncarrobotwholespeeddemonsequence.also,directormustpatiencsaintcamefilmkiddibadsequencusualdirectorhateworkonekidletalonwholebunchperformcomplexdancscene.bottomline,movipeopllikemjonelevelanoth(whichthinkpeople).not,stayaway.trigivewholesommessagironmjbestestbuddimovigirl!michaeljacksontrulionetalentpeoplevergraceplanetgu

In [31]:
num_reviews = train['review'].size
num_reviews

25000

In [None]:
clean_train_reviews = []
for i in range