#### 필요한 모듈 import

In [1]:
import pandas as pd
import numpy as np
import warnings
import os
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')
SEED = 33

#### 데이터 로드

In [2]:
DATA = '.\data'

In [3]:
train = pd.read_csv(os.path.join(DATA, 'train.tsv'), delimiter='\t')
test = pd.read_csv(os.path.join(DATA, 'test.tsv'), delimiter='\t')
unlabled_train = pd.read_csv(os.path.join(DATA, 'unlabeledTrain.tsv'), delimiter='\t', on_bad_lines='skip')


In [4]:
print(train.shape)
train.head()

(25000, 3)


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
print(test.shape)
test.head()

(25000, 2)


Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [6]:
print(unlabled_train.shape)
unlabled_train.head()

(49998, 2)


Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


In [None]:
pip install nltk

In [8]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [9]:
eng_stopwords = stopwords.words('english')

In [10]:
sample = train['review'][0]
sample

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [11]:
soup = BeautifulSoup(sample, 'html.parser')
soup.text

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 20 mi

#### 정규표현식으로 특수 기호 지우기

In [12]:
import re

In [13]:
cleaned = re.sub('[^a-zA-Z]',' ', soup.text)

#### 전처리 함수 정의

In [14]:
def preprocessing(sentence):
    soup = BeautifulSoup(sentence, 'html.parser')
    cleaned = re.sub('[^a-zA-Z]',' ', soup.text)
    cleaned = cleaned.lower()
    cleaned = [word for word in cleaned.split() if word not in eng_stopwords]
    return ' '.join(cleaned)

In [15]:
preprocessing(sample)

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate workin

In [16]:
all_review = pd.concat([train['review'], unlabled_train['review'], test['review']])
all_review

0        With all this stuff going down at the moment w...
1        \The Classic War of the Worlds\" by Timothy Hi...
2        The film starts with a manager (Nicholas Bell)...
3        It must be assumed that those who praised this...
4        Superbly trashy and wondrously unpretentious 8...
                               ...                        
24995    Sony Pictures Classics, I'm looking at you! So...
24996    I always felt that Ms. Merkerson had never got...
24997    I was so disappointed in this movie. I am very...
24998    From the opening sequence, filled with black a...
24999    This is a great horror film for people who don...
Name: review, Length: 99998, dtype: object

In [17]:
all_review_clean = all_review.apply(preprocessing)

In [18]:
all_review_clean

0        stuff going moment mj started listening music ...
1        classic war worlds timothy hines entertaining ...
2        film starts manager nicholas bell giving welco...
3        must assumed praised film greatest filmed oper...
4        superbly trashy wondrously unpretentious explo...
                               ...                        
24995    sony pictures classics looking sony got rights...
24996    always felt ms merkerson never gotten role fit...
24997    disappointed movie familiar case read mark fuh...
24998    opening sequence filled black white shots remi...
24999    great horror film people want vomit retching g...
Name: review, Length: 99998, dtype: object

#### CountVectorizer

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
cv = CountVectorizer(analyzer='word', max_features=5000)

NameError: name 'CountVectorizer' is not defined

In [43]:
all_review_cv = cv.fit_transform(all_review_clean)

In [44]:
all_review_cv.shape

(99998, 5000)

In [46]:
all_review_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Train Test unlabled Train 으로 분리

In [47]:
train_sentences = all_review_cv[:len(train)]
test_setences = all_review_cv[-len(test):]

In [49]:
train_sentences.shape, test_setences.shape

((25000, 5000), (25000, 5000))

In [50]:
train_labels = train['sentiment']
train_labels.shape

(25000,)

#### n-gram 적용 CountVectorizer

In [22]:
ng_cv = CountVectorizer(analyzer='word', ngram_range=(1, 2),max_features=5000)

In [23]:
all_review_ng_cv = ng_cv.fit_transform(all_review_clean)

In [24]:
print("N-gram 행렬:")
print(all_review_ng_cv.toarray())


N-gram 행렬:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### Train Test unlabled Train 으로 분리

In [25]:
train_sentences = all_review_ng_cv[:len(train)]
test_setences = all_review_ng_cv[-len(test):]

In [26]:
train_sentences.shape, test_setences.shape

((25000, 5000), (25000, 5000))

In [27]:
train_labels = train['sentiment']
train_labels.shape

(25000,)

#### Model_1
RandomForestClassifier

In [57]:
from sklearn.ensemble import RandomForestClassifier

In [58]:
RFC = RandomForestClassifier(n_estimators=1000, max_depth=8, n_jobs=-1)

In [59]:
RFC.fit(train_sentences, train_labels)

In [60]:
prediction = RFC.predict(test_setences)

In [61]:
prediction.shape

(25000,)

In [62]:
prediction[:10]

array([1, 0, 1, 1, 1, 0, 0, 1, 0, 1], dtype=int64)

#### Model_2
VotingClassifier, RandomForestClassifier, GradientBoostingClassifier

In [28]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [47]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42) # RandomForestClassifier 분류기 생성
gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=42) # GradientBoostingClassifier 분류기 생성
# svm_clf = SVC(kernel='rbf', probability=True, random_state=42) #Support Vector Machine 분류기 생성

In [48]:
voting_clf = VotingClassifier(estimators=[('rf', rf_clf), ('gb', gb_clf)], voting='soft')
# Voting Classifier 생성

In [49]:
voting_clf.fit(train_sentences, train_labels)

In [50]:
y_pred = voting_clf.predict(test_setences)

In [51]:
prediction = voting_clf.predict(test_setences)

In [52]:
prediction.shape

(25000,)

#### submission

In [53]:

submission =pd.read_csv(os.path.join(DATA, 'sampleSubmission.csv'))

In [54]:
submission.head()

Unnamed: 0,id,sentiment
0,12311_10,0
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,0


In [55]:
submission['sentiment'] = prediction

In [56]:
submission['sentiment'].value_counts()


sentiment
1    13080
0    11920
Name: count, dtype: int64

In [57]:
import datetime

In [58]:
timestring = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

In [59]:
filename = f'submission-{timestring}.csv'

In [60]:
filename

'submission-2024-02-14-17-45-31.csv'

In [61]:
submission.to_csv(os.path.join(DATA, filename),index = False)