In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 
import re 
from nltk.corpus import stopwords



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/week4/sentiment

/content/drive/MyDrive/week4/sentiment


In [4]:
%ls

300features_40minwords_10context  sentiment-analysis-using-word2vec.ipynb
labeledTrainData.tsv              testData.tsv
output.csv


In [5]:
# 데이터불러오기
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

test = pd.read_csv("testData.tsv",header=0, delimiter="\t", quoting=3)

In [6]:
#전처리
def review_wordlist(review, remove_stopwords=False):
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]"," ",review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))     
        words = [w for w in words if not w in stops]
    
    return(words)

In [7]:
import nltk.data
nltk.download('popular')

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

In [8]:
def review_sentences(review, tokenizer, remove_stopwords=False):
    # nltk tokenizer 사용
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentences.append(review_wordlist(raw_sentence,\
                                            remove_stopwords))

    # This returns the list of lists
    return sentences


In [9]:
sentences = []
print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_sentences(review, tokenizer)
    

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [10]:
# logging 모델 import 
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [11]:
# 모델 파라미터 지정
num_features = 300  # Word vector dimensionality
min_word_count = 40 # Minimum word count
num_workers = 4     # Number of parallel threads
context = 10        # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words

# Initializing the train model
from gensim.models import word2vec
print("Training model....")
model = word2vec.Word2Vec(sentences,\
                          workers=num_workers,\
                          size=num_features,\
                          min_count=min_word_count,\
                          window=context)

# To make the model memory efficient
model.init_sims(replace=True)

# Saving the model for later use. Can be loaded using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)


2021-03-27 04:39:10,977 : INFO : 'pattern' package not found; tag filters are not available for English
2021-03-27 04:39:10,990 : INFO : collecting all words and their counts
2021-03-27 04:39:10,992 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-03-27 04:39:11,055 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2021-03-27 04:39:11,114 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2021-03-27 04:39:11,180 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types


Training model....


2021-03-27 04:39:11,252 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2021-03-27 04:39:11,311 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2021-03-27 04:39:11,369 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2021-03-27 04:39:11,433 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2021-03-27 04:39:11,491 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2021-03-27 04:39:11,555 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2021-03-27 04:39:11,615 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50207 word types
2021-03-27 04:39:11,680 : INFO : PROGRESS: at sentence #110000, processed 2446580 words, keeping 52081 word types
2021-03-27 04:39:11,739 : INFO : PROGRESS: at sentence #120000, processed 2668775 words, keepin

In [12]:
#test
model.wv.doesnt_match("man woman dog child kitchen".split())



  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'kitchen'

In [13]:
model.wv.doesnt_match("france england germany berlin".split())


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'berlin'

In [14]:
# This will print the most similar words present in the model
model.wv.most_similar("man")


[('woman', 0.6499963402748108),
 ('boy', 0.6230103373527527),
 ('doctor', 0.6039432883262634),
 ('soldier', 0.6017917990684509),
 ('guy', 0.5844689011573792),
 ('cop', 0.5791202783584595),
 ('lady', 0.5780496597290039),
 ('person', 0.5558451414108276),
 ('journalist', 0.5466338396072388),
 ('scientist', 0.5436903238296509)]

In [15]:
model.wv.most_similar("awful")


[('horrible', 0.8205475807189941),
 ('terrible', 0.820220410823822),
 ('atrocious', 0.7120427489280701),
 ('dreadful', 0.7100231051445007),
 ('laughable', 0.7010295391082764),
 ('appalling', 0.6746677160263062),
 ('bad', 0.6657297611236572),
 ('pathetic', 0.6495277881622314),
 ('lame', 0.646144449710846),
 ('stupid', 0.6453275680541992)]

In [16]:
# This will give the total number of words in the vocabolary created from this dataset
model.wv.syn0.shape

  


(8306, 300)

In [17]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec



In [18]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs


In [19]:
# Calculating average feature vector for training set
clean_train_reviews = []
for review in train['review']:
    clean_train_reviews.append(review_wordlist(review, remove_stopwords=True))
    
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

Review 0 of 25000


  del sys.path[0]


Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000


In [20]:
#test set의 average feature vactors 계산
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(review_wordlist(review,remove_stopwords=True))
    
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Review 0 of 25000


  del sys.path[0]


Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000


In [21]:
# Fitting a random forest classifier to the training data
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)
    
print("Fitting random forest to training data....")    
forest = forest.fit(trainDataVecs, train["sentiment"])
    

Fitting random forest to training data....


In [22]:
# Predicting the sentiment values for test data and saving the results in a csv file 
result = forest.predict(testDataVecs)
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
#output.to_csv( "output.csv", index=False, quoting=3 )

In [23]:
output

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",0
4,"""12128_7""",1
...,...,...
24995,"""2155_10""",1
24996,"""59_10""",1
24997,"""2531_1""",1
24998,"""7772_8""",1


In [24]:
output.describe()

Unnamed: 0,sentiment
count,25000.0
mean,0.49748
std,0.500004
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0
