# Engineering

In [12]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 
import re #regular expressions
import nltk.data
from nltk.corpus import stopwords

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

train = pd.read_csv("C:/Users/Lenovo/Desktop/labeledTrainData.tsv", header=0,\
                    delimiter="\t", quoting=3)
test = pd.read_csv("C:/Users/Lenovo/Desktop/testData.tsv",header=0,\
                    delimiter="\t", quoting=3)

[nltk_data] Error loading popular: <urlopen error [WinError 10054]
[nltk_data]     远程主机强迫关闭了一个现有的连接。>


In [13]:
def review_wordlist(review, remove_stopwords=False):# preprocessing
    
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]"," ",review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))     
        words = [w for w in words if not w in stops]
    return(words)

In [14]:
def review_sentences(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentences.append(review_wordlist(raw_sentence,\
                                            remove_stopwords))
    return sentences

In [15]:
sentences = []
print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_sentences(review, tokenizer)

Parsing sentences from training set




In [19]:
from gensim.models import word2vec

num_features = 300  
min_word_count = 40 
num_workers = 4     
context = 10        
downsampling = 1e-3 

print("Training model....")
model = word2vec.Word2Vec(sentences,\
                          workers=num_workers,\
                          size=num_features,\
                          min_count=min_word_count,\
                          window=context,
                          sample=downsampling)

model.init_sims(replace=True) #高效

# Saving the model for later use. Can be loaded using Word2Vec.load()
model_name = "Model"
model.save(model_name)

2020-11-10 15:12:45,868 : INFO : collecting all words and their counts
2020-11-10 15:12:45,873 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model....


2020-11-10 15:12:46,276 : INFO : PROGRESS: at sentence #10000, processed 224958 words, keeping 17776 word types
2020-11-10 15:12:46,475 : INFO : PROGRESS: at sentence #20000, processed 447771 words, keeping 24856 word types
2020-11-10 15:12:46,604 : INFO : PROGRESS: at sentence #30000, processed 669436 words, keeping 30088 word types
2020-11-10 15:12:46,723 : INFO : PROGRESS: at sentence #40000, processed 896563 words, keeping 34393 word types
2020-11-10 15:12:46,840 : INFO : PROGRESS: at sentence #50000, processed 1116307 words, keeping 37828 word types
2020-11-10 15:12:46,950 : INFO : PROGRESS: at sentence #60000, processed 1334972 words, keeping 40682 word types
2020-11-10 15:12:47,065 : INFO : PROGRESS: at sentence #70000, processed 1558627 words, keeping 43374 word types
2020-11-10 15:12:47,193 : INFO : PROGRESS: at sentence #80000, processed 1782139 words, keeping 45791 word types
2020-11-10 15:12:47,310 : INFO : PROGRESS: at sentence #90000, processed 2000978 words, keeping 4821

2020-11-10 15:13:19,775 : INFO : EPOCH 4 - PROGRESS: at 92.59% examples, 607499 words/s, in_qsize 7, out_qsize 0
2020-11-10 15:13:20,221 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-10 15:13:20,228 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-10 15:13:20,243 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-10 15:13:20,248 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-10 15:13:20,251 : INFO : EPOCH - 4 : training on 5804934 raw words (3961366 effective words) took 6.5s, 607622 effective words/s
2020-11-10 15:13:21,282 : INFO : EPOCH 5 - PROGRESS: at 12.14% examples, 477015 words/s, in_qsize 8, out_qsize 0
2020-11-10 15:13:22,283 : INFO : EPOCH 5 - PROGRESS: at 24.87% examples, 489574 words/s, in_qsize 7, out_qsize 0
2020-11-10 15:13:23,300 : INFO : EPOCH 5 - PROGRESS: at 36.72% examples, 480476 words/s, in_qsize 6, out_qsize 1
2020-11-10 15:13:24,304 : INFO : EPOCH 5 - PROG

In [7]:
def featureVecMethod(words, model, num_features):
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0   
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))      
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [8]:
import time
t1 = time.time()
model = Word2Vec.load('300features_40minwords_10context')
clean_train_reviews = []
for review in train['review']:
    clean_train_reviews.append(review_wordlist(review, remove_stopwords=True))
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

Review 0 of 24500


  featureVec = np.add(featureVec,model[word])


Review 1000 of 24500
Review 2000 of 24500
Review 3000 of 24500
Review 4000 of 24500
Review 5000 of 24500
Review 6000 of 24500
Review 7000 of 24500
Review 8000 of 24500
Review 9000 of 24500
Review 10000 of 24500
Review 11000 of 24500
Review 12000 of 24500
Review 13000 of 24500
Review 14000 of 24500
Review 15000 of 24500
Review 16000 of 24500
Review 17000 of 24500
Review 18000 of 24500
Review 19000 of 24500
Review 20000 of 24500
Review 21000 of 24500
Review 22000 of 24500
Review 23000 of 24500
Review 24000 of 24500


In [9]:
test_reviews = []
test_reviews.append(review_wordlist(text,remove_stopwords=True))
testDataVecs = getAvgFeatureVecs(test_reviews, model, num_features)

Review 0 of 22000


  featureVec = np.add(featureVec,model[word])


Review 1000 of 22000
Review 2000 of 22000
Review 3000 of 22000
Review 4000 of 22000
Review 5000 of 22000
Review 6000 of 22000
Review 7000 of 22000
Review 8000 of 22000
Review 9000 of 22000
Review 10000 of 22000
Review 11000 of 22000
Review 12000 of 22000
Review 13000 of 22000
Review 14000 of 22000
Review 15000 of 22000
Review 16000 of 22000
Review 17000 of 22000
Review 18000 of 22000
Review 19000 of 22000
Review 20000 of 22000
Review 21000 of 22000


In [18]:
from sklearn.ensemble import RandomForestClassifier
import pickle

forest = RandomForestClassifier(n_estimators = 100)

print("Fitting random forest to training data....")    
forest = forest.fit(trainDataVecs, train["sentiment"])
#result = forest.predict(testDataVecs)

with open('model.pickle','wb') as f:
    pickle.dump(forest,f)

Fitting random forest to training data....
