# Engineering

In [53]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 
import re #regular expressions
import nltk.data
from nltk.corpus import stopwords
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

df = pd.read_csv("C:/Users/Lenovo/Desktop/Tweets.csv")
data = df[['id','sentiment','review']]
data

Unnamed: 0,id,sentiment,review
0,570306133677760513,0,@VirginAmerica What @dhepburn said.
1,570301130888122368,1,@VirginAmerica plus you've added commercials t...
2,570301083672813571,0,@VirginAmerica I didn't today... Must mean I n...
3,570301031407624196,-1,@VirginAmerica it's really aggressive to blast...
4,570300817074462722,-1,@VirginAmerica and it's a really big bad thing...
...,...,...,...
14635,569587686496825344,1,@AmericanAir thank you we got on a different f...
14636,569587371693355008,-1,@AmericanAir leaving over 20 minutes Late Flig...
14637,569587242672398336,0,@AmericanAir Please bring American Airlines to...
14638,569587188687634433,-1,"@AmericanAir you have my money, you change my ..."


In [54]:
train = data.iloc[0:int(len(data)*0.8)]             #divide into 8:2
test = data.iloc[int(len(data)*0.8):int(len(data))]

In [58]:
def review_wordlist(review, remove_stopwords=False):# preprocessing
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]"," ",review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))     
        words = [w for w in words if not w in stops]
    return(words)

def review_sentences(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentences.append(review_wordlist(raw_sentence,\
                                            remove_stopwords))
    return sentences

In [60]:
sentences = []
print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_sentences(review, tokenizer)

Parsing sentences from training set


In [61]:
from gensim.models import word2vec

num_features = 300  
min_word_count = 40 
num_workers = 4     
context = 10        
downsampling = 1e-3 

print("Training model....")
model = word2vec.Word2Vec(sentences,\
                          workers=num_workers,\
                          size=num_features,\
                          min_count=min_word_count,\
                          window=context,
                          sample=downsampling)

model.init_sims(replace=True) #高效
model.save("C:/Users/Lenovo/Desktop/model")

2020-11-11 17:54:45,748 : INFO : collecting all words and their counts
2020-11-11 17:54:45,750 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-11-11 17:54:45,783 : INFO : PROGRESS: at sentence #10000, processed 90036 words, keeping 7471 word types
2020-11-11 17:54:45,809 : INFO : PROGRESS: at sentence #20000, processed 176419 words, keeping 11406 word types
2020-11-11 17:54:45,819 : INFO : collected 12324 word types from a corpus of 208395 raw words and 23504 sentences
2020-11-11 17:54:45,822 : INFO : Loading a fresh vocabulary
2020-11-11 17:54:45,830 : INFO : effective_min_count=40 retains 619 unique words (5% of original 12324, drops 11705)
2020-11-11 17:54:45,831 : INFO : effective_min_count=40 leaves 168481 word corpus (80% of original 208395, drops 39914)
2020-11-11 17:54:45,836 : INFO : deleting the raw counts dictionary of 12324 items
2020-11-11 17:54:45,837 : INFO : sample=0.001 downsamples 76 most-common words
2020-11-11 17:54:45,838 : INFO : do

Training model....


2020-11-11 17:54:45,962 : INFO : training model with 4 workers on 619 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2020-11-11 17:54:46,076 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-11 17:54:46,079 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-11 17:54:46,083 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-11 17:54:46,087 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-11-11 17:54:46,088 : INFO : EPOCH - 1 : training on 208395 raw words (109174 effective words) took 0.1s, 1114839 effective words/s
2020-11-11 17:54:46,183 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-11-11 17:54:46,187 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-11-11 17:54:46,189 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-11-11 17:54:46,191 : INFO : worker thread finished; awaiting finish of 0 

# Train RandomForestClassifier

In [62]:
def featureVecMethod(words, model, num_features):
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0   
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))      
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [63]:
#model = word2vec.load("C:/Users/Lenovo/Desktop/model")

train_reviews = []
for review in train['review']:
    train_reviews.append(review_wordlist(review, remove_stopwords=True))
trainDataVecs = getAvgFeatureVecs(train_reviews, model, num_features)

Review 0 of 11712
Review 1000 of 11712
Review 2000 of 11712


  featureVec = np.add(featureVec,model[word])


Review 3000 of 11712
Review 4000 of 11712
Review 5000 of 11712
Review 6000 of 11712
Review 7000 of 11712
Review 8000 of 11712
Review 9000 of 11712
Review 10000 of 11712
Review 11000 of 11712


In [83]:
test_reviews = []

for review in test['review']:
    test_reviews.append(review_wordlist(review, remove_stopwords=True))
testDataVecs = getAvgFeatureVecs(test_reviews, model, num_features)

Review 0 of 2928
Review 1000 of 2928
Review 2000 of 2928


  featureVec = np.add(featureVec,model[word])


In [96]:
from sklearn.ensemble import RandomForestClassifier
import pickle

RFmodel = RandomForestClassifier(n_estimators = 100)
RFmodel = forest.fit(trainDataVecs,train["sentiment"])
#with open('C:/Users/Lenovo/Desktop/RFmodel.pickle', 'wb') as file:
#    pickle.dump(forest, file)

result = RFmodel.predict(testDataVecs)
answer = []
for i in range(0, len(test)):
    answer.append(test.iloc[i]['sentiment'])
acc = sum(answer == result)/len(result)
acc

0.7435109289617486