In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the labeled training and test data
# Header = 0 indicates that the first line of the file contains column names, 
# delimiter = \t indicates that the fields are seperated by tabs, and 
# quoting = 3 tells python to ignore doubled quotes

train = pd.read_csv(r'C:\Users\HPPC\labeledTrainData.tsv', header = 0, delimiter = '\t', quoting = 3)
test = pd.read_csv(r'C:\Users\HPPC\testData.tsv', header = 0, delimiter = '\t', quoting = 3)
unlabel_train = pd.read_csv(r'C:\Users\HPPC\unlabeledTrainData.tsv', header = 0, delimiter = '\t', quoting = 3)

In [4]:
'train dim:{}, unlabeled train dim:{}, test dim:{}'.format(train.shape, unlabel_train.shape, test.shape)

'train dim:(25000, 3), unlabeled train dim:(50000, 2), test dim:(25000, 2)'

In [5]:
# Import the libraries for data cleaning.

from bs4 import BeautifulSoup
import re
import nltk

In [6]:
def preprocess_wordlist(data, stopwords = False):
    
    # Remove HTML tag
    review = BeautifulSoup(data,'html.parser').get_text()
    
    # Remove non-letters
    review = re.sub('[^a-zA-Z]', ' ', review)
    
    # Convert to lower case
    review = review.lower()
    
    # Tokenize
    word = nltk.word_tokenize(review)
    
    # Optional: Remove stop words (false by default)
    if stopwords:
        stops = set(nltk.corpus.stopwords.words("english"))
        
        words = [w for w in word if not w in stops]
    
    return word

In [7]:
def preprocess_sent(data, stopwords = False):
    
    # Split the paragraph into sentences
    
    #raw = tokenizer.tokenize(data.strip())
    raw = nltk.sent_tokenize(data.strip())
    
    # If the length of the sentence is greater than 0, plug the sentence in the function preprocess_wordlist (clean the sentence)
    sentences = [preprocess_wordlist(sent, stopwords) for sent in raw if len(sent) > 0]
    
    return sentences

In [8]:
sentence = []

# Append labeled reviews first
for review in train['review']:
    sentence += preprocess_sent(review)
    
# Append unlabeled reviews
for review in unlabel_train['review']:
    sentence += preprocess_sent(review)

In [10]:
print(len(sentence))
print()
print(sentence[:2])


795538

[['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again'], ['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']]


In [9]:
train['review'][0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [11]:
from gensim.models import word2vec

In [12]:
num_features = 250
min_count = 40
num_processor = 4
context = 10
downsampling = 0.001

In [13]:
# Plug in the sentence variable first.

model = word2vec.Word2Vec(sentence, workers = num_processor, 
                         size = num_features, min_count = min_count,
                         window = context, sample = downsampling)

In [14]:
# Unload unneccessary memory once the learning process is done.

model.init_sims(replace = True)

In [15]:
model_name = "250features_40minwords_20context"
model.save(model_name)

In [16]:
model.most_similar("king")

[('prince', 0.6115729808807373),
 ('lion', 0.5922257900238037),
 ('kingdom', 0.5399940013885498),
 ('arthur', 0.5393497943878174),
 ('solomon', 0.5356030464172363),
 ('stephen', 0.5314776301383972),
 ('aladdin', 0.5310251712799072),
 ('kings', 0.5264624357223511),
 ('hong', 0.5124601125717163),
 ('rudolf', 0.5122345685958862)]

In [29]:
model.most_similar("awful")

[('terrible', 0.7910809516906738),
 ('horrible', 0.7510476112365723),
 ('abysmal', 0.7458819150924683),
 ('atrocious', 0.7392771244049072),
 ('dreadful', 0.7102159261703491),
 ('appalling', 0.6896439790725708),
 ('horrendous', 0.6858556270599365),
 ('horrid', 0.6791461706161499),
 ('lousy', 0.6487061977386475),
 ('amateurish', 0.647085964679718)]

In [30]:
model.wv.syn0.shape

(16487, 250)

In [17]:
''' 

The purpose of this function is to combine all the word2vec vector values of each word in each review
if each review is given as input and divide by the total number of words.

Each word can be represented as number of feature dimension space vector. ex) model['flower'] = array([0.1, 0.2, ...]).
(You can think of it as extended xy coordinate.) Therefore, it enables vectorizing each review by 
combining all the words' vector values.

Illustration example:

'I' = [0.1, 0.2, ...]
'have' = [0.2, 0.3, ...]
'a car' = [0.1, 0.2, ...]
'I have a car' = [0.1 + 0.2 + 0.1, 0.2 + 0.3 + 0.2,  ...]


ex) review1 = ['he', 'has', 'a', 'cat']

First word : If the word 'he' is in the word2vec, index the vector values from word2vec model by model['he']
(the dimension of the matrix would be, in this case, (400,) ) and add them to predefined zero matrix.

Second word: Check if there is the word 'has' in the word2vec model and if there is, index the vector values and 
add them on top of the added vector values from the first word.

The rest: Iterate the above steps for the rest of words and lastly, divide by the total number of words. 

Illustration example: 

zero : [0,    0,   0,   ....]
word1: [0.2,  0.3, 0.4, ....]
word2: [0.1,  0.2, 0.3, ....]

word(1~2): [0.3, 0.5, 0.7, ....]

'''

def makeFeatureVec(review, model, num_features):
    
    featureVec = np.zeros((num_features,), dtype = "float32")
    
    # Unique word set
    word_index = set(model.wv.index2word)
    
    # For division we need to count the number of words
    nword = 0
    
    # Iterate words in a review and if the word is in the unique wordset, add the vector values for each word.
    for word in review:
        if word in word_index:
            nword += 1
            featureVec = np.add(featureVec, model[word])
    
    # Divide the sum of vector values by total number of word in a review.
    featureVec = np.divide(featureVec, nword)        
    
    return featureVec

In [18]:
''' 

While iterating over reviews, add the vector sums of each review from the function "makeFeatureVec" to 
the predefined vector whose size is the number of total reviews and the number of features in word2vec.
The working principle is basically same with "makeFeatureVec" but this is a review basis and 
makeFeatureVec is word basis (or each word's vector basis)


return matrix:

            'V1'    'V2'    'V3'     'V4'
review 1    0.1      0.2     0.1     0.5
review 2    0.5      0.4     0.05    0.05

'''

def getAvgFeatureVec(clean_reviews, model, num_features):
    
    # Keep track of the sequence of reviews, create the number "th" variable.
    review_th = 0
    
    # Row: number of total reviews, Column: number of vector spaces (num_features = 250 we set this in Word2Vec step).
    reviewFeatureVecs = np.zeros((len(clean_reviews), num_features), dtype = "float32")
    
    # Iterate over reviews and add the result of makeFeatureVec.
    for review in clean_reviews:
        reviewFeatureVecs[int(review_th)] = makeFeatureVec(review, model, num_features)
        
        # Once the vector values are added, increase the one for the review_th variable.
        review_th += 1
    
    return reviewFeatureVecs

In [19]:
clean_train_reviews = []

# Clean the reviews by preprocessing function with stopwords option "on".
for review in train["review"]:
    clean_train_reviews.append(preprocess_wordlist(review, stopwords = True))

# Apply "getAvgFeatureVec" function.
trainDataAvg = getAvgFeatureVec(clean_train_reviews, model, num_features)
    
    
# Same steps repeats as we did for train_set.    
clean_test_reviews = []

for review in test["review"]:
    clean_test_reviews.append(preprocess_wordlist(review, stopwords = True))

testDataAvg = getAvgFeatureVec(clean_test_reviews, model, num_features)

In [36]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, learning_curve
from sklearn.linear_model import LogisticRegression



In [32]:
# Fitting a random forest classifier to the training data
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)
    
print("Fitting random forest to training data....")    
forest = forest.fit(trainDataAvg, train["sentiment"])


Fitting random forest to training data....


In [40]:
print(forest.score(trainDataAvg,train['sentiment']))

1.0


In [34]:
# Predicting the sentiment values for test data and saving the results in a csv file 
result = forest.predict(testDataAvg)
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv( "output.csv", index=False, quoting=3 )

In [38]:
# LinearSVC
kfold = StratifiedKFold(n_splits=5, random_state = 2018)
sv = LinearSVC(random_state=2018)

param_grid1 = {
    'loss':['squared_hinge'],
    'class_weight':[{1:2}],
    'C': [20],
    'penalty':['l2']
}

gs_sv = GridSearchCV(sv, param_grid = [param_grid1], verbose = 1, cv = kfold, n_jobs = 1, scoring = 'roc_auc' )
gs_sv.fit(trainDataAvg, train['sentiment'])
gs_sv_best = gs_sv.best_estimator_
print(gs_sv.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.3min finished


{'C': 20, 'class_weight': {1: 2}, 'loss': 'squared_hinge', 'penalty': 'l2'}


In [39]:
print(gs_sv.best_score_)

0.942875936


In [41]:
#logistic
lr = LogisticRegression(random_state = 2018)


lr_param2 = {
    'penalty':['l1'],
    'dual':[False],
    'C':[40],
    'class_weight':['balanced'],
    'solver':['saga']
    
}

lr_CV = GridSearchCV(lr, param_grid = [lr_param2], cv = kfold, scoring = 'roc_auc', n_jobs = 1, verbose = 1)
lr_CV.fit(trainDataAvg,train['sentiment'])
print(lr_CV.best_params_)
logi_best = lr_CV.best_estimator_
print(lr_CV.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.4min finished


{'C': 40, 'class_weight': 'balanced', 'dual': False, 'penalty': 'l1', 'solver': 'saga'}
0.943032064
