In [4]:
from IPython.display import clear_output
'''
get kaggle dataset
'''
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# !kaggle datasets list
!kaggle competitions download -c word2vec-nlp-tutorial
clear_output()

In [2]:
!unzip -o ./word2vec-nlp-tutorial.zip
clear_output()

In [3]:
!unzip -o ./labeledTrainData.tsv.zip
!unzip -o ./testData.tsv.zip
!unzip -o ./unlabeledTrainData.tsv.zip
clear_output()

In [2]:
import torch
import torchvision.transforms as transforms
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset
from torchvision.utils import save_image

from PIL import Image

import os
import re
import warnings
import csv

import bs4 as bs
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
import random

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from xgboost import XGBClassifier

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, sent_tokenize, pos_tag
word_net_lemmatizer = WordNetLemmatizer()
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

import gensim
from gensim.models import word2vec


In [5]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
clear_output()
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

In [7]:
sentence = 'Go to school right now'
tokens = nltk.word_tokenize(sentence)
print(tokens)
print(pos_tag(tokens))

['Go', 'to', 'school', 'right', 'now']
[('Go', 'VB'), ('to', 'TO'), ('school', 'NN'), ('right', 'RB'), ('now', 'RB')]


In [8]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [9]:
def review_to_wordlist(review, remove_stopwords=False, lemmalization=False):
    
    # remove HTML tag
    review_text = bs.BeautifulSoup(review).get_text()

    # make non-English become space
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    words = review_text.lower().split()

    """
    lemmalization
    """
    if lemmalization:
        tagged_sentences = pos_tag(words)
        lemma_senetence = []
        for tag in tagged_sentences:
            pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
            lemma_senetence.append(word_net_lemmatizer.lemmatize(tag[0], pos=pos))
        
        words = lemma_senetence
    # for word in meaningful_words:
    #     word = word_net_lemmatizer.lemmatize(word, 'v')

    # remove stopword
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    return words

In [10]:
def review_to_sentences(review, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())

    sentences = []

    for raw_sentence in raw_sentences:

        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords, lemmalization=True))

    return sentences

In [11]:
labeled_df = pd.read_csv('./labeledTrainData.tsv', sep='\t', header=0)
unlabeled_df = pd.read_csv('./unlabeledTrainData.tsv', sep='\t', header=0, on_bad_lines='skip')
test_df = pd.read_csv('./testData.tsv', sep='\t', header=0)
# print(labeled_df.iloc[1][2])
words = review_to_wordlist(labeled_df.iloc[1][2])
print(labeled_df.head(3))
print(unlabeled_df.head(3))
print(words)

       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
        id                                             review
0   9999_0  Watching Time Chasers, it obvious that it was ...
1  45057_0  I saw this film about 20 years ago and remembe...
2  15561_0  Minor Spoilers<br /><br />In New York, Joan Ba...
['the', 'classic', 'war', 'of', 'the', 'worlds', 'by', 'timothy', 'hines', 'is', 'a', 'very', 'entertaining', 'film', 'that', 'obviously', 'goes', 'to', 'great', 'effort', 'and', 'lengths', 'to', 'faithfully', 'recreate', 'h', 'g', 'wells', 'classic', 'book', 'mr', 'hines', 'succeeds', 'in', 'doing', 'so', 'i', 'and', 'those', 'who', 'watched', 'his', 'film', 'with', 'me', 'appreciated', 'the', 'fact', 'that', 'it', 'was', 'not', 'the', 'standard', 'predictabl

In [13]:
sentences = []

for review in labeled_df['review']:
    sentences += review_to_sentences(review)

for review in unlabeled_df['review']:
    sentences += review_to_sentences(review)

for review in test_df['review']:
    sentences += review_to_sentences(review)

print(sentences[:3])

[['with', 'all', 'this', 'stuff', 'go', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'start', 'listen', 'to', 'his', 'music', 'watch', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watch', 'the', 'wiz', 'and', 'watch', 'moonwalker', 'again'], ['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'think', 'be', 'really', 'cool', 'in', 'the', 'eighty', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'be', 'guilty', 'or', 'innocent'], ['moonwalker', 'be', 'part', 'biography', 'part', 'feature', 'film', 'which', 'i', 'remember', 'go', 'to', 'see', 'at', 'the', 'cinema', 'when', 'it', 'be', 'originally', 'release']]


In [14]:
W2V_NUM_FEATURES = 512   # Word vector dimensionality
W2V_MIN_WORD_COUNT = 60   # Minimum word count
W2V_NUM_WORKERS = 4      # Number of threads to run in parallel
W2V_CONTEXT = 10          # Context window size
W2V_DOWNSAMPLING = 1e-3   # Downsample setting for frequent words

In [15]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

model = word2vec.Word2Vec(sentences, workers=W2V_NUM_WORKERS, \
        vector_size=W2V_NUM_FEATURES, min_count = W2V_MIN_WORD_COUNT, \
        window = W2V_CONTEXT, sg = 1, sample = W2V_DOWNSAMPLING)

model.init_sims(replace=True)
model.save('model202203202347')
clear_output()

In [16]:
model = gensim.models.Word2Vec.load('./model202203202347')
clear_output()
# model.train(more_sentences)

In [17]:
print(f'{model.wv.doesnt_match("good great awesome bad".split())}\n') 

print(f'{model.wv.most_similar("great")}\n')
print(f'model.wv.vectors.shape = {model.wv.vectors.shape}\n')
print(f'{len(model.wv.index_to_key)}')

bad

[('wonderful', 0.5646790862083435), ('fantastic', 0.54557865858078), ('terrific', 0.5346675515174866), ('excellent', 0.521806538105011), ('fine', 0.4908643662929535), ('superb', 0.4869849681854248), ('brilliant', 0.480425626039505), ('good', 0.46944496035575867), ('fabulous', 0.46233803033828735), ('outstanding', 0.45519980788230896)]

model.wv.vectors.shape = (12625, 512)

12625


In [18]:
def wordlist_to_vector(words, model):

    wordVecList = []
    wordSet = set(model.wv.index_to_key)

    for word in words:
        if word in wordSet:
            wordVecList.append(model.wv[word])

    if len(wordVecList) > 0:
        return np.mean(wordVecList, axis=0)
    else:
        raise Exception('len(wordVecList) = 0')
        return np.zeros((W2V_NUM_FEATURES,), dtype='float32')

In [19]:
def reviews_to_vectors(reviews, model):
    cleanWordLists = []
    for review in reviews:
        cleanWordLists.append(review_to_wordlist(review, remove_stopwords=True, lemmalization=True))
    
    vectors = []
    vectorCount = 0
    for cleanWordList in cleanWordLists:
        vectors.append(wordlist_to_vector(cleanWordList, model))

    return vectors

In [20]:
vec = wordlist_to_vector(sentences[0], model)

train_df = labeled_df.drop(labels=['id'], axis=1)
print(train_df.columns)
labeled_x = train_df.drop(labels=['sentiment'], axis=1)
labeled_y = train_df['sentiment']
labeled_y.to_numpy()

vectors = reviews_to_vectors(labeled_x['review'], model)
vectors = np.stack(vectors, axis=0)

Index(['sentiment', 'review'], dtype='object')


In [21]:
train_x, valid_x, train_y, valid_y = train_test_split(vectors, labeled_y.to_numpy(), test_size=0.3, random_state=12345)

all_x = vectors
all_y = labeled_y.to_numpy()
print()

randomForest = RandomForestClassifier(n_estimators=100)
xgboostModel = XGBClassifier(n_estimators=100, learning_rate=0.3, use_label_encoder=False, eval_metric='logloss')

print(f'Random Forest K-fold: {cross_val_score(randomForest, all_x, all_y, cv=10, scoring="accuracy").mean()}')
print(f'XGBoost K-fold: {cross_val_score(xgboostModel, all_x, all_y, cv=10, scoring="accuracy").mean()}')


Random Forest K-fold: 0.83736
XGBoost K-fold: 0.8620000000000001


In [22]:
randomForest.fit(all_x, all_y)
xgboostModel.fit(all_x, all_y)

classifierModel = xgboostModel

In [23]:
test_df = pd.read_csv('./testData.tsv', sep='\t', header=0)
test_x = reviews_to_vectors(test_df['review'], model)
test_x = np.stack(test_x, axis=0)

sample_df = pd.read_csv('./sampleSubmission.csv')
sample_df['sentiment'] = np.squeeze(randomForest.predict(test_x))
sample_df.to_csv('./sampleSubmission.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)