In [4]:
# import rpy2.robjects as robjects
# from rpy2.robjects.packages import importr
# from rpy2.robjects import pandas2ri
# pandas2ri.activate()

import gensim
import nltk
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Preprocessing

In [46]:
## read previously generated data with labels

# df = pd.read_csv("~/imdb/data/df_tagged.csv") 
df = pd.read_csv("/home/ruser/imdb/data/df_tagged_real_train.csv")
df['idx'] = range(0, df.shape[0])

In [48]:
df.shape

(11354, 8)

In [47]:
nltk.download('punkt')
nltk.download('wordnet')

# Transform to lower case
corpus = df["text"].str.lower()

## Removing punctuation

tokenizer = RegexpTokenizer(r'\w+')
sentences = [tokenizer.tokenize(doc) for doc in corpus]

# Stemming and Lemmatisation

## stemming
porter_stemmer = PorterStemmer()

## lemmatisation
wordnet_lemmatizer = WordNetLemmatizer()


sentences = [[wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) for token in sentence] for sentence in sentences]

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Tags

In [None]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
docs = []
docs = [gensim.models.doc2vec.TaggedDocument(words = sentences[texts], tags=[df['rating'].iloc[idx]]) 
        for texts, idx in enumerate(df['idx'].values)]

In [None]:
docs[0]

In [None]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = []
docs = [gensim.models.doc2vec.TaggedDocument(words = sentences[text], tags=[df['rating'].iloc[idx]]) 
        for idx, text in enumerate(df['idx'].values)]

# Train

In [13]:
len(docs)
# docs[1][0]

NameError: name 'docs' is not defined

In [None]:
import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO 

In [None]:


vector_size = 150
window_size = 10
min_count = 5
sampling_threshold = 1e-5 # high-frequency words cutting
negative_size = 10
train_epoch = 1
dm = 1 # 0 = dbow; 1 = dmpv
dbow_words = 1 # train or not word embeeddings
worker_count = 4 # number of parallel processes, no reproducibility 

model = gensim.models.Doc2Vec(size = vector_size, window = window_size, min_count = min_count, 
                              sample = sampling_threshold, workers = worker_count, 
                              dm = dm, dbow_words = 1,
                              negative = negative_size, 
                              dm_concat = 1, 
                              iter = train_epoch, 
                              alpha = 0.025, min_alpha = 0.0025)

model.build_vocab(docs)
model.train(docs, 
            total_examples = len(docs), 
            epochs = model.iter)



# model.train(it, 
#             total_examples = len(sentences), 
#             epochs = model.iter)


## Trained model

In [5]:
# model = gensim.models.Doc2Vec.load("~/imdb/models/doc2vec_tagged.bin")
model = gensim.models.Doc2Vec.load("/home/ubuntu/imdb/models/doc2vec_real")

In [6]:
# to get most similar document with similarity scores using document-index
similar_doc = model.docvecs.most_similar(2) 
print(similar_doc)
# to get most similar document with similarity scores using document- name

[(8.6, 0.991557240486145), (9.3, 0.9906759262084961), (7.6, 0.9900305271148682), (8.5, 0.9891781806945801), (8.0, 0.9891082644462585), (8.7, 0.9886906743049622), (8.9, 0.9880008697509766), (8.3, 0.9879062175750732), (8.1, 0.9765941500663757), (8.2, 0.8884684443473816)]


In [45]:
# model.docvecs.indexed_doctags('9.3')
model.docvecs

In [None]:
# sims = model.docvecs.most_similar('_horror_3')
# print(sims)

# Predicting Rating
## Test

In [55]:
model.docvecs.count

12

In [51]:
len(model.wv.syn0)

14389

In [57]:
model.wv.syn0.shape

(14389, 150)

In [58]:
X = model.docvecs
# X = model.wv.syn0

In [12]:
# model.wv.syn0.shape
# model.wv.index2word
# model.docvecs.doctag_syn0.shape
# model.docvecs.offset2doctag
# model.docvecs.max_rawint
# model.docvecs.doctags

(14389, 150)

In [60]:
from sklearn import linear_model
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# y = np.asarray([model.docvecs.offset2doctag[i].split('_')[2] for i in range(len(X))])
# y = np.asarray([model.docvecs.offset2doctag[i] for i in range(len(X))])
y = np.asarray([model.docvecs.offset2doctag[i] for i in range(len(X))])
y = y.astype(int)

lin = linear_model.LinearRegression(n_jobs = -1)
lin.fit(X, y)

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y, lin.predict(X)))

## tags were generated artificially without counting for real movie rating (not yeat combined with scrapped data)

Variance score: 1.00


In [68]:
# print("Mean squared error: %.2f"
#       % mean_squared_error(y, lin.predict(X)))

lin.predict(X)[0:12]

array([7.999999, 7.000001, 7.999999, 7.999999, 7.999999, 8.      ,
       7.999999, 7.999999, 7.      , 8.      , 8.      , 8.999999],
      dtype=float32)

## Test

In [None]:
# a = "Amazing!!!"
# infered = model.infer_vector(a).reshape(1,-1)
# preds = lin.predict(infered)

In [None]:
df_test = pd.read_csv("/home/ruser/imdb/data/df_tagged_real_test.csv")
df_test['idx'] = range(0, df_test.shape[0])

corpus = df_test["text"].str.lower()
tokenizer = RegexpTokenizer(r'\w+')
sentences = [tokenizer.tokenize(doc) for doc in corpus]
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
sentences = [[wordnet_lemmatizer.lemmatize(porter_stemmer.stem(token)) for token in sentence] for sentence in sentences]


docs_test = []
docs_test = [gensim.models.doc2vec.TaggedDocument(words = sentences[text], tags=[df_test['rating'].iloc[idx]]) 
        for idx, text in enumerate(df_test['idx'].values)]

In [None]:
preds = pd.DataFrame(columns = ['infered', 'predicted', 'rating'])

for i in range(0, len(docs_test)):
# i = 0

    d = {'infered': model.infer_vector(docs_test[i][0]).reshape(1,-1).tolist(), 
         'predicted': lin.predict(model.infer_vector(docs_test[i][0]).reshape(1,-1)).tolist(), 
         'rating': docs_test[i][1]}
    # pd.DataFrame(data = d)

    preds = pd.concat([preds, pd.DataFrame(data = d)], ignore_index = True)

In [None]:
pred.head()

In [None]:
preds.to_csv('/home/ruser/imdb/data/test_scrapped.csv')