In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# imports
import numpy as np 
import matplotlib.pyplot as plt
import pickle

# Application of Doc2Vec to the EPFL Twitter dataset

In [3]:
import os

# Directory holding the pre-processed tweet files. It can be overridden with
# the TWITTER_DATA_DIR environment variable so the notebook is not tied to one
# machine; the default is the original location.
DATA_DIR = os.environ.get('TWITTER_DATA_DIR', r'/home/ilaria/Scrivania/data_ML/twitter_datasets_epfl/short')
os.chdir(DATA_DIR)


# Doc2Vec by itself (build its vocab with Word2Vec)

Building the Doc2Vec model following the example: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb

In [4]:
import gensim
import os
import collections
import smart_open
import random

import multiprocessing

# CPU count reported by the OS; used as the `workers` argument of Doc2Vec below.
cores = multiprocessing.cpu_count()
# Stop early unless gensim reports a non-negative FAST_VERSION
# (presumably meaning its optimised routines are available — confirm in the gensim docs).
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"


In [5]:
def read_corpus(fname, tokens_only=False):
    """Lazily yield one pre-processed document per line of `fname`.

    The file is opened through smart_open and decoded as ISO-8859-1. Every
    line is tokenised with `gensim.utils.simple_preprocess`.

    Parameters
    ----------
    fname : str
        Path of the text file, one document per line.
    tokens_only : bool, optional
        If true, yield the bare token list; otherwise (default) yield a
        `TaggedDocument` whose single tag is the zero-based line index.
    """
    with smart_open.smart_open(fname, encoding="iso-8859-1") as handle:
        for line_no, raw_line in enumerate(handle):
            tokens = gensim.utils.simple_preprocess(raw_line)
            if not tokens_only:
                # Training documents carry their line index as the tag
                yield gensim.models.doc2vec.TaggedDocument(tokens, [line_no])
            else:
                yield tokens

In [6]:
# Materialise the generator into a list (a generator can only be consumed once).
train_corpus = list(read_corpus('all_short_processed.txt'))

In [7]:
# Doc2Vec hyperparameters
vector_size = 25            # dimensionality of the document vectors
window_size = 10            # context window used for the word vectors
min_count = 5               # minimum word frequency kept in the vocabulary
sampling_threshold = 1e-5   # `sample` argument (down-sampling threshold)
negative_size = 5           # number of negative samples
train_epoch = 100           # number of training iterations
dm = 0 #0 = dbow; 1 = dmpv


# `window_size` was defined but never handed to the constructor, so the
# library default window was silently used; it is now passed explicitly.
model = gensim.models.doc2vec.Doc2Vec(
    size=vector_size,
    window=window_size,
    min_count=min_count,
    sample=sampling_threshold,
    workers=cores,
    hs=0,
    dm=dm,
    negative=negative_size,
    dbow_words=1,
    dm_concat=1,
    iter=train_epoch,
)

In [8]:
model.build_vocab(train_corpus)
# The model was constructed without a corpus, so build_vocab alone leaves the
# weights untrained; train explicitly before any infer_vector call.
model.train(train_corpus, total_examples=model.corpus_count, epochs=train_epoch)


One important thing to note is that you can now infer a vector for any piece of text without having to re-train the model by passing a list of words to the model.infer_vector function.

In [9]:
# Inferring the vectors of the positive tweets
fname = 'train_pos_processed.txt'
# Count the lines up front to size the feature matrix; the context manager
# makes sure the file handle is closed again.
with open(fname) as f:
    num_lines = sum(1 for _ in f)
X_pos = np.zeros((num_lines, vector_size))

# Build a list of token lists (one per tweet): infer_vector is given each
# tweet as a list of separate words.

def read_words(words_file):
    """Read a text file and split every line into whitespace-separated tokens.

    Parameters
    ----------
    words_file : str
        Path of the file to read. It is decoded as UTF-8 and a leading
        byte-order mark, if present, is dropped.

    Returns
    -------
    list of list of str
        One inner list per line, in file order; a blank line gives an
        empty list.
    """
    # Read the path that was passed in (not the notebook-global `fname`) and
    # close the handle as soon as the file has been consumed.
    with open(words_file, 'r', encoding="utf-8-sig") as f:
        return [line.split() for line in f]
    
    
# Tokenise the positive tweets, then fill the matrix one row per tweet.
content_pos = read_words(fname)

for i, words in enumerate(content_pos):
    X_pos[i, :] = model.infer_vector(words)

# Re-wrap the filled matrix in a new ndarray.
X_pos = np.array(X_pos)

In [10]:
# Inferring the vectors of the negative tweets
fname = 'train_neg_processed.txt'
# Count the lines up front to size the feature matrix; the context manager
# makes sure the file handle is closed again.
with open(fname) as f:
    num_lines = sum(1 for _ in f)
X_neg = np.zeros((num_lines, vector_size))

# Tokenise the negative tweets, then fill the matrix one row per tweet.
content_neg = read_words(fname)

for i, words in enumerate(content_neg):
    X_neg[i, :] = model.infer_vector(words)

# Re-wrap the filled matrix in a new ndarray.
X_neg = np.array(X_neg)

In [11]:
# Labels: +1 for every positive tweet, -1 for every negative one.
y_pos = np.ones(len(X_pos))
y_neg = np.full(len(X_neg), -1.0)

In [12]:
# Print the shape of the positive and of the negative feature matrix.
print(X_pos.shape)
print(X_neg.shape)

(90233, 25)
(91088, 25)


In [13]:
# Stack the positive rows on top of the negative ones; the labels are joined
# in the same order so they stay aligned with the rows.
X_pos_neg = np.concatenate((X_pos, X_neg), axis=0)
y_pos_neg = np.concatenate((y_pos, y_neg), axis=0)

In [14]:
# Print the shapes of the stacked features and labels (their first dimensions
# should agree), then display the label vector.
print(X_pos_neg.shape)
print(y_pos_neg.shape)
y_pos_neg

(181321, 25)
(181321,)


array([ 1.,  1.,  1., ..., -1., -1., -1.])

In [15]:
#ind = np.arange(X_pos_neg.shape[0])
#ind_shuf = np.random.permutation(ind)

#X_pos_neg = X_pos_neg[ind_shuf, :]
#y_pos_neg = y_pos_neg[ind_shuf]

# SVM

In [16]:
from sklearn.svm import SVC

# Fit a support-vector classifier, constructed with no arguments (library
# defaults), on the stacked feature matrix and its +1/-1 labels.
clf = SVC()
clf.fit(X_pos_neg, y_pos_neg) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# Submission

In [17]:
# Building the X matrix for the test tweets
fname = 'test_data_no_id_processed.txt'
# Count the lines up front to size the feature matrix; the context manager
# makes sure the file handle is closed again.
with open(fname) as f:
    num_lines = sum(1 for _ in f)
X_test = np.zeros((num_lines, vector_size))

content = read_words(fname)

# One inferred document vector per test tweet, written row by row.
for i, words in enumerate(content):
    X_test[i, :] = model.infer_vector(words)


In [18]:
# Predict a label for every row of the test feature matrix.
y_pred = clf.predict(X_test)


In [19]:
# Display the shape of the test feature matrix.
X_test.shape

(10000, 25)

In [20]:
# Directory that contains create_csv_submission.py and becomes the working
# directory for the rest of the notebook. It can be overridden with the
# TWITTER_COMMON_DIR environment variable; the default is the original location.
COMMON_DIR = os.environ.get('TWITTER_COMMON_DIR', r'/home/ilaria/Scrivania/Machine_Learning/Project_2/COMMON')
os.chdir(COMMON_DIR)

In [22]:
from create_csv_submission import create_csv_submission
import time
import datetime

# Timestamped submission name of the form sub_<dd>_<mm>_<yyyy>_<hour>h_<minute>min
i = datetime.datetime.now()
name = "sub_{}_{}h_{}min".format(time.strftime("%d_%m_%Y"), i.hour, i.minute)
#name = "vrf"

# Ids are 1-based, one per prediction.
ids_test = range(1, 1 + len(y_pred))
create_csv_submission(ids_test, y_pred, name)