In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# imports
import numpy as np 
import matplotlib.pyplot as plt
import pickle

In [2]:
import os

# Directory holding the tweet text files that the following cells open by bare
# file name. The original author's location stays the default; set the
# TWITTER_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
DATA_DIR = os.environ.get(
    'TWITTER_DATA_DIR',
    r'/home/ilaria/Scrivania/Machine_Learning/Project_2/Project2/data/twitter_datasets_epfl/unique/short',
)
os.chdir(DATA_DIR)


# Doc2Vec by itself (build its vocab with Word2Vec)

Building the Doc2Vec model following the example: https://github.com/bariscimen/doc2vec-sentiment/blob/master/run.ipynb

In [3]:
from random import shuffle

# gensim modules
import gensim
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

import multiprocessing

# Number of CPUs reported by the OS; passed to Doc2Vec as `workers` further down.
cores = multiprocessing.cpu_count()
# Stop right away if gensim reports FAST_VERSION <= -1: per the message below,
# training would be very slow in that configuration.
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"


In [4]:
class TaggedLineSentence(object):
    """Turn line-per-document text files into gensim ``TaggedDocument`` objects.

    Parameters
    ----------
    sources : dict
        Maps a file path to the tag prefix used for the lines of that file.
        Line ``n`` (0-based) of a file with prefix ``P`` is tagged ``P_n``.

    Raises
    ------
    ValueError
        If two files share the same prefix, because their tags would collide.
        (``ValueError`` is an ``Exception`` subclass, so existing
        ``except Exception`` handlers keep working.)
    """

    def __init__(self, sources):
        self.sources = sources
        # Filled by to_array(); None means "not materialised yet".
        self.sentences = None

        # The dict already guarantees unique file paths; what has to be checked
        # is that the *prefixes* (the values) are unique too.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one ``TaggedDocument`` per line, file by file, without caching."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield TaggedDocument(
                        words=utils.to_unicode(line).split(),
                        tags=[prefix + '_%s' % item_no],
                    )

    def to_array(self):
        """Read every source file and cache the documents in ``self.sentences``.

        Returns
        -------
        list
            The freshly built list (the same object as ``self.sentences``).
        """
        # Reuse __iter__ so tagging logic lives in exactly one place.
        self.sentences = list(self)
        return self.sentences

    def shuffle_sentences(self):
        """Shuffle the cached documents in place and return them.

        The cache is built on demand if ``to_array`` has not been called yet.
        ``random.shuffle`` itself returns ``None``, so the list is returned
        explicitly to give callers something usable.
        """
        if self.sentences is None:
            self.to_array()
        shuffle(self.sentences)
        return self.sentences

In [5]:
# Corpus files, each mapped to the tag prefix its lines receive.
sources = {
    'test_data_unique_no_id.txt': 'TEST',
    'train_neg_unique.txt': 'TRAIN_NEG',
    'train_pos_unique.txt': 'TRAIN_POS',
    'all_short.txt': 'TRAIN_UNS',
}

sentences = TaggedLineSentence(sources)

In [6]:
# Doc2Vec hyperparameters
vector_size = 25            # dimensionality of each document vector
window_size = 10            # context window handed to Doc2Vec as `window`
min_count = 5
sampling_threshold = 1e-5
negative_size = 5
dm = 0                      # 0 = dbow; 1 = dmpv

# `window_size` used to be defined but never passed on, so the model silently
# trained with gensim's default window; it is now forwarded explicitly.
# The epoch count is not set here: the training cell reads it from model.iter.
model = Doc2Vec(
    size=vector_size,
    window=window_size,
    min_count=min_count,
    sample=sampling_threshold,
    workers=cores,
    hs=0,
    dm=dm,
    negative=negative_size,
    dbow_words=1,
    dm_concat=1,  # NOTE(review): presumably only relevant when dm=1 - confirm
)

In [7]:
# to_array() reads all source files into a list (also kept on
# sentences.sentences); Doc2Vec builds its vocabulary from that list.
model.build_vocab(sentences.to_array())

In [8]:
# Ten passes over the cached corpus, reshuffling the documents before each one.
# NOTE(review): every train() call below is asked for `model.iter` epochs, so
# the number printed counts outer passes, not individual training epochs.
# NOTE(review): presumably each train() call also restarts the learning-rate
# schedule - verify against the gensim docs before changing the loop.
for epoch in range(10):
    sentences.shuffle_sentences()  # shuffles sentences.sentences in place
    model.train(sentences.sentences, total_examples=model.corpus_count, epochs=model.iter)
    print('epoch : ', epoch+1)

epoch :  1
epoch :  2
epoch :  3
epoch :  4
epoch :  5
epoch :  6
epoch :  7
epoch :  8
epoch :  9
epoch :  10


In [46]:
fname_n = 'train_neg_unique.txt'
fname_p = 'train_pos_unique.txt'
fname = 'all_short.txt'  # not used in this cell; kept for any later cell that reads it

# Count the labelled tweets. The files are opened in binary mode inside `with`
# blocks so that (a) the handles are closed and (b) lines are split on '\n'
# only. Text mode's universal newlines would also split on a lone '\r', which
# could make the counts disagree with the tags: TaggedLineSentence decodes each
# line with utils.to_unicode, i.e. it appears to iterate the files as bytes.
with open(fname_n, 'rb') as f_neg:
    num_lines_n = sum(1 for _ in f_neg)
with open(fname_p, 'rb') as f_pos:
    num_lines_p = sum(1 for _ in f_pos)
num_lines = num_lines_n + num_lines_p

# One row per labelled tweet: positives first, negatives after them.
# The width follows the `vector_size` hyperparameter instead of a literal 25.
train_arrays = np.zeros((num_lines, vector_size))
train_labels = np.zeros(num_lines)

for i in range(num_lines_p):
    train_arrays[i] = model.docvecs['TRAIN_POS_' + str(i)]
train_labels[:num_lines_p] = 1

# Negative rows keep the label 0 that np.zeros already assigned.
for i in range(num_lines_n):
    train_arrays[num_lines_p + i] = model.docvecs['TRAIN_NEG_' + str(i)]

In [3]:
# Sanity check: report how many labelled training tweets there are.
# This cell used to re-create `train_arrays` / `train_labels` as all-zero
# arrays. Run top to bottom, that wiped the features and labels filled in by
# the previous cell right before the classifier was fitted, so the
# re-initialisation has been removed.
fname_n = 'train_neg_unique.txt'
fname_p = 'train_pos_unique.txt'
fname = 'all_short.txt'  # not used in this cell; kept for any later cell that reads it

# Binary mode + `with`: handles are closed and lines are split on '\n' only.
with open(fname_n, 'rb') as f_neg:
    num_lines_n = sum(1 for _ in f_neg)
with open(fname_p, 'rb') as f_pos:
    num_lines_p = sum(1 for _ in f_pos)
num_lines = num_lines_n + num_lines_p

print(num_lines)

181321


In [47]:
# Quick visual check of the training matrix (numpy elides the middle rows and
# columns in the output below).
train_arrays

array([[ 0.07024649,  0.04121226,  0.09486401, ..., -0.366135  ,
         0.22673237,  0.293228  ],
       [ 0.20097168,  0.09412365,  0.16070902, ..., -0.02224808,
        -0.00505873,  0.30946195],
       [-0.04262437, -0.21278168,  0.26415163, ...,  0.09859272,
        -0.15508561,  0.2746335 ],
       ..., 
       [ 0.36948472, -0.26978615,  0.15946236, ..., -0.79108208,
        -0.94520468,  0.6064015 ],
       [-0.04055488,  0.10100127,  0.04305083, ...,  0.00284125,
        -0.05670581,  0.03889457],
       [-0.11072072,  0.22389984,  0.03755175, ..., -0.14977589,
        -0.03372359, -0.07821357]])

# Build the classifier with SVM

In [48]:
from sklearn.svm import SVC

# SVC with all default settings; the repr printed below shows the values in
# effect for this run (kernel='rbf', C=1.0, gamma='auto').
clf = SVC()
# Rows of train_arrays are the tweet vectors, train_labels the 1/0 targets
# assigned when the matrix was built.
clf.fit(train_arrays, train_labels) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# Submission 

In [52]:
# Read the tweet ids: the text before the first comma on every line.
# Ids are collected in a list so the file may contain any number of lines; the
# previous fixed 10000-slot buffer raised IndexError on longer files and left
# trailing zero ids on shorter ones.
raw_ids = []
with open('test_data_unique.txt', 'r', encoding="utf-8-sig") as f:
    for line in f:
        raw_ids.append(line.lstrip().split(',')[0])

# Kept as float64, the dtype the np.zeros buffer had before.
# NOTE(review): confirm create_csv_submission casts the ids to int on output.
ids_test = np.array(raw_ids, dtype=float)
num_test = len(ids_test)

# Feature matrix for prediction; row j holds the vector tagged 'TEST_j'.
# NOTE(review): the TEST_ tags were built from 'test_data_unique_no_id.txt';
# this assumes it has the same line order as 'test_data_unique.txt' - confirm.
test_arrays = np.zeros((num_test, vector_size))
for j in range(num_test):
    test_arrays[j] = model.docvecs['TEST_' + str(j)]


In [53]:
# Classify the test vectors, then print both shapes so the row counts can be
# compared at a glance.
y_pred = clf.predict(test_arrays)
for arr in (test_arrays, y_pred):
    print(arr.shape)

(10000, 25)
(10000,)


In [57]:
# Change the working directory to the project's COMMON folder - presumably so
# that the import in the next cell can find create_csv_submission (verify).
# NOTE(review): absolute, user-specific path; relative file names used after
# this point resolve against COMMON, not the data directory.
os.chdir(r'/home/ilaria/Scrivania/Machine_Learning/Project_2/COMMON')

In [58]:
# Project helper, not part of this notebook; its behaviour is not visible here.
from create_csv_submission import create_csv_submission

# Hand the test ids, the SVM predictions and the submission name to the helper.
name = 'sub1'
create_csv_submission(ids_test, y_pred, name)