# Setup

In [1]:
# classifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# other
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import numpy as np
import pickle
import random
import os

# submission
from create_csv_submission import create_csv_submission
import time
import datetime



# Prepare input data

In [None]:
# Work from the directory that holds the long (full) tweet collection.
# NOTE: machine-specific absolute path; adapt it before running elsewhere.
full_data_dir = r'D:/Documents/etudes/epfl/MA1/cours/MachineLearning/Project2/data/twitter_datasets_epfl/full/'
os.chdir(full_data_dir)

In [None]:
# build new class for handling tweet sentences
class LabeledLineSentence(object):
    """Turn line-oriented text files into tagged sentences for Doc2Vec.

    Every line of every source file becomes one ``LabeledSentence`` whose
    words are the whitespace-separated tokens of the line and whose single
    tag is ``'<prefix>_<line number>'`` (line numbers start at 0 per file).

    Parameters
    ----------
    sources : dict
        Maps a file path to the tag prefix used for the lines of that file.

    Raises
    ------
    Exception
        If the same prefix is assigned to more than one file, since tags
        would then collide.
    """

    def __init__(self, sources):
        self.sources = sources
        # Filled by to_array(); None means the files have not been read yet.
        self.sentences = None

        # make sure that prefixes are unique
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise Exception('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def _read_source(self, source, prefix):
        """Yield one tagged sentence per line of the file ``source``."""
        with open(source, 'r', encoding="ISO-8859-1") as fin:
            for item_no, line in enumerate(fin):
                yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def __iter__(self):
        """Stream the tagged sentences of all sources without storing them."""
        for source, prefix in self.sources.items():
            yield from self._read_source(source, prefix)

    def to_array(self):
        """Read all sources into memory and return the list of tagged sentences.

        The name of each source is printed as it is read. The result is also
        kept in ``self.sentences``; it is only replaced once every file has
        been read successfully.
        """
        loaded = []
        for source, prefix in self.sources.items():
            print(source)
            loaded.extend(self._read_source(source, prefix))
        self.sentences = loaded
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the sentences; the stored list is untouched.

        The sources are read first if ``to_array()`` has not been called yet,
        so this method no longer fails on a fresh instance.
        """
        if self.sentences is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [None]:
# Map each input file to the tag prefix its tweets will carry,
# then wrap the mapping in the sentence reader.
sources = {
    'train_neg_processed.txt': 'TRAIN_NEG',
    'train_pos_processed.txt': 'TRAIN_POS',
    'test_data_no_id_processed.txt': 'TEST',
}
sentences = LabeledLineSentence(sources)


# Build a Doc2Vec model
Building the Vocabulary Table: Doc2Vec requires us to build the vocabulary table. Model hyper-parameters:
- `min_count`: ignore all words with total frequency lower than this.
- `window`: the maximum distance between the current and predicted word within a sentence. Word2Vec uses a skip-gram model, and this is simply the window size of the skip-gram model.
- `size`: dimensionality of the feature vectors in output. 
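- `sample`: threshold for configuring which higher-frequency words are randomly downsampled.
- `negative`: if greater than 0, negative sampling is used; the value specifies how many "noise words" are drawn.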
- `workers`: use this many worker threads to train the model 

In [None]:
# Hyper-parameters of the Doc2Vec model, gathered in one place.
doc2vec_params = dict(
    min_count=1,
    window=10,
    size=100,
    sample=1e-4,
    negative=5,
    workers=7,
)
model = Doc2Vec(**doc2vec_params)

# Read every tagged tweet into memory and build the vocabulary from them.
tagged_tweets = sentences.to_array()
model.build_vocab(tagged_tweets)

# Training Doc2Vec model 
Build word and tweet vector representations

In [None]:
# A single shuffled copy of the corpus is created here and handed to train().
shuffled_tweets = sentences.sentences_perm()
model.train(shuffled_tweets, total_examples=model.corpus_count, epochs=20)

# Inspecting the Model

In [None]:
# Sanity check: query the model for the entries most similar to 'good'.
similar_to_good = model.most_similar('good')
similar_to_good

In [None]:
# Look up the vector stored under the tag 'TRAIN_POS_1' and show its shape.
tweet_vector = model.docvecs['TRAIN_POS_1']
tweet_vector.shape

# Saving and Loading Models

In [None]:
# Switch to the results directory before saving/loading the model.
# NOTE: machine-specific absolute path; adapt it before running elsewhere.
results_dir = r'D:\Documents\etudes\epfl\MA1\cours\MachineLearning\Project2\result\doc2vec_short'
os.chdir(results_dir)

In [None]:
# Persist the Doc2Vec model in the current working directory.
saved_model_path = './imdb.d2v'
model.save(saved_model_path)

In [None]:
# Restore the Doc2Vec model from the current working directory.
stored_model_path = './imdb.d2v'
model = Doc2Vec.load(stored_model_path)

# Training tweet vectors

In [None]:
# Training features and labels for 500,000 positive and 500,000 negative tweets.
# Rows [0, N) hold the positive tweets (label 1) and rows [N, 2N) the
# negative tweets (label -1), where N = N_tweet_train.
N_tweet_train = 500000   # number of tweets per class
size_embedding = 100     # length of each document vector (the model is built with size=100)

# Allocate 2 * N rows: the arrays store both classes, so N rows would make
# every write at index N + i fall outside the array.
train_arrays = np.zeros((2 * N_tweet_train, size_embedding))
train_labels = np.zeros(2 * N_tweet_train)

for i in range(N_tweet_train):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i] = model.docvecs[prefix_train_pos]
    train_arrays[N_tweet_train + i] = model.docvecs[prefix_train_neg]
    train_labels[i] = 1
    train_labels[N_tweet_train + i] = -1

# Testing tweet vectors

In [None]:
# Feature matrix for the 10,000 test tweets, one row per tweet.
N_tweet_test = 10000
test_arrays = np.zeros((N_tweet_test, size_embedding))
# Placeholder label vector; it is left at zero in this cell.
test_labels = np.zeros(N_tweet_test)

# Row k receives the document vector stored under the tag 'TEST_k'.
for row in range(N_tweet_test):
    test_arrays[row] = model.docvecs['TEST_%d' % row]

# Fit SVM classifier

In [None]:
# SVM hyperparameters
C = 1
gamma = 2

# RBF-kernel support vector classifier configured with the values above
svm_params = {'kernel': 'rbf', 'C': C, 'gamma': gamma}
classifier = SVC(**svm_params)

# Train on the tweet vectors and their +1 / -1 labels.
classifier.fit(train_arrays, train_labels)

In [None]:
# Switch to the results directory.
# NOTE: machine-specific absolute path; adapt it before running elsewhere.
os.chdir(r'D:\Documents\etudes\epfl\MA1\cours\MachineLearning\Project2\result\doc2vec_short')

# One file name shared by save and load, so that a classifier written by the
# save branch is the file the load branch reads back.
classifier_path = 'classifier.pkl'

# save the classifier (set ifSave to True to enable)
ifSave = False
if ifSave:
    with open(classifier_path, 'wb') as fid:
        pickle.dump(classifier, fid)

# load a classifier (set ifLoad to True to enable)
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
ifLoad = False
if ifLoad:
    with open(classifier_path, 'rb') as fid:
        classifier = pickle.load(fid)

# Predict test tweet labels

In [None]:
# Run the fitted classifier on the test tweet vectors to get one label per tweet.
y_pred = classifier.predict(X=test_arrays)

# Submission

In [None]:
# Switch to the directory that collects the submission files.
# NOTE: machine-specific absolute path; adapt it before running elsewhere.
submissions_dir = r'D:/Documents/etudes/epfl/MA1/cours/MachineLearning/Project2/data/submissions/'
os.chdir(submissions_dir)

In [None]:
# Output file name of the form "sub_<dd>_<mm>_<yyyy>_<hour>h_<minute>min".
# The clock is read once so the date and time parts always belong to the same
# instant (two separate reads could disagree around midnight).
now = datetime.datetime.now()
name = "sub_" + now.strftime("%d_%m_%Y") + "_%sh_%smin" % (now.hour, now.minute)

# Test tweet ids are numbered from 1 to the number of test tweets.
ids_test = range(1, test_arrays.shape[0] + 1)

# write submission file
create_csv_submission(ids_test, y_pred, name)