# Setup

In [1]:
# classifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# other
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import numpy as np
import pickle
import random
import os
import datetime
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

# add path
# The project-local helper imported below presumably lives in
# <project>/code/COMMON (TODO confirm), so that directory must be on sys.path
# *before* the import statement runs; otherwise the import fails on a fresh
# kernel unless the module happens to be in the working directory.
# NOTE(review): absolute, machine-specific path -- adapt it to your checkout.
import sys
my_path = r'D:\Documents\etudes\epfl\MA1\cours\MachineLearning\Project2'
sys.path.insert(0,my_path + r'/code/COMMON')

# submission
from create_csv_submission import create_csv_submission
import time
import datetime



# Prepare input data

In [2]:
# change path to read the short tweet collection
# NOTE(review): absolute, machine-specific path. The relative file names used
# for the tweet files further down are resolved against this directory, so
# this cell has to run before they are opened.
os.chdir(r'D:/Documents/etudes/epfl/MA1/cours/MachineLearning/Project2/data/twitter_datasets_epfl/short/')

In [3]:
# build new class for handling tweet sentences
class LabeledLineSentence(object):
    """Turn the lines of several text files into tagged sentences.

    Every line of every source file becomes one ``LabeledSentence`` whose
    words are the whitespace-separated tokens of the line and whose single
    tag is ``'<prefix>_<line number>'`` (line numbers start at 0 per file).

    Parameters
    ----------
    sources : dict
        Maps a file path to the tag prefix used for the lines of that file.

    Raises
    ------
    Exception
        If the same prefix is assigned to more than one file, because the
        generated tags would then collide.
    """

    def __init__(self, sources):
        self.sources = sources
        # Filled by to_array(); None means "not loaded yet".
        self.sentences = None

        # make sure that prefixes (the dict values) are unique
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise Exception('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    @staticmethod
    def _read_source(source, prefix):
        """Yield one tagged sentence per line of the file ``source``."""
        with open(source, 'r', encoding="ISO-8859-1") as fin:
            for item_no, line in enumerate(fin):
                yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def __iter__(self):
        """Stream the tagged sentences of all sources without storing them."""
        for source, prefix in self.sources.items():
            yield from self._read_source(source, prefix)

    def to_array(self):
        """Load all tagged sentences into ``self.sentences`` and return the list.

        Prints each source path as it is read. Calling it again re-reads the
        files and replaces the previously stored list.
        """
        self.sentences = []
        for source, prefix in self.sources.items():
            print(source)
            self.sentences.extend(self._read_source(source, prefix))
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the loaded sentences.

        The stored list itself is left in its original order. If the
        sentences have not been loaded yet they are loaded first via
        ``to_array()`` instead of failing with ``AttributeError``.
        """
        # getattr also covers instances created without running __init__.
        if getattr(self, 'sentences', None) is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [4]:
# map every processed tweet file to the tag prefix given to its lines
sources = {
    'train_neg_processed.txt': 'TRAIN_NEG',
    'train_pos_processed.txt': 'TRAIN_POS',
    'test_data_no_id_processed.txt': 'TEST',
}

# wrap the files so that their lines can be read as labeled sentences
sentences = LabeledLineSentence(sources)


# Build a Doc2Vec model
Building the vocabulary table: Doc2Vec needs a vocabulary table built from the corpus before it can be trained. Model hyper-parameters:
- `min_count`: ignore all words with total frequency lower than this.
- `window`: the maximum distance between the current and predicted word within a sentence. Word2Vec uses a skip-gram model, and this is simply the window size of that model.
- `size`: dimensionality of the output feature vectors.
- `sample`: threshold for configuring which higher-frequency words are randomly downsampled.
- `negative`: number of "noise words" drawn for negative sampling (negative sampling is used when this is greater than 0).
- `workers`: number of worker threads used to train the model.

In [5]:
# hyper-parameters of the Doc2Vec model, collected in one place
doc2vec_params = dict(
    min_count=1,
    window=10,
    size=100,
    sample=1e-4,
    negative=5,
    workers=7,
)
model = Doc2Vec(**doc2vec_params)

# load every labeled sentence into memory (to_array prints each file name)
# and build the vocabulary from them
labeled_sentences = sentences.to_array()
model.build_vocab(labeled_sentences)

train_neg_processed.txt
train_pos_processed.txt
test_data_no_id_processed.txt


# Training Doc2Vec model 
Build word and tweet vector representations

In [6]:
# Train for 20 epochs on a shuffled copy of the sentences loaded by
# to_array(). sentences_perm() is called once, so the same shuffled order is
# reused for every epoch. The call is the last expression of the cell, so the
# notebook displays its return value.
model.train(sentences.sentences_perm(), total_examples=model.corpus_count, epochs=20)

32071824

# Inspecting the Model

In [7]:
# sanity check of the word vectors: list the words the model considers most
# similar to 'good', shown as (word, score) pairs
model.most_similar('good')

[('great', 0.768185019493103),
 ('nice', 0.7028723955154419),
 ('bad', 0.6618689298629761),
 ('<user>', 0.661242663860321),
 ('well', 0.6532421708106995),
 ('that', 0.6383326649665833),
 ('like', 0.635858952999115),
 ('!', 0.6327580213546753),
 ('goood', 0.6291547417640686),
 ('today', 0.6217421293258667)]

In [8]:
# look up the vector stored for the tweet tagged 'TRAIN_POS_1' and show its
# shape, to check it against the size=100 passed to Doc2Vec
model.docvecs['TRAIN_POS_1'].shape

(100,)

# Saving and Loading Models

In [None]:
# adapt path
# NOTE(review): absolute, machine-specific path. Relative paths used after
# this cell (e.g. './imdb.d2v') are resolved against this directory only if
# this cell has actually been run.
os.chdir(r'D:\Documents\etudes\epfl\MA1\cours\MachineLearning\Project2\result\doc2vec_short')

In [9]:
# save
# Writes the trained model to the current working directory.
# NOTE(review): the file is called 'imdb.d2v' although the model is built
# from the tweet files -- presumably a leftover name; confirm before renaming.
model.save('./imdb.d2v')

In [10]:
# load
# Replaces `model` with the model stored in './imdb.d2v' (relative to the
# current working directory), so later cells can run without retraining.
model = Doc2Vec.load('./imdb.d2v')

# Training tweet vectors

In [15]:
# Training matrix for 100,000 positive and 100,000 negative tweets:
# rows [0, N_tweet_train) hold the positive tweets (label 1) and
# rows [N_tweet_train, 2 * N_tweet_train) hold the negative tweets (label -1).
N_tweet_train = 100000
size_embedding = 100

# Both classes are stored in the same arrays, so they need 2 * N_tweet_train
# rows. With only N_tweet_train rows the writes at index N_tweet_train + i
# are out of bounds and raise IndexError.
train_arrays = np.zeros((2 * N_tweet_train, size_embedding))
train_labels = np.zeros(2 * N_tweet_train)

for i in range(N_tweet_train):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i] = model.docvecs[prefix_train_pos]
    train_arrays[N_tweet_train + i] = model.docvecs[prefix_train_neg]
    train_labels[i] = 1
    train_labels[N_tweet_train + i] = -1

# Hyperparameter optimization for the SVM

## Prepare data

In [21]:
# shuffle of training matrix
# A dedicated, seeded generator makes the shuffled order reproducible on a
# re-run, in line with the fixed random_state used for the train/test split.
rng = np.random.RandomState(0)
ind = np.arange(train_arrays.shape[0])
ind_shuf = rng.permutation(ind)
X_pos_neg = train_arrays[ind_shuf, :]
y_pos_neg = train_labels[ind_shuf]

# number of training samples to keep; None keeps all of them.
# (-1 does not mean "all" here: the slice [:-1] silently drops the last
# sample.) Set a positive integer to work on a smaller subset.
N_samples_train = None

# cut samples and targets
X_cut = X_pos_neg[:N_samples_train,:]
y_cut = y_pos_neg[:N_samples_train]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

## Grid search and cross validation 

In [None]:
# data
X = X_cut
y = y_cut

# Hold out 25% of the samples as evaluation set (X_test, y_test); the other
# 75% (X_train, y_train) is the development set used by the grid search.
# random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# range for hyperparameters: 10 log-spaced values each,
# C from 1e-1 to 1e3 and gamma from 1e-6 to 1e2
C_range = np.logspace(-1, 3, 10)
gamma_range = np.logspace(-6, 2, 10)

# Set the parameters by cross-validation (10 x 10 = 100 combinations)
tuned_parameters = [{'gamma': gamma_range, 'C': C_range}]
                    
# define grid search CV: RBF-kernel SVM, 5-fold cross-validation, accuracy
# as selection criterion. The timer starts here, so the measured time also
# includes the construction of the search object.
start_time = datetime.datetime.now()
clf = GridSearchCV(SVC(kernel='rbf'), tuned_parameters, cv=5, scoring= 'accuracy')

# fit for every parameters combinations in grid search CV
clf.fit(X_train, y_train)

# get time
# NOTE(review): `exection_time` is a misspelling of "execution_time"; the
# name is left unchanged in case other cells refer to it.
end_time = datetime.datetime.now()
exection_time = (end_time - start_time).total_seconds()
print("Grid Search: execution time={t:.3f} seconds".format(t=exection_time))

## Display results

In [30]:
# results of the cross-validated grid search
cv_results = clf.cv_results_

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()

# one line per parameter combination: mean score over the folds and
# twice its standard deviation
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for idx, params in enumerate(cv_results['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (means[idx], stds[idx] * 2, params))

print()
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()

# evaluate the search object on the held-out samples
y_true = y_test
y_pred = clf.predict(X_test)
print(classification_report(y_true, y_pred))
print()

In [None]:
class MidpointNormalize(Normalize):
    """Piecewise-linear normalization anchored at a midpoint.

    Values are mapped so that ``vmin`` -> 0, ``midpoint`` -> 0.5 and
    ``vmax`` -> 1, with linear interpolation on each side of the midpoint.

    Parameters
    ----------
    vmin, vmax : float, optional
        Lower and upper anchors. If left as None they are taken from the
        data the first time the instance is called.
    midpoint : float, optional
        Value mapped to 0.5. If None, the centre of ``[vmin, vmax]`` is used.
    clip : bool, optional
        Forwarded to ``Normalize.__init__``.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        """Return ``value`` mapped by the piecewise-linear normalization.

        ``clip`` is accepted for signature compatibility with ``Normalize``
        but is not used by this implementation.
        """
        # np.interp cannot work with None anchors, so fill missing limits
        # from the data first (no-op when both are already set).
        if self.vmin is None or self.vmax is None:
            self.autoscale_None(value)

        # Resolved per call because vmin/vmax may only be known now;
        # self.midpoint itself is left untouched.
        midpoint = self.midpoint
        if midpoint is None:
            midpoint = 0.5 * (self.vmin + self.vmax)

        x, y = [self.vmin, midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))

# Arrange the mean cross-validation scores as a 2-D grid.
# NOTE(review): assumes cv_results_ lists the combinations with C varying
# slowest and gamma fastest, so rows follow C_range and columns gamma_range
# -- TODO confirm.
mean_scores = clf.cv_results_['mean_test_score']
scores = mean_scores.reshape(len(C_range), len(gamma_range))

# heat map of the scores, drawn through the explicit figure/axes interface
fig = plt.figure(figsize=(12,10))
fig.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
ax = fig.gca()

color_norm = MidpointNormalize(vmin=0.2, midpoint=0.92)
heatmap = ax.imshow(scores, interpolation='nearest', cmap=plt.cm.hot, norm=color_norm)
fig.colorbar(heatmap, ax=ax)

ax.set_xlabel('gamma', fontsize=20)
ax.set_ylabel('C', fontsize=20)

# one tick per grid value, labelled with the hyper-parameter value itself
ax.set_xticks(np.arange(len(gamma_range)))
ax.set_xticklabels(gamma_range, rotation=45)
ax.set_yticks(np.arange(len(C_range)))
ax.set_yticklabels(C_range)

ax.set_title('Validation accuracy', fontsize=23)

# save to the current working directory, then display
fig.savefig("Boh")
plt.show()