In [None]:
import sys
sys.path.append("../src/")

import pandas as pd
import numpy as np
from torchvision import transforms
from textDataset import *

from models import BoWClassifier, Tfid

import torch
import torch.nn as nn
import torch.optim as optim

from trainner import TrainnerNLP

from gensim.models import word2vec
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import time
from scipy.spatial.distance import cdist

import multiprocessing

# We import seaborn to make nice plots.
import seaborn as sns
# Seaborn appearance used by the figures below: dark-grid style, the
# 'muted' palette, and the "notebook" context with fonts scaled by 1.5
# and a line width of 2.5.
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context(
    "notebook",
    font_scale=1.5,
    rc={"lines.linewidth": 2.5},
)

%matplotlib inline  

In [None]:
# Thread budget: twice the CPU count reported by `multiprocessing`, minus one.
# Note that the value printed below is this budget, not the raw CPU count.
cpu_count = multiprocessing.cpu_count() * 2 - 1
print('Number of CPUs: {}'.format(cpu_count))

In [None]:
# Root folder of the sentiment-analysis data, relative to this notebook.
path2data = '../data/sentiment_analysis/'

# Datasets keyed by split name; only the training split is loaded here.
text = dict(
    train=TextDataset(
        path2data + 'train/',
        extension='.tsv',
        sep='\t',
        is_train=True,
    ),
)

In [None]:
# Name of the raw-text column to preprocess.
# Alternatives previously listed here: 'headline_text', 'comment_text'.
col = 'Phrase'
text['train'].process_data(col=col, remove_stopw=True, remove_tags=True)

In [None]:
# Preview the first rows of the training data after `process_data` has run.
text['train'].data.head()

In [None]:
# Per-split text values read from the '<col>_data' column
# (presumably written by `process_data` -- confirm in textDataset).
sentences = { 'train': text['train'].data[col + '_data'].values}

In [None]:
# Model inputs: the processed training sentences.
X = sentences['train']

# Label column(s). A multi-label alternative previously listed here:
# ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
classes = ['Sentiment']

# Each row of the selected label columns becomes a small array; concatenating
# them yields a single 1-D label vector.
y = np.concatenate(
    [np.array(row) for row in text['train'].data[classes].values]
)

In [None]:
# Number of samples per label value. The top-level `pd.value_counts` helper is
# deprecated in recent pandas releases, so count through a Series instead.
# sort=False means the result is NOT ordered by frequency.
class_count = pd.Series(y).value_counts(sort=False)

In [None]:
# Bar chart of how many training phrases carry each sentiment label.
#
# NOTE(review): the id -> name mapping assumes the usual 0-4 scale of the
# movie-review sentiment data (0 = negative ... 4 = positive) -- confirm
# against the dataset description.
sentiment_names = {
    0: 'negative',
    1: 'somewhat negative',
    2: 'neutral',
    3: 'somewhat positive',
    4: 'positive',
}

# `class_count` was built with sort=False, so its row order is not guaranteed
# to follow the label ids. Sort by label and derive the tick text from the
# index so that every bar is paired with its own class name.
class_count_sorted = class_count.sort_index()
Index = np.arange(1, len(class_count_sorted) + 1)
classes = [sentiment_names.get(label, str(label))
           for label in class_count_sorted.index]

plt.figure(figsize=(15, 5))
# Draw the bars once, in the notebook's default palette.
plt.bar(Index, class_count_sorted.values)
plt.xticks(Index, classes, rotation=45)
plt.ylabel('Number of phrases')
plt.xlabel('Sentiment')
plt.title('Count of Moods')
# Write the exact count on top of each bar.
for a, b in zip(Index, class_count_sorted.values):
    plt.text(a, b, str(b), color='green', fontweight='bold')

### Split data into training and validation sets

In [None]:
# Fraction of the samples held out for validation; the fixed random_state
# makes the split repeatable.
test_size = 0.2
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
)

In [None]:
# Word2Vec hyper-parameters
num_features = 300       # word vector dimensionality
min_word_count = 10      # minimum word count
num_workers = cpu_count  # number of threads to run in parallel
context = 10             # context window size
downsampling = 1e-3      # downsample setting for frequent words

# Train the embedding on the training split only.
# NOTE(review): `size=` and `init_sims` look like the gensim 3.x API; newer
# gensim releases renamed/deprecated them -- confirm the pinned version.
model = word2vec.Word2Vec(
    X_train,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
model.init_sims(replace=True)

In [None]:
#https://ahmedbesbes.com/sentiment-analysis-on-twitter-using-word2vec-and-keras.html

In [None]:
def makeFeatureVec(words, model, num_features):
    """Average the embeddings of the in-vocabulary words of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.
    model : object exposing ``wv``
        Trained embedding model. ``model.wv`` must support ``word in model.wv``
        and ``model.wv[word]`` (as gensim keyed vectors do).
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(num_features,)`` holding the mean of the vectors of
        the known words (float32 when the word vectors are float32). All zeros
        when ``words`` is empty or none of its words is in the vocabulary.
    """
    keyed_vectors = model.wv
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        # Membership is tested directly on the keyed vectors, so the whole
        # vocabulary does not have to be copied into a set on every call.
        if word in keyed_vectors:
            nwords += 1
            # Look the vector up through `model.wv`, consistent with the
            # membership test, rather than indexing the model itself.
            featureVec = np.add(featureVec, keyed_vectors[word])
    # Avoid dividing by zero when no word was found; the zero vector is kept.
    if nwords == 0:
        nwords = 1
    return np.divide(featureVec, nwords)

In [None]:
def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors, one row per review.

    Parameters
    ----------
    reviews : sequence
        Reviews, each one a list of words. Must support ``len``.
    model : object
        Embedding model forwarded unchanged to ``makeFeatureVec``.
    num_features : int
        Dimensionality of the word vectors (number of columns).

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(reviews), num_features)``.
    """
    # Preallocate the result so that rows are filled in place.
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    for row, review in enumerate(reviews):
        reviewFeatureVecs[row] = makeFeatureVec(review, model, num_features)
    return reviewFeatureVecs

In [None]:
# Averaged word-vector features for the training and validation splits,
# computed in that order with the same embedding model.
f_matrix_train, f_matrix_valid = (
    getAvgFeatureVecs(split, model, num_features)
    for split in (X_train, X_valid)
)

## Words visualization using PCA

In [None]:
def plot_pca(f_matrix_train, y_train, nb_clusters = 5):
    """Scatter-plot the first two principal components, coloured by label.

    Parameters
    ----------
    f_matrix_train : array-like of shape (n_samples, n_features)
        Feature matrix to project.
    y_train : array-like of shape (n_samples,)
        Integer-valued labels in ``range(nb_clusters)``; used to pick the
        colour of each point.
    nb_clusters : int, default 5
        Number of distinct colours in the palette.
    """
    palette = np.array(sns.color_palette("hls", nb_clusters))
    # The `np.int` alias no longer exists in current NumPy; the builtin `int`
    # is its replacement. `np.asarray` also lets callers pass a list or a
    # pandas Series instead of an ndarray.
    label_ids = np.asarray(y_train).astype(int)

    result = PCA(n_components=2).fit_transform(f_matrix_train)

    plt.figure(figsize=(4, 4))
    plt.scatter(result[:, 0], result[:, 1], lw=0, s=40,
                c=palette[label_ids])

    plt.title('Visualization PCA')
    plt.axis('off')
    plt.axis('tight')
    plt.show()

In [None]:
# 2-D PCA projection of the averaged training vectors, coloured by label.
plot_pca(f_matrix_train, y_train)

## Words visualization using t-SNE

In [None]:
def plot_tSNE(f_matrix_train, y_train, nb_clusters=5):
    """Scatter-plot a 2-D t-SNE embedding of the features, coloured by label.

    Reference:
    https://www.oreilly.com/learning/an-illustrated-introduction-to-the-t-sne-algorithm

    Parameters
    ----------
    f_matrix_train : array-like of shape (n_samples, n_features)
        Feature matrix to embed.
    y_train : array-like of shape (n_samples,)
        Integer-valued labels in ``range(nb_clusters)``; used to pick the
        colour of each point.
    nb_clusters : int, default 5
        Number of distinct colours in the palette.
    """
    palette = np.array(sns.color_palette("hls", nb_clusters))
    # The `np.int` alias no longer exists in current NumPy; use the builtin.
    label_ids = np.asarray(y_train).astype(int)

    # The feature matrix is passed as a whole; copying it row by row into a
    # list first is unnecessary. The fixed random_state keeps the embedding
    # repeatable.
    new_values = TSNE(random_state=23).fit_transform(np.asarray(f_matrix_train))

    plt.figure(figsize=(8, 8))
    plt.scatter(new_values[:, 0], new_values[:, 1], lw=0, s=40,
                c=palette[label_ids])

    # Titled like the PCA figure so both plots can be told apart when skimmed.
    plt.title('Visualization t-SNE')
    plt.axis('off')
    plt.axis('tight')
    plt.show()

In [None]:
# 2-D t-SNE embedding of the averaged training vectors, coloured by label.
plot_tSNE(f_matrix_train, y_train)

# Unsupervised Learning approach

### Determining number of clusters - Elbow Rule
###### As K increases, each sample lies closer to its nearest centroid, so the distortion keeps decreasing; the "elbow" is the K after which the improvement levels off.

In [None]:
# Elbow method: for each candidate k, record the mean Euclidean distance
# between every sample and its nearest centroid (the "distortion").
distortions = []
max_nb_cluster = 10
# Note: range() excludes its upper bound, so k runs from 1 to max_nb_cluster - 1.
for k in range(1, max_nb_cluster):
    print('Training K-means models for {} cluster/s...'.format(k))
    # Fit exactly once per k; the fixed seed keeps the curve reproducible.
    kmeanModel = KMeans(n_clusters=k, random_state=42).fit(f_matrix_train)
    nearest_centroid_dist = np.min(
        cdist(f_matrix_train, kmeanModel.cluster_centers_, 'euclidean'),
        axis=1,
    )
    distortions.append(nearest_centroid_dist.mean())

In [None]:
# Plot the elbow curve: distortion against the number of clusters k.
# The x values use the same range as the loop that filled `distortions`.
plt.plot(range(1,max_nb_cluster), distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('Elbow Method')
plt.show()

In [None]:
num_clusters = 5

# Fit K-means on the averaged training vectors and keep the cluster id of
# every training sample in `idx`; the wall-clock time of the fit is measured.
start = time.time()
kmeans_clustering = KMeans(n_clusters=num_clusters)
idx = kmeans_clustering.fit_predict(f_matrix_train)
end = time.time()

# Report how long the clustering took.
elapsed = end - start
print ("Time taken for K Means clustering: ", elapsed, "seconds.")

In [None]:
# Create a word -> cluster dictionary for the whole vocabulary.
#
# `idx` holds one cluster id per *training sample*, so it cannot be zipped with
# the vocabulary. Instead, each word vector is assigned to the nearest centroid
# of the fitted K-means model (the word vectors and the averaged sample vectors
# have the same dimensionality).
vocab_words = list(model.wv.index2word)
word_clusters = kmeans_clustering.predict(model.wv[vocab_words])
word_centroid_map = dict(zip(vocab_words, word_clusters))

In [None]:
# Print, for every cluster id, the vocabulary words mapped to that cluster.
for cluster in range(num_clusters):
    print ("\nCluster %d" % cluster)
    words = [key for key, value in word_centroid_map.items() if value == cluster]
    print (words)

In [None]:
# k-NN classifier on the averaged word vectors, scored on the validation split.
# NOTE(review): n_neighbors reuses `num_clusters`; the neighbour count and the
# number of K-means clusters are unrelated quantities -- confirm this is intended.
knn = neighbors.KNeighborsClassifier(n_neighbors=num_clusters)
print('Trainning K-Nearest Neighbors...')

valid_score = knn.fit(f_matrix_train, y_train).score(f_matrix_valid, y_valid)

print('K-Nearest Neighbors - Score: {}'.format(valid_score))

# Supervised Learning approach