In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Importing libraries

In [None]:
import sys
sys.path.append("../src/")

In [None]:
import pandas as pd
import multiprocessing
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
import collections
import os

import numpy as np
from tqdm import tqdm
from gensim.models import word2vec

from textDataset import *
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import calinski_harabaz_score
from sklearn.cluster import KMeans,MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from wordcloud import WordCloud,STOPWORDS
from scipy.stats import norm

In [None]:
# Worker-thread budget handed to Word2Vec later on: twice the number of
# logical CPUs reported by the OS, minus one (not the raw CPU count).
cpu_count = 2 * multiprocessing.cpu_count() - 1
print(f'Number of CPUs: {cpu_count}')

# Text preprocessing

In [None]:
# Directory that is handed to TextDataset to locate the '.csv' input.
# (Alternative location that was used before: '../../')
path2data = '../data/news_headlines/'

# Datasets keyed by split name; only a 'train' entry is created here.
text = {
        'train': TextDataset(path2data, extension='.csv', sep=',', is_train = True),
}    

In [None]:
# Name of the column that holds the headline text.
col = 'headline_text'
# Preprocess that column with stop-word removal, lemmatization and stemming
# switched on and tag removal switched off.
# NOTE(review): later cells read `col + '_data'` and 'nb_words' from the
# frame; presumably this call creates them - confirm in textDataset.
text['train'].process_data(col = col, remove_stopw = True, remove_tags=False, lemmalize = True, stem = True)

In [None]:
# Report how many rows the loaded dataset contains.
n_rows = len(text['train'].data)
print(f'Dataset lenght: {n_rows}')

In [None]:
# Convert publish_date to pandas datetimes (via its string form) so that the
# `.dt` accessor can be used in the cells below.
text['train'].data.publish_date = pd.to_datetime(text['train'].data.publish_date.astype(str),format='%Y-%m-%d')

In [None]:
# Preview the first rows of the dataset.
text['train'].data.head()

In [None]:
# Summary statistics of the publication dates.
text['train'].data.publish_date.describe()

In [None]:
# Horizontal bar chart of the number of rows per publication year.
text['train'].data.publish_date.dt.year.value_counts().plot(kind = 'barh')

# Extracting Features

In [None]:
# Name of the feature-extraction method. It is embedded in the figure file
# names written below. extract_features treats 'bow' as bag-of-words + TF-IDF
# and any other value as Word2Vec.
features_extractor = 'word2Vec'
# Alternative: features_extractor = 'bow'

In [None]:
def buildWordVector(model, tokens, size):
    """Sum the embedding vectors of all in-vocabulary tokens.

    Args:
        model: Source of word vectors. Either an object that exposes its
            vectors through a ``wv`` attribute (like the Word2Vec model
            trained in this notebook) or any mapping indexable by word that
            raises ``KeyError`` for unknown words.
        tokens: Iterable of words.
        size: Dimensionality of the word vectors.

    Returns:
        numpy.ndarray of shape ``(1, size)`` with the element-wise sum of the
        vectors of every token found in ``model``; all zeros when no token is
        known. Note that this is a sum, not an average.
    """
    # Look words up through `wv` when it exists: indexing the gensim model
    # object directly is deprecated. Plain mappings are used as they are.
    vectors = getattr(model, 'wv', model)

    vec = np.zeros((1, size))
    for word in tokens:
        try:
            vec += vectors[word].reshape((1, size))
        except KeyError:
            # Out-of-vocabulary token: it contributes nothing to the sum.
            continue
    return vec

In [None]:
# Reference: https://www.kaggle.com/c/word2vec-nlp-tutorial#part-3-more-fun-with-word-vectors

def makeFeatureVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one document.

    Args:
        words: Iterable of tokens making up the document.
        model: Trained model exposing its vectors through ``model.wv``
            (``wv.vocab`` for membership, ``wv[word]`` for lookup).
        num_features: Dimensionality of the word vectors.

    Returns:
        1-D numpy array of length ``num_features`` holding the mean of the
        vectors of all known words, or all zeros when none of the words is
        in the vocabulary.
    """
    keyed_vectors = model.wv
    # `vocab` is keyed by word, so the membership test is a hash lookup.
    # Building `set(index2word)` here would cost O(vocabulary) on every call,
    # and this function is called once per document.
    vocabulary = keyed_vectors.vocab

    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in vocabulary:
            nwords += 1
            featureVec = np.add(featureVec, keyed_vectors[word])

    # Avoid a division by zero when no word was found; the zero vector is
    # returned unchanged in that case.
    if nwords == 0:
        nwords = 1
    return np.divide(featureVec, nwords)

In [None]:
# Reference: https://www.kaggle.com/c/word2vec-nlp-tutorial#part-3-more-fun-with-word-vectors

def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors, one row per document.

    Args:
        reviews: Sized sequence of documents, each one a list of words.
        model: Word-vector model forwarded to ``makeFeatureVec``.
        num_features: Dimensionality of the word vectors.

    Returns:
        float32 numpy array of shape ``(len(reviews), num_features)`` whose
        i-th row is ``makeFeatureVec`` applied to the i-th document.
    """
    averaged = np.zeros((len(reviews), num_features), dtype="float32")
    for row, tokens in enumerate(reviews):
        averaged[row] = makeFeatureVec(tokens, model, num_features)
    return averaged

In [None]:
def extract_features(X_train, features_extractor = 'word2Vec', max_gram = 2, average_features = True):
    """Turn tokenised documents into a numeric feature matrix.

    Args:
        X_train: Collection of documents, each one an iterable of tokens.
        features_extractor: ``'bow'`` selects n-gram counts re-weighted with
            TF-IDF; any other value trains a Word2Vec model.
        max_gram: Upper bound of the n-gram range (``'bow'`` only).
        average_features: Word2Vec only. When true each document is the
            average of its word vectors, otherwise their sum.

    Returns:
        Tuple ``(f_train, model)``. For ``'bow'`` the TF-IDF matrix and the
        fitted CountVectorizer; otherwise the standard-scaled document
        vectors and the trained Word2Vec model.
    """
    print('Features extractor: {}'.format(features_extractor))

    if features_extractor == 'bow':
        ngram_range = (1, max_gram)
        print('Counting ocorrences of words. Ngram-range: {}...'.format(str(ngram_range)))
        model = CountVectorizer(analyzer="word", max_features=300, ngram_range=ngram_range)
        documents = [" ".join(tokens) for tokens in X_train]
        counts = model.fit_transform(documents)

        # Downscale the raw counts with TF-IDF
        # ("Term Frequency times Inverse Document Frequency").
        print('Words downscaling using TF-IDF...')
        f_train = TfidfTransformer().fit_transform(counts)
        return f_train, model

    # Word2Vec hyper-parameters.
    num_features = 300       # word vector dimensionality
    min_word_count = 50      # words seen fewer times are dropped
    context = 10             # context window size
    downsampling = 1e-3      # downsampling of frequent words (default value)

    print('Creating Word2Vec Model...')
    model = word2vec.Word2Vec(workers=cpu_count,
                              size=num_features,
                              min_count=min_word_count,
                              window=context,
                              sample=downsampling)
    model.build_vocab(X_train)
    model.train(X_train, total_examples=model.corpus_count, epochs=model.epochs)

    if average_features:
        document_vectors = getAvgFeatureVecs(X_train, model, num_features)
    else:
        document_vectors = np.concatenate(
            [buildWordVector(model, tokens, num_features) for tokens in X_train])

    # Standardise every feature to zero mean and unit variance.
    f_train = StandardScaler().fit_transform(document_vectors)
    return f_train, model

### Find number of clusters

In [None]:
def find_nb_clusters(f_train, kmax = 25, plot=True, path2save = ''):
    """Score MiniBatchKMeans clusterings for a range of cluster counts.

    A model is fitted for every ``k`` in ``range(2, kmax)`` (``kmax`` itself
    is excluded) and two diagnostics are recorded per ``k``: the inertia
    (within-cluster sum of squares) and the Calinski-Harabasz score.

    Args:
        f_train: Feature matrix, one row per sample.
        kmax: Exclusive upper bound of the cluster counts to try.
        plot: When true, draw both curves side by side.
        path2save: File the figure is written to. The default ``''`` means
            "do not save".

    Returns:
        Tuple ``(wcss, cali)`` of two lists aligned with ``range(2, kmax)``.
    """
    wcss = []
    cali = []
    candidate_ks = range(2, kmax)

    print('Max number of clusters for MiniBatchKMeans: {}'.format(kmax))
    for i in tqdm(candidate_ks):
        kmeans = MiniBatchKMeans(n_clusters = i, init = 'k-means++',
                                 n_init=1,init_size= int(1e3),
                                 random_state = 42,batch_size=int(1e3))
        kmeans.fit(f_train)
        wcss.append(kmeans.inertia_)
        ypred = kmeans.predict(f_train)
        cali.append(calinski_harabaz_score(f_train,ypred))

    if plot:

        fig = plt.figure(figsize=(12, 4))
        plt.subplot(1, 2, 1)
        plt.plot(candidate_ks, cali, '-o')
        plt.title('Calinski Harabaz Score')
        plt.xlabel('Number of clusters')
        plt.ylabel('Score')
        plt.grid()

        plt.subplot(1, 2, 2)
        plt.plot(candidate_ks, wcss, '-o')
        plt.title('The Elbow Method (WCSS)')
        plt.xlabel('Number of clusters')
        plt.ylabel('Score')
        plt.grid()
        plt.show()
        # Only write the figure when a destination was supplied: the default
        # empty string is not a usable file name for savefig.
        if path2save:
            fig.savefig(path2save)
    return wcss, cali

# Quantitative Cluster Analysis

In [None]:
def calculate_cluster_variance(f_train, n_clusters=4):
    """Compute one variance figure per MiniBatchKMeans cluster.

    Args:
        f_train: Dense feature matrix, one row per sample.
        n_clusters: Number of clusters to fit.

    Returns:
        List of ``n_clusters`` values. Entry ``c`` is the variance taken over
        all feature values (rows and columns flattened together) of the
        samples assigned to cluster ``c``; it is ``nan`` for a cluster that
        received no sample.
    """
    kmeans = MiniBatchKMeans(n_clusters = n_clusters, init = 'k-means++',
                                 n_init=1,init_size= int(1e3),
                                 random_state = 42,batch_size=int(1e3))
    kmeans.fit(f_train)

    ypred = kmeans.predict(f_train)

    features = np.asarray(f_train)
    var = []
    for cluster in range(n_clusters):
        # Select the member rows with a boolean mask on the labels. Appending
        # the labels as an extra column and taking the variance of that slice
        # would mix the label values into the statistic.
        members = features[ypred == cluster]
        var.append(np.var(members, dtype=np.float64))

    return var

# Qualitative Cluster Analysis

In [None]:
def get_corpus(data):
    """Flatten an iterable of token sequences into one flat list of tokens.

    Args:
        data: Iterable whose items are themselves iterables of tokens.

    Returns:
        List with every token of every item, in their original order.
    """
    return [token for document in data for token in document]

In [None]:
def get_wordCloud(corpus):
    """Generate a word cloud from a corpus of tokens.

    Args:
        corpus: Either a single text string or an iterable of tokens.

    Returns:
        The generated ``WordCloud`` object (white background, 3000x2500
        pixels, at most 200 words, fixed random state).
    """
    # WordCloud.generate expects plain text. Join the tokens with spaces
    # instead of using str(corpus): the repr of a list carries brackets,
    # quotes and commas, which would end up in the text that gets tokenised.
    if isinstance(corpus, str):
        corpus_text = corpus
    else:
        corpus_text = " ".join(str(token) for token in corpus)

    wordCloud = WordCloud(background_color='white',
                              stopwords=STOPWORDS,
                              width=3000,
                              height=2500,
                              max_words=200,
                              random_state=42
                         ).generate(corpus_text)
    return wordCloud

In [None]:
def count_most_frequent_words(corpus, n_print = 5):
    """Count token occurrences and print the most frequent ones.

    Args:
        corpus: Iterable of hashable tokens.
        n_print: Number of most frequent tokens to print.

    Returns:
        ``collections.Counter`` mapping every token of ``corpus`` to its
        number of occurrences.
    """
    # Count the tokens of the argument, not of a notebook-level variable, so
    # the function works for any corpus it is handed.
    word_counter = collections.Counter(corpus)
    for word, count in word_counter.most_common(n_print):
        print(word, ": ", count)
    return word_counter

In [None]:
# Output folders for the figures written by the cells below.
directory_lst = ['./figs/elbow/', './figs/qualitative_analysis/',
                 './figs/quantitative_analysis/']
for directory in directory_lst:
    # exist_ok makes the cell safe to re-run and removes the gap between
    # checking for the folder and creating it.
    os.makedirs(directory, exist_ok=True)

## Cluster analysis for each year

In [None]:
# Repeat the clustering diagnostics and the descriptive text analysis for
# every publication year found in the dataset.
min_year = text['train'].data.publish_date.dt.year.min()
max_year = text['train'].data.publish_date.dt.year.max()

n_clusters = 5   # clusters used for the per-year variance analysis
n_print = 5      # number of most frequent words reported per year

# np.arange excludes its stop value, so `max_year + 1` is needed to include
# the most recent year.
for y in np.arange(min_year, max_year + 1):

    X_train = text['train'].data[text['train'].data.publish_date.dt.year == y]

    print('Dataset length: {}. Year: {}'.format(len(X_train), y))
    if len(X_train) == 0:
        # A year inside the range without any row: nothing to analyse.
        continue

    # Use the extractor selected in the configuration cell, so the features
    # match the extractor name embedded in the file names below.
    f_train, _ = extract_features(X_train[col + '_data'].values,
                                  features_extractor = features_extractor,
                                  average_features = True)
    path2save = './figs/elbow/' + features_extractor + 'ElbowRule'+'_'+str(y)+'.png'
    find_nb_clusters(f_train, kmax=20, path2save = path2save)

    ## Quantitative Analysis
    var = calculate_cluster_variance(f_train, n_clusters = n_clusters)

    fig = plt.figure(figsize=(8,4))
    plt.scatter(range(0, n_clusters), var)
    plt.title('Variance within cluster')
    plt.xlabel('Cluster')
    plt.ticklabel_format(style='plain',axis='x',useOffset=False)
    plt.ylabel('Variance')
    plt.grid()
    plt.show()
    # The year is part of every file name so that one iteration does not
    # overwrite the figure written by the previous one.
    fig.savefig('./figs/quantitative_analysis/'+ features_extractor +'Variance_' + str(n_clusters) +'C_' + str(y) + '.png')

    ## Qualitative Analysis
    procTextCorpus = get_corpus(X_train[col + '_data'])
    procWordCloud = get_wordCloud(procTextCorpus)
    word_counter = count_most_frequent_words(procTextCorpus, n_print = n_print)

    fig = plt.figure(figsize=(8, 8))
    plt.imshow(procWordCloud)
    plt.axis('off')
    plt.show()
    fig.savefig('./figs/qualitative_analysis/'+ features_extractor + 'word_clouds_' + str(y) + '.png')

    fig = plt.figure(figsize=(20,5))
    plt.subplot(1,2,1)

    sns.distplot(X_train['nb_words'],hist=True, kde=False, bins=10, fit=norm)
    plt.title("Distribution of words in headline news")
    plt.xlabel('Number of words in headline news')

    ax = fig.add_subplot(1,2,2)
    lst = word_counter.most_common(n_print)
    df = pd.DataFrame(lst, columns = ['Word', 'Count'])
    plt.title('Most frequent words')
    df.plot(kind="barh",x='Word',y='Count', ax=ax)
    # Flush this figure like the others so open figures do not pile up
    # across iterations.
    plt.show()

    fig.savefig('./figs/qualitative_analysis/'+ features_extractor + 'word_counter_' + str(y) + '.png')

## Cluster analysis for whole dataset

In [None]:
# Processed headline column of the whole dataset (all years together).
# NOTE(review): presumably one list of tokens per headline - confirm.
X_train = text['train'].data[col + '_data']

In [None]:
# Inspect the entry stored under index label 0.
X_train[0]

In [None]:
# Extract the features with the extractor selected in the configuration cell,
# so the result matches the extractor name embedded in the figure file names.
# The cells below access `W2Vmodel.wv` and therefore need the Word2Vec extractor.
f_train, W2Vmodel = extract_features(X_train,
                                     features_extractor = features_extractor,
                                     average_features = True)

In [None]:
# Report the dimensions of the extracted feature matrix.
feature_shape = f_train.shape
print(f'Shape of train features:: {feature_shape}')

### Find number of clusters

In [None]:
# Elbow / Calinski-Harabasz diagnostics on the whole dataset for k in
# range(2, 20); the figure is saved under ./figs/elbow/.
path2save = './figs/elbow/'+ features_extractor +'ElbowRule.png'
find_nb_clusters(f_train, kmax=20, path2save=path2save)

# Visualizing clusters

In [None]:
# Number of clusters used for the clustering of the whole dataset below.
n_clusters = 4

In [None]:
# Cluster the whole-dataset features and label every row of f_train.
kmeans = MiniBatchKMeans(
    n_clusters=n_clusters,
    init='k-means++',
    n_init=1,
    init_size=int(1e3),
    random_state=42,
    batch_size=int(1e3),
)
# fit() returns the estimator itself, so fitting and labelling can be chained.
ypred = kmeans.fit(f_train).predict(f_train)

# Word analysis in each cluster

In [None]:
# Create a word -> cluster-number dictionary by pairing the i-th word of the
# model vocabulary with the i-th entry of ypred.
# NOTE(review): ypred was predicted from f_train, which holds one row per
# headline, while index2word enumerates vocabulary words. The pairing is
# purely positional - confirm that this is the intended word/cluster mapping.
word_centroid_map = dict(zip( W2Vmodel.wv.index2word, ypred ))

# Group the words by cluster in a single pass; vocabulary order is preserved
# inside every group.
words_by_cluster = {}
for key, value in word_centroid_map.items():
    words_by_cluster.setdefault(int(value), []).append(key)

# Print the first 10 words of every cluster. The range follows n_clusters
# rather than a hard-coded count.
for cluster in range(n_clusters):

    print (f"\n Cluster {cluster}")
    words = words_by_cluster.get(cluster, [])

    print(f'{words[:10]}')

### Number of words in each cluster

In [None]:
# Same word -> cluster mapping as a Series (index: words, values: cluster
# numbers) so that pandas can count the words per cluster.
words_map_series = pd.Series(word_centroid_map, index=word_centroid_map.keys())

In [None]:
# Horizontal bar chart with the number of words mapped to each cluster.
fig, ax = plt.subplots(figsize=(15,5))
cluster_sizes = words_map_series.value_counts()
cluster_sizes.plot(kind = 'barh', ax = ax)
ax.set_title('Number of words in each cluster', fontsize = 16)
ax.set_xlabel('Words count', fontsize = 16)
ax.set_ylabel('Clusters', fontsize = 16)
fig.savefig('./figs/quantitative_analysis/' + 'words_count_each_cluster_' + str(n_clusters) + 'C.png')

In [None]:
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA


def plot_pca(f_train, y_train, nb_clusters = 4):
    """Show a 3-D PCA projection of the samples coloured by cluster.

    Args:
        f_train: Feature matrix, one row per sample.
        y_train: Cluster label of every row of ``f_train``; values must lie
            in ``range(nb_clusters)`` because they index the colour palette.
        nb_clusters: Number of distinct colours to generate.
    """
    # Colour by the labels that were passed in rather than by a notebook-level
    # variable. The builtin `int` replaces the `np.int` alias, which recent
    # NumPy releases no longer provide.
    labels = np.asarray(y_train).astype(int)
    palette = np.array(sns.color_palette("hls", nb_clusters))

    pca = PCA(n_components=3)
    result = pca.fit_transform(f_train)

    fig = plt.figure(figsize=(16,8))
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(result[:, 0], result[:, 1],result[:, 2],
               s=40, c=palette[labels])

    plt.title('Visualization PCA')
    plt.grid()
    plt.show()

In [None]:
# 3-D PCA view of the whole-dataset features with their cluster labels
# (plot_pca's default of 4 colours matches n_clusters set above).
plot_pca(f_train, ypred)

## Words visualization using t-SNE

In [None]:
import bokeh.plotting as bp
from sklearn.manifold import TSNE
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook



def plot_tSNE(model,n_samples = 5000):
    """Show an interactive 2-D t-SNE map of word vectors in the notebook.

    Reference:
    https://www.oreilly.com/learning/an-illustrated-introduction-to-the-t-sne-algorithm

    Args:
        model: Trained model exposing ``wv.vocab`` and ``wv[word]``.
        n_samples: Maximum number of vocabulary words to embed and plot.
    """
    # Pick the words first and look up only their vectors, so that words and
    # vectors are aligned by construction and nothing beyond `n_samples` is
    # materialised.
    words = list(model.wv.vocab.keys())[:n_samples]
    word_vectors = [model.wv[w] for w in words]

    output_notebook()
    # The title reports the number of words actually plotted, which is lower
    # than n_samples when the vocabulary is smaller.
    fig = bp.figure(plot_width=700, plot_height=600, title="A map of " + str(len(words)) + " word vectors",
        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
        x_axis_type=None, y_axis_type=None, min_border=1)

    tsne_model = TSNE(n_components=2, verbose=1, random_state=23)
    tsne_w2v = tsne_model.fit_transform(word_vectors)

    tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
    tsne_df['words'] = words

    fig.scatter(x='x', y='y', source=tsne_df)
    # Show the word under the cursor.
    hover = fig.select(dict(type=HoverTool))
    hover.tooltips={"word": "@words"}
    show(fig)

In [None]:
# t-SNE map of (at most 5000) word vectors of the whole-dataset model.
plot_tSNE(W2Vmodel)

# Quantitative Cluster Analysis

In [None]:
# Per-cluster variance for the whole dataset; the function default of
# 4 clusters is used here.
var = calculate_cluster_variance(f_train)

In [None]:
# Scatter plot of the variance of every cluster of the whole dataset.
fig = plt.figure(figsize=(8,4))
# One point per entry of `var`, whatever number of clusters was used.
plt.scatter(range(len(var)), var)
plt.title('Variance within cluster')
plt.xlabel('Cluster')
plt.ticklabel_format(style='plain',axis='x',useOffset=False)
plt.ylabel('Variance')
plt.grid()
plt.show()
# Save through the figure handle: after plt.show() the pyplot "current
# figure" may no longer be the one drawn above, so plt.savefig could write
# an empty image.
fig.savefig('./Variance_{}C.png'.format(len(var)))

In [None]:
# Flatten the processed headlines of the whole dataset into one token list.
procTextCorpus = get_corpus(text['train'].data['headline_text_data'])

### Word Cloud Visualization

In [None]:
# Global seaborn look for the figures that follow: dark grid, muted palette,
# larger fonts and thicker lines.
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context("notebook", font_scale=1.5,
                rc={"lines.linewidth": 2.5})

In [None]:
# Word cloud built from the tokens of the whole dataset.
procWordCloud = get_wordCloud(procTextCorpus)

In [None]:
# Display the whole-dataset word cloud without axes and save it to ./figs/.
fig = plt.figure(figsize=(8, 8))
plt.imshow(procWordCloud)
plt.axis('off')
plt.show()
fig.savefig('./figs/all_data_word_clouds.png')

### Most frequent words

In [None]:
# Token frequencies of the whole dataset; the most frequent ones are printed
# (5 by default).
word_counter = count_most_frequent_words(procTextCorpus)

In [None]:
# Left: distribution of the number of words per headline.
# Right: the most frequent words of the whole dataset.
n_print = 5  # defined here so the cell does not depend on a value leaked by an earlier loop

fig = plt.figure(figsize=(20,5))
plt.subplot(1,2,1)

sns.distplot(text['train'].data['nb_words'],hist=True, kde=False, bins=10, fit=norm)
plt.title("Distribution of words in headline news")
plt.xlabel('Number of words in headline news')

ax = fig.add_subplot(1,2,2)
lst = word_counter.most_common(n_print)
df = pd.DataFrame(lst, columns = ['Word', 'Count'])
plt.title('Most frequent words')
df.plot(kind="barh",x='Word',y='Count', ax=ax)
# Save into the 'qualitative_analysis' folder created at the top of the
# notebook; a misspelled folder name would make savefig fail.
fig.savefig('./figs/qualitative_analysis/' + 'words_counter.png')