In [1]:
# coding=utf-8
import os
import re
import codecs
import gensim
import logging
import numpy as np
import matplotlib.pyplot as plt

from src.morpheme_model import model
from operator import itemgetter
from collections import defaultdict
from sklearn.manifold import TSNE

def getWordVecs(words, model):
    vecs = []
    for word in words:
        word = word.replace('\n', '')
        try:
            if model.modelWord_from_word.get(word) != None:
                vecs.append(model.model[model.modelWord_from_word[word]].reshape((1,300)))
        except KeyError:
            continue
    vecs = np.concatenate(vecs)
    return np.array(vecs, dtype='float') #TSNE expects float type values

def getNewWordVecs(words, model, pref_file_path, suff_file_path):
    vecs = []
    for word in words:
        word = word.replace('\n', '')
        try:
            vec = model.recalculate_vect(word, pref_file_path, suff_file_path, 0.1)
            if vec is not None:
                vecs.append(vec.reshape((1,300)))
        except KeyError:
            continue
    vecs = np.concatenate(vecs)
    return np.array(vecs, dtype='float') #TSNE expects float type values
    
def load_model(file_name):
    # TODO: it's for current implementation
    dir_name = os.path.dirname(os.path.realpath('__file__'))
    name = os.path.join(*[dir_name, '..', file_name])
    the_model = model(name)
    return the_model

def save_vectors(file_name, vecs):
    with codecs.open(file_name, 'w', encoding='utf-8') as outfile:
        for vec in vecs:
            outfile.write(str(vec).decode('utf-8'))
            outfile.write('\n')
            
if __name__ == "__main__":
    # ------------ web --------------------
    #model_size = 353608
    #model_file_name = 'web.model.bin'
    
    # ------------ news ------------------
    #model_size = 124590
    #model_file_name = 'news.model.bin'

    # ---------- ruscorpora -------------
    #model_size = 184973
    #model_file_name = 'ruscorpora.model.bin'

    # ---------- ruwikiruscorpora -------------
    model_size = 392339
    model_file_name = 'ruwikiruscorpora.model.bin'
    
    #load word2vec model from file
    print "Load word2vec model..."
    model = load_model(model_file_name)
    print "Done."
    
    dir_name = os.path.dirname(os.path.realpath('__file__'))
    pref_file_path = os.path.join(*[dir_name, 'dicts', 'prefixes.txt'])
    suff_file_path = os.path.join(*[dir_name, 'dicts', 'suffixes.txt'])
    finTerms_file_path = os.path.join(*[dir_name, 'test_data', 'finTerms.txt'])
    medTerms_file_path = os.path.join(*[dir_name, 'test_data', 'med200Terms.txt'])
    avtoTerms_file_path = os.path.join(*[dir_name, 'test_data', 'avto200Terms.txt'])
    
    with codecs.open(finTerms_file_path, 'r', encoding='utf-8') as infile:
        fin_words = infile.readlines()
    
    with codecs.open(medTerms_file_path, 'r', encoding='utf-8') as infile:
        med_words = infile.readlines()
    
    with codecs.open(avtoTerms_file_path, 'r', encoding='utf-8') as infile:
        avto_words = infile.readlines()
    
    # WORD2VEC model
    #save word2vec vectors for finance, medicine and automotive terms
    fin_vecs = getWordVecs(fin_words, model)
    save_vectors('vects/fin_vects.txt', fin_vecs)
    
    med_vecs = getWordVecs(med_words, model)
    save_vectors('vects/med_vects.txt', med_vecs)

    avto_vecs = getWordVecs(avto_words, model)
    save_vectors('vects/avto_vects.txt', avto_vecs)
                
    fin_vecs = np.nan_to_num(fin_vecs)
    med_vecs = np.nan_to_num(med_vecs)
    avto_vecs = np.nan_to_num(avto_vecs)
    
    ts = TSNE(2)
    reduced_vecs = ts.fit_transform(np.concatenate((fin_vecs, med_vecs, avto_vecs)))
    
    
    # NEW MODEL
    
    #save new model vectors for finance, medicine and automotive terms
    new_fin_vecs = getNewWordVecs(fin_words, model, pref_file_path, suff_file_path)
    save_vectors('vects/new_model_fin_vects.txt', new_fin_vecs) 
    
    new_med_vecs = getNewWordVecs(med_words, model, pref_file_path, suff_file_path)
    save_vectors('vects/new_model_med_vects.txt', new_med_vecs) 

    new_avto_vecs = getNewWordVecs(avto_words, model, pref_file_path, suff_file_path)
    save_vectors('vects/new_model_avto_vects.txt', new_avto_vecs)
     
    new_fin_vecs = np.nan_to_num(new_fin_vecs)
    new_med_vecs = np.nan_to_num(new_med_vecs)
    new_avto_vecs = np.nan_to_num(new_avto_vecs)
    
    ts = TSNE(2)
    new_reduced_vecs = ts.fit_transform(np.concatenate((new_fin_vecs, new_med_vecs, new_avto_vecs)))

Load word2vec model...
Done.


In [2]:
import itertools

import numpy as np
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl
import math
from collections import defaultdict
from sklearn.manifold import TSNE
from sklearn import mixture

from plotly import version 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
from plotly.offline import init_notebook_mode 
from plotly.graph_objs import Scatter, Figure, Layout
import plotly.graph_objs as go

init_notebook_mode(True)

def plot_results(X, means, covariances, color):
    trace = go.Scatter(x=X[:, 0], y=X[:, 1], 
                           mode='markers',
                           marker=dict(size=10,
                                       color=color,
                                       line=dict(
                                            width = 2
                                            )
                                        )
                      )
    GLOB_DATA.append(trace)
    for i, (mean, covar) in enumerate(zip(
            means, covariances)):
        # Plot an ellipse to show the Gaussian component
        v, w = linalg.eigh(covar)
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        a =  v[1]
        b =  v[0]
        x_origin = mean[0]
        y_origin = mean[1]
        x_ = [ ]
        y_ = [ ]
    
        for t in range(0,361,10):
            x = a*(math.cos(math.radians(t))) + x_origin
            x_.append(x)
            y = b*(math.sin(math.radians(t))) + y_origin
            y_.append(y)
    
        elle = go.Scatter(x=x_ , y=y_, mode='lines',
                          showlegend=False,
                          line=dict(color=color,
                                   width=2))
        #GLOB_DATA.append(elle)

        
if __name__ == "__main__":
    GLOB_DATA = []
   
    new_X_fin = np.array([[0 for x in range(2)] for y in range(len(new_fin_vecs))]) 
    new_X_med = np.array([[0 for x in range(2)] for y in range(len(new_med_vecs))]) 
    new_X_avto = np.array([[0 for x in range(2)] for y in range(len(new_avto_vecs))]) 

    i_fin = 0
    i_med = 0
    i_avto = 0
    
    print len(new_reduced_vecs)
    for i in range(len(new_reduced_vecs)):      
        if i < len(new_fin_vecs):
            new_X_fin[i_fin][0] = new_reduced_vecs[i][0]
            new_X_fin[i_fin][1] = new_reduced_vecs[i][1]
            i_fin += 1
        elif i >= len(new_fin_vecs) and i < (len(new_fin_vecs) + len(new_med_vecs)):
            new_X_med[i_med][0] = new_reduced_vecs[i][0]
            new_X_med[i_med][1] = new_reduced_vecs[i][1]
            i_med += 1
        else:
            new_X_avto[i_avto][0] = new_reduced_vecs[i][0]
            new_X_avto[i_avto][1] = new_reduced_vecs[i][1]
            i_avto += 1

    # Fit a Gaussian mixture with EM using 1 components

    gmm_fin = mixture.GaussianMixture(n_components=1, covariance_type='full').fit(new_X_fin)
    plot_results(new_X_fin, gmm_fin.means_, gmm_fin.covariances_, 'rgb(0, 200, 0)')

    gmm_med = mixture.GaussianMixture(n_components=1, covariance_type='full').fit(new_X_med)
    plot_results(new_X_med, gmm_med.means_, gmm_med.covariances_, 'rgb(180, 0, 0)')


    gmm_avto = mixture.GaussianMixture(n_components=1, covariance_type='full').fit(new_X_avto)
    plot_results(new_X_avto, gmm_avto.means_, gmm_avto.covariances_, 'rgb(0, 0, 200)')

    layout = go.Layout(title='Morpheme model', showlegend=False,
                           xaxis=dict(title='X', zeroline=False, showgrid=False),
                           yaxis=dict(title='Y', zeroline=False, showgrid=False),)
    new_fig = go.Figure(data=GLOB_DATA, layout=layout)
    iplot(new_fig)

    print "Determinant of the cov matrix. Fin topic."
    print map(lambda x: linalg.det(x), gmm_fin.covariances_)
    print "Determinant of the cov matrix. Med topic."
    print map(lambda x: linalg.det(x), gmm_med.covariances_)
    print "Determinant of the cov matrix. Avto topic."
    print map(lambda x: linalg.det(x), gmm_avto.covariances_)
    
    # Fit a Dirichlet process Gaussian mixture using five components
    #dpgmm = mixture.BayesianGaussianMixture(n_components=3,
    #                                        covariance_type='full').fit(X)
    #plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1,
    #             'Bayesian Gaussian Mixture with a Dirichlet process prior')

    
    
    GLOB_DATA = []

    X_fin = np.array([[0 for x in range(2)] for y in range(len(fin_vecs))]) 
    X_med = np.array([[0 for x in range(2)] for y in range(len(med_vecs))]) 
    X_avto = np.array([[0 for x in range(2)] for y in range(len(avto_vecs))]) 

    i_fin = 0
    i_med = 0
    i_avto = 0

    for i in range(len(reduced_vecs)):      
        if i < len(fin_vecs):
            X_fin[i_fin][0] = reduced_vecs[i][0]
            X_fin[i_fin][1] = reduced_vecs[i][1]
            i_fin += 1
        elif i >= len(fin_vecs) and i < (len(fin_vecs) + len(med_vecs)):
            X_med[i_med][0] = reduced_vecs[i][0]
            X_med[i_med][1] = reduced_vecs[i][1]
            i_med += 1
        else:
            X_avto[i_avto][0] = reduced_vecs[i][0]
            X_avto[i_avto][1] = reduced_vecs[i][1]
            i_avto += 1

    # Fit a Gaussian mixture with EM using 1 components

    gmm_fin = mixture.GaussianMixture(n_components=1, covariance_type='full').fit(X_fin)
    plot_results(X_fin, gmm_fin.means_, gmm_fin.covariances_, 'rgb(0, 200, 0)')

    gmm_med = mixture.GaussianMixture(n_components=1, covariance_type='full').fit(X_med)
    plot_results(X_med, gmm_med.means_, gmm_med.covariances_, 'rgb(180, 0, 0)')


    gmm_avto = mixture.GaussianMixture(n_components=1, covariance_type='full').fit(X_avto)
    plot_results(X_avto, gmm_avto.means_, gmm_avto.covariances_, 'rgb(0, 0, 200)')

    layout = go.Layout(title='Word2Vec model', showlegend=False,
                           xaxis=dict(title='X', zeroline=False, showgrid=False),
                           yaxis=dict(title='Y', zeroline=False, showgrid=False),)
    fig = go.Figure(data=GLOB_DATA, layout=layout)
    iplot(fig)

    print "Determinant of the cov matrix. Fin topic."
    print map(lambda x: linalg.det(x), gmm_fin.covariances_)
    print "Determinant of the cov matrix. Med topic."
    print map(lambda x: linalg.det(x), gmm_med.covariances_)
    print "Determinant of the cov matrix. Avto topic."
    print map(lambda x: linalg.det(x), gmm_avto.covariances_)

345


Determinant of the cov matrix. Fin topic.
[386320.8099252447]
Determinant of the cov matrix. Med topic.
[860356.1686456898]
Determinant of the cov matrix. Avto topic.
[414574.62069551385]


Determinant of the cov matrix. Fin topic.
[387458.26076677017]
Determinant of the cov matrix. Med topic.
[664700.4196670583]
Determinant of the cov matrix. Avto topic.
[615093.7939170174]


In [13]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

init_notebook_mode(True)

precision = []
precision2 = []

with open('precision.csv', 'r') as fin:
    for line in fin:
        precision.append(line.split()[3])
    # for
# with

with open('result_plus_one.csv') as fin:
    for line in fin:
        precision2.append(line.split(',')[2])
    # for
# with

trace = go.Scatter(y=precision[:10000], name=u"признаки 1")
trace2 = go.Scatter(y=precision2[:10000], name=u"признаки 2")

layout = go.Layout(
    title='Plot Title',
    xaxis=dict(
        title=u'количество слов',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title=u'точность',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)

fig = go.Figure(data=[trace, trace2], layout=layout)

iplot(fig, show_link=False)