# Анализ тональности текста с помощью нечеткой кластеризации в дистрибутивной семантике

*Программа демонстрирует применение алгоритмов четкой **k-means (KM)** и нечеткой **c-means (FCM)** кластеризации в дистрибутивной семантике для определения тональности высказываний. Рассматриваются анонимные русскоязычные высказывания из социальной сети Twitter (Твиттер). В качестве примеров дистрибутивной семантики используются модели, заранее обученные с помощью алгоритма **word2vec (skip-gram)**.*

В первом эксперименте рассматривается подход, который был описан на площадке для анализа данных Kaggle (Источник: https://www.kaggle.com/c/word2vec-nlp-tutorial).

In [28]:
from gensim.models import Word2Vec

# Length of each feature vector; passed to the averaging helpers below.
num_features = 300
# Load a Word2Vec model that was trained and saved beforehand.
model = Word2Vec.load("models/300features_40minwords_10context_full")
# Word vectors
# NOTE(review): presumably one row per vocabulary word, in the same order
# as `words` below -- confirm against the gensim version in use.
word_vectors = model.syn0
# List of words
words = model.index2word

print word_vectors.shape

(863L, 300L)


In [29]:
import numpy as np  # Make sure that numpy is imported
import pandas as pd

def makeFeatureVec(words, model, num_features):
    """Average the vectors of all in-vocabulary words of one text.

    Args:
        words: Iterable of tokens that make up the text.
        model: Object exposing ``index2word`` (the vocabulary) and
            ``model[word]`` lookup of a word's vector.
        num_features: Length of the word vectors and of the result.

    Returns:
        The element-wise mean of the vectors of the tokens found in the
        vocabulary (a repeated token counts every time it occurs), or an
        all-zero float32 vector of length ``num_features`` when no token
        is in the vocabulary.
    """
    # Membership tests against a set are O(1); the vocabulary list
    # itself would be scanned linearly for every token.
    vocabulary = set(model.index2word)

    total = np.zeros((num_features,), dtype="float32")
    hits = 0.
    for token in words:
        if token not in vocabulary:
            continue
        total = np.add(total, model[token])
        hits += 1.

    if hits == 0:
        # Nothing matched: skip the division by zero and return zeros.
        return np.zeros((num_features,), dtype="float32")
    return np.divide(total, hits)


def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors for a list of texts.

    Args:
        reviews: Sequence of texts, each one a list of words. It must
            support ``len()``.
        model: Word-vector model handed through to ``makeFeatureVec``.
        num_features: Length of every feature vector.

    Returns:
        A float32 array of shape ``(len(reviews), num_features)`` whose
        row ``i`` is ``makeFeatureVec(reviews[i], model, num_features)``.
    """
    total = len(reviews)

    # Preallocate the 2D result array, for speed
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    # The counter has to be an integer: it is used as a row index, and
    # NumPy does not accept float indices.
    for counter, review in enumerate(reviews):
        # Print a status message every 1000th review
        if counter % 1000 == 0:
            print("Review %d of %d" % (counter, total))

        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
    return reviewFeatureVecs

def review_to_wordlist(review, remove_stopwords=True):
    """Split one text into whitespace-separated tokens.

    Args:
        review: The text, either as UTF-8 encoded bytes or as an
            already-decoded (unicode) string.
        remove_stopwords: Accepted for call compatibility only. The
            function does not use it, so no stop words are removed.

    Returns:
        list: The tokens produced by ``split()`` on the decoded text.
    """
    # Only byte strings need decoding. Calling .decode on text that is
    # already unicode fails: Python 2 first encodes it implicitly as
    # ASCII, and Python 3 strings have no such method.
    if isinstance(review, bytes):
        review = review.decode("utf-8")
    return review.split()

In [40]:
# ----------------------------------------------------------------
# Build the averaged word-vector features for the training and the test
# set with the helper functions defined above.
#
# Both files are tab-separated with a header row; quoting=3 is
# csv.QUOTE_NONE, so quote characters in the text are kept verbatim.
# Rows whose sentiment label contains 'neutral' are dropped.
#
# remove_stopwords=True is passed for every text. NOTE: review_to_wordlist,
# as defined in this file, does not act on that flag.
train = pd.read_csv('data/stemmed/ttk_train_mystem.tsv',
                    header=0, delimiter="\t", quoting=3)
train = train[~train["sentiment"].str.contains('neutral')]

test = pd.read_csv('data/stemmed/ttk_test_etalon_mystem.tsv',
                   header=0, delimiter="\t", quoting=3)
test = test[~test["sentiment"].str.contains('neutral')]

# Tokenize the training texts and average their word vectors.
clean_train_reviews = [review_to_wordlist(review, remove_stopwords=True)
                       for review in train['text']]
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

# Same two steps for the test texts.
print("Creating average feature vecs for test reviews")
clean_test_reviews = [review_to_wordlist(review, remove_stopwords=True)
                      for review in test['text']]
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Review 0 of 2541
Review 1000 of 2541
Review 2000 of 2541
Creating average feature vecs for test reviews
Review 0 of 1250
Review 1000 of 1250




In [41]:
# Fit a random forest to the training data, using 100 trees
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train a 100-tree random forest on the averaged word vectors of the
# training texts.
print("Fitting a random forest to labeled training data...")
forest = RandomForestClassifier(n_estimators=100)
forest.fit(trainDataVecs, train["sentiment"])

# Predict a sentiment label for every test text.
result = forest.predict(testDataVecs)

# Save the reference label, the predicted label and the text of every
# test row (quoting=3 is csv.QUOTE_NONE).
output = pd.DataFrame(
    data={
        "sentiment": test["sentiment"],
        "predicted": result,
        "text": test["text"],
    }
)
output.to_csv("Word2Vec_AverageVectors.csv", index=False, quoting=3)

# Per-class precision / recall / F1 on the test set.
print(classification_report(test["sentiment"], result))

Fitting a random forest to labeled training data...
             precision    recall  f1-score   support

   negative       0.77      0.92      0.84       879
   positive       0.66      0.36      0.46       371

avg / total       0.74      0.75      0.73      1250



## Word2Vec creates clusters of semantically related words, so another possible approach is to exploit the similarity of words within a cluster. 

In [42]:
from sklearn.cluster import KMeans
import time

start = time.time()  # Start time

# Set "k" (num_clusters) to 1/5th of the vocabulary size, i.e. an average
# of 5 words per cluster. Floor division keeps the result an integer on
# every Python version; it is used both as n_clusters and, later, as an
# array dimension.
num_clusters = word_vectors.shape[0] // 5

# Initialize a k-means object and assign every word vector to a cluster;
# idx[i] is the cluster number of word_vectors[i].
kmeans_clustering = KMeans(n_clusters=num_clusters)
idx = kmeans_clustering.fit_predict(word_vectors)

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
# One pre-formatted string prints the same text as the former
# comma-separated Python 2 print statement (note the double space).
print("Time taken for K Means clustering:  %s seconds." % elapsed)

Time taken for K Means clustering:  7.69099998474 seconds.


In [43]:
# Look-up table from each vocabulary word to the number of the k-means
# cluster it was assigned to: words[i] is paired with idx[i].
word_centroid_map = {word: cluster_id for word, cluster_id in zip(words, idx)}

In [44]:
# Show the vocabulary words assigned to each of the first 10 clusters.
for cluster in range(10):
    # Print the cluster number
    print("\nКластер %d" % cluster)

    # Collect the words of this cluster in one pass over the map.
    # Indexing .keys() / .values() inside the loop rebuilt both lists on
    # every iteration under Python 2 and is not possible on Python 3.
    cluster_words = [word for word, label in word_centroid_map.items()
                     if label == cluster]

    # Each word is followed by a single space and the line is then
    # terminated; an empty cluster yields an empty line.
    print("".join(word + " " for word in cluster_words))


Кластер 0
повышать цена 

Кластер 1
:) приходиться мама постоянно давно билайновский забывать роутер ни менять @sberbank платный 

Кластер 2
рамка автомобиль 25 беларусь банковский альфа-банк денежный участник подписывать предоставление 

Кластер 3
погашение условие наличный наличные отзыв расчет екатеринбург рассчитать авто 

Кластер 4
друг у тут вчера поздравлять нет тот сегодня работать все 

Кластер 5
филиал яновость разделять сми: оск финансовый намерен рейтинг #бизнес против расширять правительство россельхозбанк: журнал объединять вэб 

Кластер 6
переставать отключать киевстар # 

Кластер 7
плохой использовать ошибка крутой находиться должный ;) @borisnemtsov долго подключаться 

Кластер 8
скачать 

Кластер 9
проверять присылать тело 


In [45]:
def create_bag_of_centroids(wordlist, word_centroid_map):
    """Count how many words of a text fall into each word cluster.

    Args:
        wordlist: Iterable of the words of one text.
        word_centroid_map: Mapping from a vocabulary word to its integer
            cluster index.

    Returns:
        A float32 array with one entry per cluster index from 0 up to the
        highest index present in ``word_centroid_map``; entry ``i`` is
        the number of words in ``wordlist`` mapped to cluster ``i``.
        Words missing from the map are ignored. An empty map yields an
        empty array.
    """
    # max() raises ValueError on an empty sequence; with no clusters
    # there is nothing to count.
    if not word_centroid_map:
        return np.zeros(0, dtype="float32")

    # The number of clusters is the highest cluster index plus one
    num_centroids = max(word_centroid_map.values()) + 1

    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")

    for word in wordlist:
        # A single lookup; None marks a word outside the vocabulary
        # (cluster index 0 is a valid hit, hence the identity test).
        index = word_centroid_map.get(word)
        if index is not None:
            bag_of_centroids[index] += 1

    return bag_of_centroids

In [46]:
# Bag-of-centroids features: one row per text, one column per cluster.

# Training set: preallocate the matrix, then fill it row by row.
train_centroids = np.zeros((train["text"].size, num_clusters), dtype="float32")
for row, tokens in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(tokens, word_centroid_map)

# Test set: same procedure.
test_centroids = np.zeros((test["text"].size, num_clusters), dtype="float32")
for row, tokens in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(tokens, word_centroid_map)

In [47]:
# Train a 100-tree random forest on the bag-of-centroids features of the
# training texts (fitting may take a few minutes).
print("Fitting a random forest to labeled training data...")
forest = RandomForestClassifier(n_estimators=100)
forest.fit(train_centroids, train["sentiment"])

# Predict a sentiment label for every test text.
result = forest.predict(test_centroids)

# Save the reference label, the predicted label and the text of every
# test row (quoting=3 is csv.QUOTE_NONE).
output = pd.DataFrame(
    data={
        "sentiment": test["sentiment"],
        "predicted": result,
        "text": test["text"],
    }
)
output.to_csv("BagOfCentroids.csv", index=False, quoting=3)

# Per-class precision / recall / F1 on the test set.
print(classification_report(test["sentiment"], result))

Fitting a random forest to labeled training data...
             precision    recall  f1-score   support

   negative       0.78      0.88      0.83       879
   positive       0.59      0.40      0.48       371

avg / total       0.72      0.74      0.72      1250

